main.py
from lshash import LSHash
#from rawTweets_parser import processfile
from text import onlineTfidfVectorizer
import json
import os
import numpy as np
import scipy.sparse as sp
import cProfile
import time
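## Overview of run() below (for orientation):
##   - tweets are streamed from disk and buffered `size` (2000) at a time
##   - each batch is turned into tf-idf vectors by the online vectorizer
##   - the LSH index returns an approximate nearest neighbor for each tweet;
##     if its cosine distance is within cos_threshold the tweet joins that
##     neighbor's cluster
##   - otherwise the tweet is compared linearly against the tweets seen so far
##     in the current batch (and, when needed, the previous batch) before a
##     new cluster is created
##   - every tweet is then indexed into the LSH tables so later tweets can find it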
def run():
    initial = True
    size = 2000
    tweet_ids = []
    tweet_text = []
    counter = 0
    num_hashtables = 13  ## recompute the random vectors if this is changed
    dimension = 50000  ## recompute the random vectors if this is changed
    hash_size = 13  ## length (in bits) of the LSH hash of each tweet
    bucket_size = 100  ## size of the queue for each hash in the hash tables
    comparisons = 50  ## upper bound on the number of comparisons (dot products) used to find the nearest neighbor
    cos_threshold = .7  ## threshold on the cosine distance between two tweets
    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df=1, smooth_idf=True, stop_words='english', min_dict_size=dimension)
    ## initialize the hash tables; specify the hash size, the number of hash tables and the queue size
    lsh = LSHash(hash_size=hash_size, input_dim=dimension, num_hashtables=num_hashtables, max_queue_size=bucket_size)
    clusters = {}  ## maintains the clusters (cluster id -> list of tweet ids)
    num_clusters = 0
    inv_index = {}  ## inverse mapping from tweet_id to cluster id
    Y = None  ## tf-idf matrix of the previous batch
    Y1 = None  ## tweet ids of the previous batch
    f_d = open("output.txt", 'w')
    loc = "/Users/dilpreet/Documents/mtp_documents/markedData/data/"
    for root, dirs, filenames in os.walk(loc):
        for f in filenames:
            with open(loc + f) as infile:
                for line in infile:
                    ## buffer `size` (2000) tweets at a time
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['text'])
                    counter = counter + 1
                    t2 = 0
                    if counter % size == 0:
                        t1 = time.clock()
                        ## X holds the tf-idf scores of the buffered tweets in sparse (CSR) format
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        print X.get_shape()
                        print len(vectorizer.vocabulary_)
                        ## if the total number of keywords (columns of X) exceeds the pre-specified dimension, raise an error
                        if X.get_shape()[1] > dimension:
                            print X.get_shape()
                            print "dimension exceeded"
                            raise ValueError("dimension exceeded")
                        for i in range(X.get_shape()[0]):
                            temp_tweet = X.getrow(i)
                            ## query for the nearest neighbor from the lshash tables
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
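                            ## arpoxNN is expected to return ((point, extra_data), distance) for the best
                            ## of at most L probed candidates, or None when nothing was found (inferred
                            ## from the unpacking and the None check below)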
                            c = 2  ## sentinel distance, forces the fallback search when nn is None
                            scase = False
                            ## if a nearest neighbor exists and its cosine distance is within the threshold,
                            ## add the tweet to that neighbor's cluster
                            if nn is not None:
                                ((a, b), c) = nn
                                if c <= cos_threshold:
                                    inv_index[tweet_ids[i]] = inv_index[b]
                                    clusters.setdefault(inv_index[b], []).append(tweet_ids[i])
                                #else:
                                #    scase = True
                            ## else, linearly search through the previous 2000 + i tweets to find the nearest neighbor
                            """ code to linearly search through the tweets """
                            if (c > cos_threshold or nn is None or scase):
                                searchY = False
                                if (i == 0 and not initial):
                                    searchY = True
                                if (i == 0 and initial):
                                    inv_index[tweet_ids[i]] = num_clusters
                                    clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                    num_clusters = num_clusters + 1
                                if (i != 0):
                                    Z = X[:i]
                                    #print temp_tweet.shape
                                    t2 = temp_tweet.transpose()
                                    #print i
                                    a1 = Z.dot(t2).toarray()
                                    a2 = Z.multiply(Z).sum(axis=1)
                                    a3 = sp.csr_matrix(t2.multiply(t2).sum()).toarray()
                                    a2 = sp.csc_matrix(a2).toarray()
                                    b = [j for j in range(Z.shape[0])]
                                    a = min(b, key=lambda x: 1 - float(a1[x][0]) / ((a2[x][0] + a3[0][0]) ** .5))
                                    #a = min(Z, key = lambda x: cosine_dist(x[0], temp_tweet))
                                    #print a
                                    t3 = tweet_ids[a]
                                    if (1 - float(a1[a][0]) / ((a2[a][0] + a3[0][0]) ** .5)) > cos_threshold:
                                        if not initial and i != size - 1:
                                            searchY = True
                                        else:
                                            inv_index[tweet_ids[i]] = num_clusters
                                            clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                            num_clusters = num_clusters + 1
                                    else:
                                        inv_index[tweet_ids[i]] = inv_index[t3]
                                        clusters.setdefault(inv_index[t3], []).append(tweet_ids[i])
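                                ## fallback: when no close-enough match exists in the current batch (or the
                                ## tweet is the first one of a non-initial batch), the same computation is
                                ## repeated against Y, the previous batch's tf-idf matrix, restricted to
                                ## rows i: so the comparison window stays at roughly `size` tweets; Y1 holds
                                ## that batch's tweet ids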
                                if searchY == True:
                                    Z = Y[i:]
                                    t2 = temp_tweet.transpose()
                                    #print i
                                    a1 = Z.dot(t2).toarray()
                                    a2 = Z.multiply(Z).sum(axis=1)
                                    a3 = sp.csr_matrix(t2.multiply(t2).sum()).toarray()
                                    a2 = sp.csc_matrix(a2).toarray()
                                    b1 = [j for j in range(Z.shape[0])]
                                    a = min(b1, key=lambda x: 1 - float(a1[x][0]) / ((a2[x][0] + a3[0][0]) ** .5))
                                    t3 = Y1[a + i]
                                    if (1 - float(a1[a][0]) / ((a2[a][0] + a3[0][0]) ** .5)) < cos_threshold:
                                        inv_index[tweet_ids[i]] = inv_index[t3]
                                    else:
                                        inv_index[tweet_ids[i]] = num_clusters
                                        clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                        num_clusters = num_clusters + 1
                            ## index the tweet into the hash tables
                            lsh.index(input_point=temp_tweet, extra_data=tweet_ids[i])
                        initial = False
                        Y = X
                        Y1 = tweet_ids[:]
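                        ## keep this batch's tf-idf matrix and tweet ids around as Y / Y1 so that the
                        ## first tweets of the next batch can still be compared against them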
                        tweet_ids = []
                        tweet_text = []
                        print counter
                        print time.clock() - t1
                        f2 = open('time.txt', 'a')
                        f2.write(str(time.clock() - t1) + '\n')
                        f2.close()
                        if counter % 10000 == 0:
                            f2 = open('result.txt', 'a')
                            f2.write(json.dumps(clusters) + "\n")
                            f3 = open('vocab.txt', 'a')
                            f4 = open('vectorizer.txt', 'a')
                            f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
                            f4.write(json.dumps(vectorizer.idf_) + "\n")
                            #print clusters
                            #print vectorizer.vocabulary_
                            f2.close()
                            f3.close()
                            f4.close()
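    ## final dump once every file has been processed: clusters, vocabulary, idf weights and the
    ## tweet-id -> cluster mapping ('result.txt' is reopened in 'w' mode here, so it ends up
    ## holding only this final snapshot rather than the periodic checkpoints appended above)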
    f2 = open('result.txt', 'w')
    f2.write(json.dumps(clusters) + "\n")
    f3 = open('vocab.txt', 'w')
    f4 = open('vectorizer.txt', 'w')
    f5 = open('inv_index.txt', 'w')
    f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
    f4.write(json.dumps(vectorizer.idf_) + "\n")
    f5.write(json.dumps(inv_index))
    #print clusters
    #print vectorizer.vocabulary_
    f2.close()
    f3.close()
    f4.close()
    f5.close()


run()
#cProfile.run('run()')