forked from vatika/Automated-Essay-Grading
-
Notifications
You must be signed in to change notification settings - Fork 0
/
graph_diffusion.py
346 lines (328 loc) · 14.7 KB
/
graph_diffusion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
# Copyright 2015 Abhijeet Kumar ,Anurag Ghosh, Vatika Harlalka
# Graph Diffusion Techniques
import warnings
with warnings.catch_warnings():
import csv
import numpy as np
import scipy
import scipy.linalg
from scipy.sparse import csgraph
from scipy.spatial.distance import pdist, squareform
from sklearn.cross_validation import KFold
from sklearn.metrics.pairwise import chi2_kernel, rbf_kernel
import weighted_kappa as own_wp
from sklearn.lda import LDA
# similarity measures
# Remember --> similarity is inversely proportional to distance measure
def gaussian_kernel(X):
    """
    Input:
        X --> The feature set of essays, of size N * D
    Output:
        exp(-|x - y|**2 / s**2) for all x and y belonging to rows(X)
        This is the gaussian (RBF) similarity measure.
    Note:
        s is the kernel bandwidth: larger s flattens the kernel so distant
        essays still get appreciable similarity.
    """
    s = 100
    # Negative squared euclidean distance between every pair of rows.
    pairwise_dists = -1 * squareform(pdist(X, 'euclidean')) ** 2
    # np.exp replaces the deprecated/removed scipy.exp alias.
    return np.exp(pairwise_dists / s ** 2)
def linear_kernel(X):
    """
    Input:
        X --> The feature set of essays, of size N * D
    Output:
        1/|x - y| for all x and y belonging to rows(X)
        This is the linear (inverse-distance) similarity measure.
    Note:
        Entries where the distance is zero (always the diagonal, plus any
        duplicate rows) would become inf/NaN after division; they are set
        to 0.0 instead.
    """
    dists = squareform(pdist(X, 'euclidean'))
    # Silence the divide-by-zero / invalid warnings the zero entries raise;
    # the offending values are masked out immediately below.
    with np.errstate(divide='ignore', invalid='ignore'):
        sims = 1.0 / dists
    # One finite-mask pass replaces the original's three separate fixups
    # (the == -inf branch was unreachable: 1/0.0 is +inf).
    sims[~np.isfinite(sims)] = 0.0
    return sims
class graph_diffusion():
    """
    Weak Supervision Method to classify using spectral graph
    analysis on a transducive setup and a parameterized similarity
    measure between two essays.
    """
    def __init__(self, range_min, range_max, similarity_measure, neighbourhood="exponential"):
        """
        Input:
            range_min --> minimum of the range of marks awarded
            range_max --> maximum of the range of marks awarded
            similarity_measure --> the kernel that is to be used.
                                   Preferably a one-one mapping.
            neighbourhood --> "exponential", "average" or "stochastic";
                              the scaling used when combining the diffusion
                              results at different time scales.
        Output:
            None
        Note:
            "average"     <- plain average of all the predicted values.
            "exponential" <- weighted average with weight e^-i for the
                             i'th iteration of the heat matrix.
            "stochastic"  <- one value chosen at random with probability
                             proportional to e^-i for the i'th iteration.
        """
        self.range_min = range_min
        self.range_max = range_max
        self.similarity_measure = similarity_measure
        self.neighbourhood = neighbourhood

    def calculate_degree_matrix(self, W):
        """
        Input:
            W --> pairwise similarity matrix, interpreted as a graph.
        Output:
            D --> degree matrix of W: column sums placed on a diagonal.
        """
        return np.diag(np.asarray(W).sum(axis=0))

    # make the similarity matrix sparsified
    # reasons:: manifold learning, SVD computation
    def sparsify(self, W, k, sparse_type):
        """
        Input:
            W --> complete adjacency matrix from some similarity measure
            k --> number of nearest neighbours to consider
            sparse_type --> k-nearest neighbour(0) or mutual k-nearest
                            neighbour(1) or completely connected(2)
                            (no sparsification)
        Output:
            S --> sparsified graph: at most the k largest values per row
                  are kept, the rest reduced to zero.
            k-NN keeps an edge if it is in the top k of EITHER endpoint;
            mutual k-NN keeps it only if it is in the top k of BOTH.
        Note:
            Bug fixes vs. the original:
            * k is clamped to the matrix width, so k > N no longer raises
              an IndexError on the sorted matrix.
            * every entry tied with the j-th largest value is kept (the
              original iterated len() of the np.where tuple, which is
              always 1, so only the first tie was kept).
            * debug prints removed.
        """
        if sparse_type == 2:
            return W
        shape = np.shape(W)
        k = min(k, shape[1])  # cannot keep more neighbours than nodes
        S = np.zeros(shape)
        T = np.fliplr(np.sort(W, 1))  # each row sorted descending
        for i in range(shape[0]):
            for j in range(k):
                temp = T[i, j]
                # keep every column tied with the j-th largest row value
                for loc in np.where(W[i] == temp)[0]:
                    S[i, loc] = temp
                    if sparse_type == 0:
                        # symmetrize: edge kept if in top-k of either node
                        S[loc, i] = temp
        if sparse_type == 0:
            return S
        # NOTE(review): the original marked mutual k-NN as "not working";
        # the logic is preserved as-is pending a proper fix.
        else:
            M = (S + S.T) / 2
            for i in range(shape[0]):
                for j in range(shape[1]):
                    if M[i, j] != S[j, i]:
                        S[i, j] = 0
            return S

    # graph adjacency matrix formulation
    def formulate_graph_laplacian(self, X):
        """
        Input:
            X --> The feature set of essays, of size N * D
        Output:
            L --> normalized Laplacian D^(-1/2)*(D - W)*D^(-1/2) of the
                  graph W built from the pairwise similarities of X.
                  Normalized Laplacians are known to work marginally
                  better in graph diffusion.
            D --> the degree matrix of W.
        """
        W = self.similarity_measure(X)
        W = self.sparsify(W, 100, 0)
        D = self.calculate_degree_matrix(W)
        return csgraph.laplacian(W, normed=True), D

    def train(self, x_train, x_test, y_train):
        """
        Input:
            x_train --> the training samples.
            x_test  --> the testing samples.
            y_train --> the ground truth of the training samples.
        Output:
            None (stores E_val / E_vec_U, the solutions of the generalized
            eigen equation L*X = (lambda)*D*X over the joint sample set).
        Note:
            Y --> n*l (no of categories) matrix; a row has values 1, -1, 0
            for present, not present and unknown.
            The formulation is transducive: the test set is known at
            training time.
            Impl. remark: scipy.linalg.eigh is used instead of
            scipy.linalg.eig because D is a diagonal (symmetric) matrix.
            Bug fix: the label offset now uses self.range_min (the
            original read a module-level global `range_min`).
        """
        self.test_size = len(x_test)
        self.train_size = len(y_train)
        self.dim = self.range_max - self.range_min + 1
        self.Y = np.zeros((self.train_size + self.test_size, self.dim))
        for itx in range(self.train_size):
            self.Y[itx, :] = -1
            # subtract range_min to compensate for a non-zero range start
            # (the inverse shift happens in self.predict)
            self.Y[itx, int(y_train[itx]) - self.range_min] = 1
        self.X = np.concatenate((x_train, x_test), 0)
        self.L, self.D = self.formulate_graph_laplacian(self.X)
        self.E_val, self.E_vec_U = scipy.linalg.eigh(self.L, self.D)

    def exp_neighbourhood(self, Z):
        """
        Input:
            Z --> heat-matrix predictions at different time scales,
                  one column per iteration.
        Output:
            Weighted average of the predictions with weight e^-i for the
            i'th iteration.
        Note:
            Bug fix: iterates over self.itr (the original read an
            undefined global `itr`).
        """
        ans = np.zeros(self.test_size)
        weighted_denom = 0.0
        for i in range(self.itr):
            w = np.exp(-i)
            weighted_denom += w
            ans += w * Z[:, i]
        return np.round(ans / weighted_denom)

    def average_neighbourhood(self, Z):
        """
        Input:
            Z --> heat-matrix predictions at different time scales,
                  one column per iteration.
        Output:
            Plain average of all the predicted values.
        Note:
            Bug fix: divides by self.itr (the original read an undefined
            global `itr`).
        """
        return np.round(Z.sum(axis=1) / self.itr)

    def stochastic_neighbourhood(self, Z):
        """
        Input:
            Z --> heat-matrix predictions at different time scales,
                  one column per iteration.
        Output:
            For each test point, one value drawn from its row of Z with
            probability proportional to e^-i for iteration i.
        Note:
            Should converge to the "exponential" method given enough
            iterations.
        """
        weights = np.exp(-np.arange(self.itr, dtype=float))
        weights /= weights.sum()
        ans = np.zeros(self.test_size)
        for i in range(self.test_size):
            ans[i] = np.random.choice(Z[i, :], p=weights)
        return ans

    def predict(self):
        """
        Input: (indirect)
            x_test --> supplied during train(), as the setup is transducive.
        Output:
            Predictions for x_test, combined over heat matrices at times
            t = k*(a^i), where the heat matrix is
                H(t) = E_vec * exp(-E_val * t) * transpose(E_vec)
            and the diffused labels are Y(t) = H(t) * Y(0).
            The argmax over a row of Y(t) gives that row's class at time t;
            the configured neighbourhood rule then combines the per-time
            classifications into the final estimate.
        Note:
            Small t implies local diffusion, large t global diffusion.
        """
        self.itr = 5
        Z = np.zeros((self.test_size, self.itr))
        for i in range(self.itr):
            t = 0.000000001 * (100 ** i)
            decay = np.exp(-self.E_val * t)
            H1 = np.dot(np.dot(self.E_vec_U, np.diag(decay)), self.E_vec_U.T)
            Y1 = np.dot(H1, self.Y)
            Z1 = np.zeros(self.test_size)
            # only the unlabeled tail (x_test) gets labeled
            for j in range(self.train_size, len(Y1)):
                # add range_min back to compensate for the shift in train()
                Z1[j - self.train_size] = np.argmax(Y1[j, :]) + self.range_min
            Z[:, i] = Z1
        if self.neighbourhood == "exponential":
            return self.exp_neighbourhood(Z)
        elif self.neighbourhood == "average":
            return self.average_neighbourhood(Z)
        elif self.neighbourhood == "stochastic":
            return self.stochastic_neighbourhood(Z)
        else:
            # ValueError is a subclass of the BaseException raised before,
            # so existing handlers still catch it.
            raise ValueError("Unsupported Voting Measure")
class k_fold_cross_validation(object):
    """
    Runs k-fold cross validation of a transducive grader class and
    reports the mean quadratic weighted kappa across the folds.
    """
    def __init__(self, k, stat_class, x_train, y_train, range_min, range_max, similarity_measure):
        """
        Input:
            k --> number of folds.
            stat_class --> grader class to instantiate per fold
                           (e.g. graph_diffusion).
            x_train, y_train --> full feature matrix and ground truth.
            range_min, range_max --> mark range, forwarded to stat_class.
            similarity_measure --> kernel, forwarded to stat_class.
        """
        self.k_cross = float(k)
        self.stat_class = stat_class
        self.x_train = x_train
        self.y_train = y_train
        self.values = []  # per-fold kappa scores
        self.range_min = range_min
        self.range_max = range_max
        self.similarity_measure = similarity_measure

    def execute(self):
        """
        Output:
            str(mean quadratic weighted kappa over all folds).
        Note:
            Bug fix: the grader is now constructed with self.range_min /
            self.range_max; the original read module-level globals
            range_min / range_max, which only exist when the file is run
            as a script.
        """
        kf = KFold(len(self.x_train), n_folds=self.k_cross)
        for train_idx, test_idx in kf:
            x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
            y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
            stat_obj = self.stat_class(range_min=self.range_min,
                                       range_max=self.range_max,
                                       similarity_measure=self.similarity_measure,
                                       neighbourhood="stochastic")
            stat_obj.train(x_train, x_test, y_train)
            y_pred = np.matrix(stat_obj.predict()).T
            cohen_kappa_rating = own_wp.quadratic_weighted_kappa(
                y_test, y_pred, self.range_min, self.range_max)
            self.values.append(cohen_kappa_rating)
        return str(sum(self.values) / self.k_cross)
if __name__ == "__main__":
    # Run graph-diffusion grading with 5-fold cross validation on each
    # available essay set and print the mean quadratic weighted kappa.
    for i in [1,3,4,5,6]: #to change after feature extraction done for all sets
        # training data: one pre-extracted feature CSV per essay set
        train_data = []
        with open('./Data/features_'+str(i)+'.csv','r') as in_file:
            csv_content = list(csv.reader(in_file,delimiter=','))
            for row in csv_content:
                train_data.append(row)
        header = train_data[0]
        train_data = train_data[1:] #clip the header
        train_data = np.matrix(train_data,dtype='float64')
        # Column 2 holds the ground-truth score. It is also the first
        # column of the X_train slice below, but that column is then
        # overwritten with bias units, so the label does not leak.
        Y_train = train_data[:,2].copy() #actual_values
        X_train = train_data[:,2:].copy() #actual_data with random bias units
        m = np.size(X_train,axis=0)
        X_train[:,0] = np.ones((m,1)) #bias units modified
        cross_valid_k = 5
        # Mark ranges per essay set — presumably the source dataset's
        # published score ranges; confirm against the data description.
        range_max = range_min = 0
        if i == 1:
            range_min = 2
            range_max = 12
        elif i == 3 or i == 4:
            range_max = 3
        elif i == 5 or i == 6:
            range_max = 4
        # Optional LDA dimensionality reduction, currently disabled.
        #dim_red = LDA()
        #X_train = dim_red.fit_transform(X_train, Y_train)
        diffusion_k_cross = k_fold_cross_validation(cross_valid_k, \
                            graph_diffusion, X_train,Y_train, \
                            range_min,range_max,gaussian_kernel)
        print diffusion_k_cross.execute()