/
testHMM.py
264 lines (219 loc) · 8.7 KB
/
testHMM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 15 16:25:43 2015
@author: John
"""
"""
This is a script to test the file hmm.py.
It currently uses the 2005-06 data to train whether each student is in a
knowledgeable or "dunno" state as a hidden state.
Description of HMM states
Start assuming we have only two hidden states: [Dunno, Know]
startP = [.9,.1] respectively
Because we don't know the transition probabilities, we fake them.
Fake transition probabilities:[[.8,.2],[.01,.99]]
IE assume they can move from Dunno to Know with 20% prob and stay Dunno with
80% prob. And once they Know, they stay at Know with 99% and only go back with
1%.
Again, we fake the emission probabilities:
Observable states are: [first correct, corrects > incorrects, otherwise]
say if they Know then first corrects will be [.1, .9] (get it right first try 90%)
and c>i at [.2, .8]
and incorrects as [.1, .9]
Similarly if dunno:
first corrects: [.5, .5]
c>i: [.7, .3]
incorrects: [.5, .5]
"""
import hmm as hmm
import numpy as np
import loader as ld
import tagger as tg
import ID_assigner as ida
import matplotlib.pyplot as plt
# Keys of the loaded data whose values processor() passes through
# ld.convert_times.
time_strings = [
    'Step Start Time',
    'First Transaction Time',
    'Correct Transaction Time',
    'Step End Time',
]

# Keys of the loaded data whose values processor() passes through
# ida.ID_assigner.
id_strings = [
    'Anon Student Id',
    'Problem Name',
]
def processor(data):
    """
    Load a data file and repack the fields of interest into numpy arrays.

    This is a functional form of data_processor sans some features.

    Inputs
    ------
    data : string
        path and file name of data (e.g. ld.trainDat)

    Returns
    -------
    xy_keys : list
        list of keys for data dictionary, as returned by ld.loader
    dat_array : ndarray
        parsed data, one row per record and 14 columns in this order:
        0 Anon Student Id, 1 Problem Name, 2 Problem Hierarchy,
        3 Incorrects, 4 Hints, 5 Corrects, 6 Correct First Attempt,
        7 Problem View, 8 Step Start Time, 9 First Transaction Time,
        10 Correct Transaction Time, 11 Step End Time,
        12 Step Duration (sec), 13 result of ld.check_final_answer on
        Step Name
    tag_master : list
        list of tags from Knowledge Component data
    tag_array : ndarray
        array of knowledge component presence in each question
    opp_array : ndarray
        array of opportunity count for each component in each question
    """
    xy_keys, xy_train = ld.loader(data)

    # Process time strings to seconds
    for name in time_strings:
        print('Processing ' + name)
        xy_train[name] = ld.convert_times(xy_train[name])

    # Convert Step Duration to seconds
    xy_train['Step Duration (sec)'] = (xy_train['Step End Time'] -
                                       xy_train['Step Start Time'])

    # Process string ids. The lookup tables returned next to the converted
    # columns are not part of this function's result, so they are dropped.
    for name in id_strings:
        print('Processing ' + name)
        xy_train[name], _ = ida.ID_assigner(xy_train[name])
    xy_train['Problem Hierarchy'], _, _ = ida.unit_ID_assigner(
        xy_train['Problem Hierarchy'])

    # Size the output from the data loaded here. The row count used to come
    # from a module-level name that is only assigned after this function has
    # already been called.
    # NOTE(review): assumes every column has the same length as
    # 'Anon Student Id' -- confirm against ld.loader.
    num_rows = len(xy_train['Anon Student Id'])

    # These are the variables I care about at the moment, add if want more
    # - JGL. KC(Default) and Opportunity(Default) go in separate arrays.
    dat_array = np.empty([num_rows, 14])
    dat_array[:, 0] = xy_train['Anon Student Id']
    dat_array[:, 1] = xy_train['Problem Name']
    dat_array[:, 2] = xy_train['Problem Hierarchy']
    dat_array[:, 3] = np.array(xy_train['Incorrects'], dtype=int)
    dat_array[:, 4] = np.array(xy_train['Hints'], dtype=int)
    dat_array[:, 5] = np.array(xy_train['Corrects'], dtype=int)
    dat_array[:, 6] = np.array(xy_train['Correct First Attempt'], dtype=int)
    dat_array[:, 7] = np.array(xy_train['Problem View'], dtype=int)
    dat_array[:, 8] = xy_train['Step Start Time']
    dat_array[:, 9] = xy_train['First Transaction Time']
    dat_array[:, 10] = xy_train['Correct Transaction Time']
    dat_array[:, 11] = xy_train['Step End Time']
    dat_array[:, 12] = xy_train['Step Duration (sec)']
    dat_array[:, 13] = ld.check_final_answer(xy_train['Step Name'])

    # Process Knowledge components
    tag_master = tg.string_tags(xy_train['KC(Default)'])

    # Process opportunity
    tag_array, opp_array = tg.tags_to_array(
        xy_train['KC(Default)'],
        xy_train['Opportunity(Default)'],
        tag_master)

    return xy_keys, dat_array, tag_master, tag_array, opp_array
def normalize(dataArray):
    """
    Flag which entries of an array are non-zero.

    Inputs
    ------
    dataArray : ndarray
        numeric data (any array-like is accepted)

    Returns
    -------
    norm : ndarray
        float array of the same shape as the input, 1 where the entry is
        non-zero and 0 where it is zero
    """
    # The element-wise comparison replaces an index loop that iterated over
    # range(dataArray) instead of range(len(dataArray)) and therefore raised
    # a TypeError for every array input.
    return (np.asarray(dataArray) != 0).astype(float)
def smarter(correctData, incorrectData):
    """
    Flag the steps on which a student's corrects outweigh their incorrects.

    Inputs
    ------
    correctData : ndarray
        number of corrects on each step
    incorrectData : ndarray
        number of incorrects on each step

    Returns
    -------
    smart : ndarray
        float array with one entry per step: 1 where corrects is strictly
        greater than .9*incorrects, 0 otherwise
    """
    n_steps = len(correctData)
    flags = [1.0 if correctData[i] > .9*incorrectData[i] else 0.0
             for i in range(n_steps)]
    return np.array(flags, dtype=float)
#The following is just a script from before the milestone to test the forward-
#backward algorithm and its predictions

#Import data
# processor() hands back the parsed data as a plain 2-D ndarray, not as the
# loader's dictionary, so fields are addressed by column position below.
# Indexing the array with string keys raises IndexError.
xy_keys, xy_train, tags, tagA, oppA = processor(ld.trainDat)

# Column positions in the array built by processor()
COL_STUDENT_ID = 0
COL_INCORRECTS = 3
COL_HINTS = 4
COL_CORRECTS = 5
COL_FIRST_CORRECT = 6

#relevant scales
tagLen = len(tags)
datLen = xy_train.shape[0]

#Locations of splits by student id: index of the first row of every student
#after the first one.
# NOTE(review): assumes rows of one student are contiguous and that the key
# previously used here (xy_keys[1]) named the student id column -- confirm.
studentIds = xy_train[:, COL_STUDENT_ID]
idSplit = (np.flatnonzero(studentIds[1:] != studentIds[:-1]) + 1).tolist()

#Relevant data for hmm test based on earlier definitions
data = np.zeros([datLen, 5])
data[:, 0] = studentIds
data[:, 1] = xy_train[:, COL_FIRST_CORRECT]
data[:, 2] = xy_train[:, COL_HINTS]
data[:, 3] = xy_train[:, COL_INCORRECTS]
data[:, 4] = xy_train[:, COL_CORRECTS]

#Reprocess corrects into more corrects than incorrects data
data[:, 4] = smarter(data[:, 4], data[:, 3])

#Create observation data states
#[0,1,2] = [otherwise, c>i, correct]
# One point for "more corrects than incorrects", one more for a correct
# first attempt.
observations = ((data[:, 4] > 0).astype(float) +
                (data[:, 1] > 0).astype(float))

#Number of students
numStud = len(idSplit)+1
def hmm_tester(x, start, trans, emit):
    """
    This is a function to test the forward-backward algorithm in hmm.py. Splits
    by student and runs f-b on all steps up to n-1, then compares prediction to
    nth step

    The student boundaries (idSplit) and the actual outcomes (column 1 of
    data) are read from module level; everything else comes from the
    arguments.

    Inputs
    ------
    x : ndarray
        training observation data, nx1
    start : ndarray
        starting probabilities, kx1
    trans : ndarray
        transition probabilities, kxk
    emit : ndarray
        emission probabilities, kxd

    Returns
    -------
    rmse : ndarray
        one entry per student: square root of the squared difference between
        the predicted probability of a first-attempt correct on the
        student's last step and the recorded value for that step. This is
        currently not using the test data.
    """
    emit = np.asarray(emit)

    # Row boundaries of every student's block: student j owns rows
    # bounds[j] .. bounds[j+1]-1. Handling first, middle and last students
    # through one list keeps each result in that student's own slot and also
    # covers the case of a single student (empty idSplit).
    bounds = [0] + list(idSplit) + [len(x)]
    num_students = len(bounds) - 1

    #Initialize array of predictions, probability of correct on next question
    predicts = np.zeros(num_students)
    #Initialize array for rmse to compare to actual test data
    rmse = np.zeros(num_students)

    for j in range(num_students):
        # The student's last step is held out as the prediction target.
        last = bounds[j+1] - 1
        _, _, _, _, post = hmm.frwd_bkwd(x[bounds[j]:last],
                                         start, trans, emit)
        # Column 2 of emit is the emission probability of observation 2.
        # NOTE(review): for a row-stochastic trans the next-state
        # distribution would be np.dot(post[-1], trans); orientation kept
        # as written -- confirm against the convention used in hmm.py.
        predicts[j] = np.dot(emit[:, 2], np.dot(trans, post[-1]))
        rmse[j] = np.sqrt((data[last, 1] - predicts[j])**2)
    return rmse
"""
error = np.zeros(20)
for k in range(20):
p = k*.005 +.85
#startP = [p, 1-p]
startP = np.array([.99, .01])
transP = np.array([[p,1-p],[.01,.99]])
emitP = np.array([[.7,.2,.1],[.1,.3,.6]])
rmse = hmm_tester(observations, startP, transP, emitP)
predictions = [rmse[l] for l in range(numStud) if not np.isnan(rmse[l])]
error[k] = 1-np.average(predictions)
plt.scatter(np.arange(20),error)
"""