forked from icantrell/Natural-Language-Processing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
nlp.py
479 lines (350 loc) · 20 KB
/
nlp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
import numpy as np
from copy import deepcopy
import re
import pickle
from functools import reduce
from scipy import random
# Token pattern used by parse(). Alternatives are tried left to right, so the
# honorifics "Mrs.", "Mr." and "Ms." are kept as single tokens (with their
# period) before a bare "." can match. Written as a raw string so the
# backslashes reach the regex engine unchanged.
_PARSE_REGEX = r'''Mrs\.|Mr\.|Ms\.|\.|;|:|\?|!|'|\w+|\d+|,|"'''


def parse(corpus):
    '''Parse the document into its words.

    corpus: the text to tokenize (a string).

    Returns a list of lower-cased tokens in document order. A token is one
    of the honorifics "mrs.", "mr.", "ms.", a run of word characters, or a
    single punctuation mark from . ; : ? ! ' , ". Any other character
    (whitespace, other symbols) is skipped.
    '''
    return [token.lower() for token in re.findall(_PARSE_REGEX, corpus)]
# A token ends a sentence when it starts with '.', '!' or '?'.
# Compiled once at import time instead of on every call.
_SENTENCE_END = re.compile(r'[.!?]')


def get_sentences(corpus, n=1):
    '''Split parsed text into groups of n sentences.

    corpus: an iterable of tokens (for example the output of parse()).
    n: number of sentences per group; must be at least 1.

    Returns a list of groups. Each group is a flat list holding the tokens
    of n consecutive sentences, terminators included. Tokens that follow the
    last complete group (an unterminated sentence, or fewer than n trailing
    sentences) are not returned.

    Raises ValueError if n is smaller than 1.
    '''
    if n < 1:
        raise ValueError('n must be at least 1.')

    groups = []
    current_group = []
    sentences_seen = 0
    for word in corpus:
        current_group.append(word)
        if _SENTENCE_END.match(word):
            sentences_seen += 1
            # Every n-th terminator closes the current group.
            if sentences_seen % n == 0:
                groups.append(current_group)
                current_group = []
    return groups
class hmm:
    '''Discrete hidden Markov model over a vocabulary of observations.

    Attributes
    ----------
    transition_matrix : N x N numpy matrix
        Entry [i, j] is the probability of moving from state i to state j,
        so every row sums to 1.
    initial_matrix : 1 x N numpy matrix
        Entry [0, i] is the probability of starting in state i.
    emission_matrix : dict
        Maps each observation (word) to a 1 x N numpy matrix whose entry
        [0, i] is the probability that state i emits that observation. For a
        fixed state the values over all observations sum to 1.
    number_of_states : int
        N as passed to randomize().
    state_tag_map : numpy array
        Optional mapping from state index to a tag, filled in by
        map_internal_states(). While its length equals N, viterbi() returns
        tags instead of state indices.

    The computations convert the parameters with numpy.asarray, so plain
    ndarrays of the same shapes are accepted as well.
    '''

    def __init__(self):
        # Test parameters from the paper listed in the file's references:
        #   transition_matrix = [[.3, .7], [.1, .9]]
        #   initial_matrix    = [.85, .15]
        #   emission_matrix   = {'A': [.4, .5], 'B': [.6, .5]}
        #   number_of_states  = 2
        # Start with empty containers; call randomize() or load_from_file().
        self.transition_matrix = np.empty([0])
        self.initial_matrix = np.empty([0])
        self.emission_matrix = {}
        self.number_of_states = 0
        self.state_tag_map = np.array([])

    # ------------------------------------------------------------------
    # Parameter access helpers
    # ------------------------------------------------------------------
    def _transitions(self):
        '''Return the transition probabilities as an N x N float ndarray.'''
        return np.asarray(self.transition_matrix, dtype=np.float64)

    def _initial(self):
        '''Return the initial state probabilities as a length-N ndarray.'''
        return np.asarray(self.initial_matrix, dtype=np.float64).ravel()

    def _emissions(self, observation):
        '''Return the per-state emission probabilities of one observation.

        Raises KeyError if the observation is not in emission_matrix.
        '''
        return np.asarray(self.emission_matrix[observation], dtype=np.float64).ravel()

    # ------------------------------------------------------------------
    # Initialisation
    # ------------------------------------------------------------------
    def randomize(self, observations, num_states):
        '''Initialise the model with random parameters.

        observations: iterable of the distinct symbols the model can emit.
            Duplicates are ignored so each state's emission distribution
            still sums to 1.
        num_states: number of hidden states.

        All distributions are drawn from a flat Dirichlet via numpy.random,
        so numpy.random.seed() makes the result reproducible. Any emission
        entries from a previous call are discarded.
        '''
        vocabulary = list(dict.fromkeys(observations))
        self.number_of_states = num_states
        # Each row (one state's outgoing transitions) sums to 1.
        self.transition_matrix = np.matrix(
            np.random.dirichlet(np.ones(num_states), size=num_states))
        # The initial state probabilities sum to 1.
        self.initial_matrix = np.matrix(np.random.dirichlet(np.ones(num_states)))
        # One distribution over the vocabulary per state (rows sum to 1).
        emission_probs = np.random.dirichlet(np.ones(len(vocabulary)), size=num_states)
        self.emission_matrix = {
            word: np.matrix(emission_probs[:, column])
            for column, word in enumerate(vocabulary)
        }

    # ------------------------------------------------------------------
    # Forward / backward
    # ------------------------------------------------------------------
    def _forward_algorithm(self, arr):
        '''Compute the forward probabilities of an observation sequence.

        Returns a len(arr) x N numpy matrix whose entry [t, i] is the joint
        probability of the first t + 1 observations and being in state i at
        time t. The last row sums to the probability of the whole sequence.
        Probabilities are not rescaled, so very long sequences can underflow.
        Raises KeyError for an observation missing from emission_matrix.
        '''
        transitions = self._transitions()
        forward = np.zeros((len(arr), len(transitions)), dtype=np.float64)
        if len(arr):
            forward[0] = self._initial() * self._emissions(arr[0])
            for t in range(1, len(arr)):
                # Sum over predecessors i of forward[t-1, i] * T[i, j], then
                # weight by the probability that state j emits arr[t].
                forward[t] = self._emissions(arr[t]) * forward[t - 1].dot(transitions)
        return np.matrix(forward)

    def _backward_algorithm(self, arr):
        '''Compute the backward probabilities of an observation sequence.

        Returns a len(arr) x N numpy matrix whose entry [t, i] is the
        probability of the observations after time t given state i at time
        t. The last row is all ones.
        '''
        transitions = self._transitions()
        steps = len(arr)
        backward = np.ones((steps, len(transitions)), dtype=np.float64)
        for t in range(steps - 2, -1, -1):
            # Sum over successors j of T[i, j] * e_j(arr[t+1]) * backward[t+1, j].
            backward[t] = transitions.dot(self._emissions(arr[t + 1]) * backward[t + 1])
        return np.matrix(backward)

    # ------------------------------------------------------------------
    # Decoding
    # ------------------------------------------------------------------
    def viterbi(self, arr):
        '''Return the most likely hidden state sequence for arr.

        The search runs in log space so long sequences do not underflow.

        Returns an int64 array of state indices, one per observation. When
        state_tag_map has one entry per state, the corresponding tags are
        returned instead. If an observation is unknown a message is printed
        and an empty list is returned; an empty input also yields an empty
        list.
        '''
        steps = len(arr)
        if steps == 0:
            return []
        for word in arr:
            if word not in self.emission_matrix:
                print('"' + word + '" is not recognized.')
                return []

        num_states = len(self.transition_matrix)
        # log(0) is -inf, which is exactly the score an impossible path needs.
        with np.errstate(divide='ignore'):
            log_transitions = np.log(self._transitions())
            log_initial = np.log(self._initial())
            log_emissions = [np.log(self._emissions(word)) for word in arr]

        scores = log_initial + log_emissions[0]
        backpointers = np.zeros((steps - 1, num_states), dtype=np.int64)
        columns = np.arange(num_states)
        for t in range(1, steps):
            # candidates[i, j]: best score of a path ending in i, then i -> j.
            candidates = scores[:, np.newaxis] + log_transitions
            best_previous = candidates.argmax(axis=0)
            scores = candidates[best_previous, columns] + log_emissions[t]
            backpointers[t - 1] = best_previous

        # Follow the back pointers from the most likely final state.
        path = np.empty(steps, dtype=np.int64)
        path[-1] = scores.argmax()
        for t in range(steps - 1, 0, -1):
            path[t - 1] = backpointers[t - 1, path[t]]

        if len(self.state_tag_map) == num_states:
            return np.asarray(self.state_tag_map)[path]
        return path

    # ------------------------------------------------------------------
    # Baum-Welch training
    # ------------------------------------------------------------------
    def _probability_of_transitions(self, f_array, b_array, probability_of_arr, arr):
        '''Posterior probability of each transition at each time step.

        Returns a len(arr) x N x N ndarray whose entry [t, i, j] is the
        probability of being in state i at time t and state j at time t + 1
        given the whole sequence. The slice for the last time step stays
        zero because no transition leaves it.
        '''
        transitions = self._transitions()
        forward = np.asarray(f_array, dtype=np.float64)
        backward = np.asarray(b_array, dtype=np.float64)
        num_states = len(transitions)
        prob_of_transitions = np.zeros((len(arr), num_states, num_states), dtype=np.float64)
        for t in range(len(arr) - 1):
            prob_of_transitions[t] = (
                forward[t][:, np.newaxis] * transitions
                * self._emissions(arr[t + 1]) * backward[t + 1]
                / probability_of_arr
            )
        return prob_of_transitions

    def _probability_of_states(self, prob_of_transitions, f_array, probability_of_arr, arr):
        '''Posterior probability of each state at each time step.

        Sums the transition posteriors over the destination state. The last
        time step has no outgoing transition, so it is taken from the final
        forward probabilities instead. Returns a len(arr) x N ndarray.
        '''
        prob_of_states = prob_of_transitions.sum(2)
        prob_of_states[-1] = np.asarray(f_array, dtype=np.float64)[-1] / probability_of_arr
        return prob_of_states

    @staticmethod
    def __epoch():
        '''Unused placeholder kept from the original implementation.'''
        pass

    def baum_welch(self, arrs, max_epochs=None):
        '''Re-estimate the model parameters from observation sequences.

        arrs: a sequence of observation sequences (lists of words that are
            all present in emission_matrix).
        max_epochs: optional upper bound on the number of training passes;
            None (the default) trains until the stop condition below.

        Each epoch runs forward-backward on every sequence, prints progress
        and the summed/mean sequence probabilities, then replaces the
        initial, transition and emission parameters. Training stops once the
        summed sequence probability of an epoch is no larger than that of
        the previous epoch. Sequences with probability 0 (and empty
        sequences) are skipped for that epoch; if every sequence is skipped
        the parameters are left untouched. Rows belonging to a state that was
        never occupied keep their previous values instead of becoming NaN.
        '''
        sequence_count = len(arrs)
        if sequence_count == 0:
            return
        num_states = len(self.transition_matrix)
        progress_step = sequence_count * 0.1
        # Starting from zero makes the first comparison pass whenever at
        # least one sequence has a non-zero probability.
        last_probs = np.zeros(sequence_count, dtype=np.float64)
        epoch = 0
        while max_epochs is None or epoch < max_epochs:
            epoch += 1
            current_probs = np.zeros(sequence_count, dtype=np.float64)
            not_used = 0
            # Expected counts accumulated over all sequences.
            initial_sum = np.zeros(num_states, dtype=np.float64)
            transition_sum = np.zeros((num_states, num_states), dtype=np.float64)
            occupancy_sum = np.zeros(num_states, dtype=np.float64)
            departing_sum = np.zeros(num_states, dtype=np.float64)
            emission_sum = {
                word: np.zeros(num_states, dtype=np.float64)
                for word in self.emission_matrix
            }

            for i, arr in enumerate(arrs):
                if int(i % progress_step) == 0:
                    print(i / float(sequence_count))
                if len(arr) == 0:
                    not_used += 1
                    continue
                f_array = np.asarray(self._forward_algorithm(arr))
                # Probability of this observation sequence under the model.
                probability_of_arr = f_array[-1].sum()
                current_probs[i] = probability_of_arr
                if probability_of_arr == 0.0:
                    print('sentence '+str(i)+' with value \"' + ' '.join(arr) + '\" has probability of 0 and was not counted in this iteration.')
                    not_used += 1
                    continue
                b_array = np.asarray(self._backward_algorithm(arr))
                prob_of_transitions = self._probability_of_transitions(
                    f_array, b_array, probability_of_arr, arr)
                prob_of_states = self._probability_of_states(
                    prob_of_transitions, f_array, probability_of_arr, arr)

                initial_sum += prob_of_states[0]
                occupancy_sum += prob_of_states.sum(0)
                # Occupancy of every step except the last: the number of
                # times each state was left.
                departing_sum += prob_of_states[:-1].sum(0)
                transition_sum += prob_of_transitions.sum(0)
                # Credit each observed word with the state posterior of its
                # own time step only.
                for t, word in enumerate(arr):
                    emission_sum[word] += prob_of_states[t]

            print('current probs:')
            print(current_probs.sum())
            print(current_probs.mean())
            print('last probs:')
            print(last_probs.sum())
            print(last_probs.mean())

            used = sequence_count - not_used
            if used == 0:
                # Nothing to learn from; keep the current parameters.
                break

            # Average the initial state posteriors over the used sequences.
            initial_m = np.matrix(initial_sum / used)
            if not np.isclose(initial_m.sum(), 1.0):
                print('here')

            # Normalise each state's outgoing transitions.
            has_departures = departing_sum > 0
            new_transitions = self._transitions().copy()
            new_transitions[has_departures] = (
                transition_sum[has_departures]
                / departing_sum[has_departures, np.newaxis]
            )

            # Normalise each state's emissions by its total occupancy.
            visited = occupancy_sum > 0
            new_emissions = {}
            for word, total in emission_sum.items():
                updated = self._emissions(word).copy()
                updated[visited] = total[visited] / occupancy_sum[visited]
                new_emissions[word] = np.matrix(updated)

            # Now update the model's parameters.
            self.initial_matrix = initial_m
            self.transition_matrix = np.matrix(new_transitions)
            self.emission_matrix = new_emissions

            if not current_probs.sum() > last_probs.sum():
                break
            last_probs = current_probs
        return

    # ------------------------------------------------------------------
    # Sampling
    # ------------------------------------------------------------------
    def __roulette(self, vec):
        '''Draw an index at random according to the distribution vec.

        vec: probabilities that sum to 1.
        Returns a Python int in range(len(vec)).
        Raises ValueError if vec does not sum to 1.
        '''
        weights = np.asarray(vec, dtype=np.float64).ravel()
        if not np.isclose(weights.sum(), 1.0):
            raise ValueError('vec must be a probability distribution that sums to 1.')
        cumulative = np.cumsum(weights)
        draw = np.random.random()
        # First index whose cumulative probability exceeds the draw.
        index = int(np.searchsorted(cumulative, draw, side='right'))
        # Rounding can leave the total marginally below the draw; clamp.
        return min(index, len(weights) - 1)

    def talk(self, n=1, max_words=None):
        '''Sample n sentences from the model and print them.

        Each sampled (word, state) pair is printed on its own line. A
        sentence ends after '.', '?' or '!' is emitted.

        max_words: optional cap on the words printed per sentence, useful
            when the vocabulary may contain no terminator; None (the
            default) means no cap.
        '''
        words = list(self.emission_matrix)
        # Row s is state s's distribution over `words`.
        state_dists = np.array([self._emissions(word) for word in words]).T
        initial = self._initial()
        transitions = self._transitions()
        for _ in range(n):
            state = self.__roulette(initial)
            output = words[self.__roulette(state_dists[state])]
            emitted = 0
            while True:
                print((output, state))
                emitted += 1
                if output in ('.', '?', '!'):
                    break
                if max_words is not None and emitted >= max_words:
                    break
                state = self.__roulette(transitions[state])
                output = words[self.__roulette(state_dists[state])]

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------
    def save_to_file(self, filename):
        '''Save hmm parameters to a file (pickle protocol 2).'''
        with open(filename, 'wb') as f:
            pickle.dump(self.__dict__, f, 2)

    def load_from_file(self, filename):
        '''Load hmm parameters from a file and set this hmm's parameters to them.

        NOTE: unpickling can execute arbitrary code; only load files that
        come from a trusted source.
        '''
        with open(filename, 'rb') as f:
            self.__dict__.update(pickle.load(f))

    # ------------------------------------------------------------------
    # Mapping states to tags
    # ------------------------------------------------------------------
    def map_internal_states(self, tagged_sequences):
        '''Map the numeric hidden states to meaningful tags.

        tagged_sequences: a list of sequences of (word, tag) pairs. The
            number of distinct tags must equal the number of states.

        Every sequence is decoded with viterbi(); co-occurrences of decoded
        state and given tag are counted, and states are paired one-to-one
        with tags by stable matching on those counts. The result is stored
        in state_tag_map (indexed by state). Sequences containing an
        unrecognised word are skipped.

        Returns an empty list (after printing a message) when the number of
        tags differs from the number of states; otherwise returns None.
        '''
        num_states = len(self.transition_matrix)
        # Distinct tags in first-seen order, so the result is deterministic.
        tags = list(dict.fromkeys(
            tag for sequence in tagged_sequences for _, tag in sequence))
        if len(tags) != num_states:
            print('The number of tags should equal the number of states.')
            return []
        tag_index = {tag: index for index, tag in enumerate(tags)}

        # Clear the map so viterbi() returns raw state indices below.
        self.state_tag_map = []
        state_tag_matrix = np.zeros([num_states] * 2, dtype=np.int32)
        for sequence in tagged_sequences:
            our_states = self.viterbi([word for word, _ in sequence])
            # An empty result means the sequence could not be decoded.
            if len(our_states):
                for state, (_, tag) in zip(our_states, sequence):
                    state_tag_matrix[state, tag_index[tag]] += 1

        # Tags propose to states; both sides rank by co-occurrence count.
        tag_to_state = self._gale_shapley(
            np.argsort(-state_tag_matrix.T, kind='stable'),
            np.argsort(-state_tag_matrix, kind='stable'))
        # The matching is indexed by tag; invert it so it is indexed by state.
        state_tags = [None] * num_states
        for index, state in enumerate(tag_to_state):
            state_tags[state] = tags[index]
        self.state_tag_map = np.array(state_tags)

    def _gale_shapley(self, matp, mata):
        '''Solve the stable marriage problem for two preference tables.

        matp: square table; row p lists acceptor indices in proposer p's
            order of preference (most preferred first).
        mata: table of the same shape; row a lists proposer indices in
            acceptor a's order of preference.

        Returns an int32 array mapping each proposer to its acceptor, or an
        empty list if the tables are not square or differ in shape.
        '''
        matp = np.asarray(matp)
        mata = np.asarray(mata)
        if matp.ndim != 2 or matp.shape != mata.shape or matp.shape[0] != matp.shape[1]:
            return []
        size = len(matp)

        # acceptor_rank[a, p]: position of proposer p in a's list (lower is
        # better); `size` marks a proposer that a does not list at all.
        acceptor_rank = np.full((size, size), size, dtype=np.int64)
        for a in range(size):
            acceptor_rank[a, mata[a]] = np.arange(size)

        proposer_marriages = np.full(size, -1, dtype=np.int32)
        acceptor_marriages = np.full(size, -1, dtype=np.int32)
        next_choice = [0] * size
        free_proposers = list(range(size - 1, -1, -1))
        while free_proposers:
            p = free_proposers.pop()
            while next_choice[p] < size:
                a = matp[p][next_choice[p]]
                next_choice[p] += 1
                rank = acceptor_rank[a, p]
                if rank >= size:
                    continue
                current = acceptor_marriages[a]
                if current == -1 or rank < acceptor_rank[a, current]:
                    proposer_marriages[p] = a
                    acceptor_marriages[a] = p
                    if current != -1:
                        # The displaced proposer resumes with its next choice.
                        proposer_marriages[current] = -1
                        free_proposers.append(current)
                    break
        return proposer_marriages
#references:
#http://www.indiana.edu/~iulg/moss/hmmcalculations.pdf
#https://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm
#https://people.eecs.berkeley.edu/~stephentu/writeups/hmm-baum-welch-derivation.pdf