forked from killix/RNN
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_test.py
148 lines (125 loc) · 4.35 KB
/
run_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
__author__ = 'iankuoli'
import numpy
import time
import sys
import subprocess
import os
import random
import functools
import gensim
import re
import logging
import theano
import load
import RNNModel
from accuracy import conlleval
from tools import shuffle, minibatch, contextwin
if __name__ == '__main__':
    # Score every sentence of the cleaned test set with a previously trained
    # RNN and print, per sentence, one comma-separated value per word: the
    # network output at that position for the label of the *following* word
    # (the last position is scored against the "</s>" label).

    # Run configuration. Only 'win', 'nhidden', 'seed' and 'emb_dimension'
    # are read in this script; the remaining keys are kept unchanged.
    s = {
        'fold': 3,             # 5 folds 0,1,2,3,4
        'lr': 0.07,            # 0.0627142536696559,
        'verbose': 1,
        'decay': False,        # decay on the learning rate if improvement stops
        'win': 7,              # number of words in the context window
        'bs': 9,               # number of backprop through time steps
        'nhidden': 1000,       # number of hidden units
        'seed': 1976,
        'emb_dimension': 200,  # dimension of word embedding
        'nepochs': 50,
    }

    # Directory the trained RNN parameters are loaded from.
    folder = 'run'

    # Label used for every word that is not one of the test labels.
    OTHER_LABEL = "XXXXX"

    # Pre-trained word2vec embeddings (feature dimension 200).
    model = gensim.models.Word2Vec.load_word2vec_format('vectors.bin',
                                                        binary=True)

    # Collect the label vocabulary from the raw test file: every alphanumeric
    # token of a line plus the bracketed answer option `[word]`, if any.
    setTestLabels = set()
    extract_test_answer_option = re.compile(r'\[(\w+)\]').search
    with open('testing.txt', 'r') as f_test:
        for line in f_test:
            # The first 4 characters of each line are skipped
            # (presumably a question-number prefix -- TODO confirm).
            for word in line.strip('\n')[4:].split(' '):
                if word.isalnum():
                    setTestLabels.add(word)
            match_obj = extract_test_answer_option(line)
            if match_obj:
                setTestLabels.add(match_obj.group(1))

    # Two-way mapping between label words and consecutive integer indices.
    # The three special labels come last: sentence start, sentence end, and
    # the catch-all for non-label words.
    #
    # TODO: make the test labels stable (sort?)
    # NOTE(review): the indices follow set iteration order, which for str
    # elements can differ between interpreter runs when hash randomization is
    # enabled. They must match the indices the loaded model was trained with,
    # so the order is deliberately left as-is here -- confirm against the
    # training script before sorting.
    labelindx2word = dict()
    word2labelindx = dict()
    for label in list(setTestLabels) + ["<s>", "</s>", OTHER_LABEL]:
        label_indx = len(labelindx2word)
        labelindx2word[label_indx] = label
        word2labelindx[label] = label_indx

    vocsize = len(model.vocab)
    nclasses = len(labelindx2word)

    # Instantiate the model and restore its trained parameters.
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = RNNModel.RNNModel(nh=s['nhidden'],
                            nc=nclasses,
                            ne=vocsize,
                            de=s['emb_dimension'],
                            cs=s['win'])
    rnn.load(folder)

    # Read the cleaned test file: one space-separated sentence per line.
    with open('testing_data.cleaned.txt') as f:
        test_questions = [row.rstrip().split(' ') for row in f]

    # Score the sentences one by one.
    test_question_probs = []
    other_indx = word2labelindx[OTHER_LABEL]
    for answer_option in test_questions:
        x_fvec = []
        for term in answer_option:
            # Convert the word to its feature vector.
            if term in model:
                x_fvec.append(model[term])
            else:
                # For instance: 'good-humoured' ==> 'good'
                try:
                    x_fvec.append(model[term.split('-')[0]])
                except KeyError:
                    # Same hack as in the training process.
                    x_fvec.append(model['some'])

        # Map each word to its label index (catch-all for non-labels), then
        # shift by one position: drop the first entry and append the
        # end-of-sentence label so entry i is the label of word i + 1.
        labels = [word2labelindx.get(term, other_indx)
                  for term in answer_option]
        labels = labels[1:] + [word2labelindx["</s>"]]

        cwords = contextwin(x_fvec, s['win'], model["<s>"], model["</s>"])
        prediction_test = rnn.test(numpy.asarray(cwords).astype('float32'))

        # Keep, for every position, only the output for the expected label.
        prediction_test2 = [prediction_test[term_indx, label]
                            for term_indx, label in enumerate(labels)]

        test_question_probs.append(prediction_test2)
        print(','.join([str(p) for p in prediction_test2]))