forked from qiaojingy/SNLI
-
Notifications
You must be signed in to change notification settings - Fork 0
/
util.py
135 lines (112 loc) · 4.08 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import json
import numpy as np
import os

# Candidate locations for the pretrained Google News word2vec binary;
# the first path that exists wins (local dev box vs. cluster scratch dir).
WORDVECTORS_PATH = ["/Users/david/Documents/Stanford/SNLI/data/GoogleNews-vectors-negative300.bin", "/juicier/scr100/scr/qiaojing/snli/data/GoogleNews-vectors-negative300.bin"]
# Dimensionality of the pretrained vectors (GoogleNews vectors are 300-d).
WORDVECTOR_DIM = 300

# Load the word2vec model once at import time; every helper below reads
# the module-level `model`.
from gensim.models import word2vec

print("Loading word vectors ...")
model = None
for path in WORDVECTORS_PATH:
    if os.path.exists(path):
        model = word2vec.Word2Vec.load_word2vec_format(path, binary=True)
        break
# `is None` (identity), not `== None`; also dropped a stray semicolon.
if model is None:
    print("Cannot find file for word vectors")
def get_wordvector(sentence):
    """Map a whitespace-split sentence to an array of word vectors.

    Each token is stripped of trailing/leading punctuation (",.!?:;")
    before lookup in the module-level `model`; tokens the model does not
    know are silently skipped.  Returns a numpy array of shape
    (n_known_words, WORDVECTOR_DIM) — or an empty array when no token
    is known.
    """
    vectors = []
    for token in sentence.split(' '):
        cleaned = token.strip(",.!?:;")
        try:
            vectors.append(model[cleaned])
        except KeyError:
            # Unknown words are simply dropped.
            continue
    return np.asarray(vectors)
import lasagne.init
def word_to_vector(word):
    """Return the pretrained vector for `word`, or a random fallback.

    Unknown words get a fresh sample from a Uniform(-0.05, 0.05)
    Lasagne initializer of length WORDVECTOR_DIM, so callers always
    receive a usable vector.
    """
    try:
        vector = model[word]
    except KeyError:
        # Initializer instances are callable; invoking `.__call__()`
        # explicitly (as the original did) is non-idiomatic.
        vector = lasagne.init.Uniform(0.05)(WORDVECTOR_DIM)
    return vector
def get_initwv_and_mask(vocab):
    """Build an initial embedding matrix and a per-word mask for `vocab`.

    For each word: if it exists in the pretrained model, its vector is
    used and the mask row is all zeros; otherwise a random
    Uniform(-0.05, 0.05) vector is used and the mask row is all ones.
    Presumably the mask marks which embeddings should be trained
    (1 = randomly initialized / trainable) — TODO confirm against the
    training code.

    Returns (initwv, mask), both float32 arrays of shape
    (len(vocab), WORDVECTOR_DIM).
    """
    initwv = []
    mask = []
    for word in vocab:
        try:
            vector = model[word]
            mask.append(np.zeros(WORDVECTOR_DIM))
        except KeyError:
            # Call the initializer directly instead of `.__call__()`.
            vector = lasagne.init.Uniform(0.05)(WORDVECTOR_DIM)
            mask.append(np.ones(WORDVECTOR_DIM))
        initwv.append(vector)
    initwv = np.asarray(initwv, dtype='float32')
    mask = np.asarray(mask, dtype='float32')
    return initwv, mask
def _keep_known_words(sentence):
    """Return `sentence` with every word the model does not know removed.

    Lookup strips punctuation (",.!?:;") but the original (unstripped)
    word is what gets kept in the output.
    """
    kept = []
    for word in sentence.split(' '):
        try:
            model[word.strip(",.!?:;")]
            kept.append(word)
        except KeyError:
            # Unknown words are dropped.
            pass
    return ' '.join(kept)


def remove_unknown_words(data):
    """Strip out-of-vocabulary words from both sentences of each entry.

    Mutates each dict in `data` in place, rewriting 'sentence1' and
    'sentence2'.  Returns None.

    # convert the first letter of sentence to lower case
    # currently not implemented
    """
    for entry in data:
        # The two sentences were filtered by duplicated inline loops in
        # the original; the logic now lives in _keep_known_words.
        entry['sentence1'] = _keep_known_words(entry['sentence1'])
        entry['sentence2'] = _keep_known_words(entry['sentence2'])
def process(data):
    """Convert SNLI entries into word-vector sequences and labels.

    Skips entries whose gold label is '-' and entries where either
    sentence yields no known word vectors.  Returns a tuple
    (X_prem, X_hypo, y): lists of premise vector arrays, hypothesis
    vector arrays, and integer labels.
    """
    label_to_num = {'neutral': 0, 'entailment': 1, 'contradiction': 2, '-': 3}
    X_prem, X_hypo, y = [], [], []
    for entry in data:
        gold_label = entry['gold_label']
        # Unlabeled entries carry no training signal — skip early.
        if gold_label == '-':
            continue
        # convert the first letter of sentence to lower case
        # currently not implemented
        premise_vecs = get_wordvector(entry['sentence1'])
        if premise_vecs.shape[0] == 0:
            continue
        hypothesis_vecs = get_wordvector(entry['sentence2'])
        if hypothesis_vecs.shape[0] == 0:
            continue
        X_prem.append(premise_vecs)
        X_hypo.append(hypothesis_vecs)
        y.append(label_to_num[gold_label])
    return (X_prem, X_hypo, y)
def get_input_matrices(batch_data):
    """Turn a batch of SNLI entries into padded matrices plus LSTM masks.

    Returns (X_prem, X_prem_mask, X_hypo, X_hypo_mask, y) where the X
    arrays have shape (batch, max_len, WORDVECTOR_DIM), padded with
    zeros to the longest sentence in the batch, and the masks are
    (batch, max_len) with 1 on real timesteps and 0 on padding
    (the mask format Lasagne's LSTM layer expects).
    """
    X_prem, X_hypo, y = process(batch_data)
    batch_size = len(X_prem)
    # Guard: max() over an empty list raises ValueError when every entry
    # of the batch was filtered out by process().  Return consistently
    # shaped empty arrays instead.
    if batch_size == 0:
        empty_3d = np.zeros((0, 0, WORDVECTOR_DIM))
        empty_2d = np.zeros((0, 0))
        return empty_3d, empty_2d, np.zeros((0, 0, WORDVECTOR_DIM)), np.zeros((0, 0)), np.asarray([])
    # Maximum length of premise sentence
    MAX_LENGTH_PREM = max(len(entry) for entry in X_prem)
    # Maximum length of hypothesis sentence
    MAX_LENGTH_HYPO = max(len(entry) for entry in X_hypo)
    # Mask is used in Lasagne LSTM layer
    X_prem_mask = np.zeros((batch_size, MAX_LENGTH_PREM))
    X_hypo_mask = np.zeros((batch_size, MAX_LENGTH_HYPO))
    for i in range(batch_size):
        X_prem_mask[i, :len(X_prem[i])] = 1
        # Zero-pad each sentence along the time axis to the batch max.
        X_prem[i] = np.pad(X_prem[i], [(0, MAX_LENGTH_PREM - len(X_prem[i])), (0, 0)], 'constant')
    for i in range(batch_size):
        X_hypo_mask[i, :len(X_hypo[i])] = 1
        X_hypo[i] = np.pad(X_hypo[i], [(0, MAX_LENGTH_HYPO - len(X_hypo[i])), (0, 0)], 'constant')
    X_prem = np.asarray(X_prem)
    X_hypo = np.asarray(X_hypo)
    y = np.asarray(y)
    return X_prem, X_prem_mask, X_hypo, X_hypo_mask, y