# loader.py — Wiki corpus batch loaders for word2vec-style training.
import numpy as np
import random
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.corpora.dictionary import Dictionary
import queue
import threading
from concurrent.futures import ThreadPoolExecutor
import util
class WikiAllData():
def __init__(self, corpus, wiki_dict, wordfile, vocab_size=200000, window_size=5):
self.w2id_dict = util.load_worddict(wordfile, vocab_size)
self.window_size = window_size
print('Starting loading Wiki Corpus...', end='')
wiki_d = Dictionary.load(wiki_dict)
self.wiki_corpus = WikiCorpus(corpus, dictionary=wiki_d)
print('[done]')
def batch_generator(self, batch_size=128):
QUEUE_END = '__QUEUE_END105834569xx'
def load(q, batch_size):
text_gen = self.wiki_corpus.get_texts()
input_batch = np.zeros(batch_size, dtype=np.int32)
label_batch = np.zeros((batch_size, 1), dtype=np.int32)
counter = 0
#w_vec : list of words in a document
for w_vec in text_gen:
id_vec = [self.w2id_dict[w] if w in self.w2id_dict else -1 for w in w_vec]
init_mid = window = random.randint(2,self.window_size)
# sliding window center go through the w_vec
for mid_idx in range(init_mid, len(id_vec)-window-1):
start_idx = max(0,mid_idx-window)
end_idx = min(mid_idx+window, len(id_vec)-1)
# go through window
for target_idx in range(start_idx, end_idx+1):
if target_idx == mid_idx:
continue
if id_vec[target_idx] == -1:
continue
input_batch[counter] = id_vec[mid_idx]
label_batch[counter] = id_vec[target_idx]
if counter == batch_size - 1:
q.put((input_batch, label_batch))
input_batch = np.zeros(batch_size, dtype=np.int32)
label_batch = np.zeros((batch_size, 1), dtype=np.int32)
counter = 0
else:
counter += 1
window = random.randint(2, self.window_size)
q.put(QUEUE_END)
q = queue.Queue(maxsize=500)
t = threading.Thread(target=load, args=(q, batch_size))
t.daemon = True
t.start()
while True:
q_output = q.get()
if q_output == QUEUE_END:
break
yield q_output
class Wiki9Data():
def __init__(self, corpus, w2id_dict, w2feq_dict, window_size=5):
self.w2id_dict = w2id_dict
self.wid2feq_dict = w2feq_dict
self.window_size = window_size
print('Starting loading Wiki Corpus...', end='')
self.wiki_corpus = self._load_corpus(corpus)
print('[done]')
def _load_corpus(self, corpus):
with open(corpus, 'r') as f:
line = f.readlines()[0].split()
id_vec = [self.w2id_dict[w] if w in self.w2id_dict else -1 for w in line]
return id_vec
def batch_generator(self, model_type='skipgram', batch_size=128):
QUEUE_END = '__QUEUE_END105834569xx'
def load(q, batch_size):
input_batch = np.zeros(batch_size, dtype=np.int32)
label_batch = np.zeros((batch_size, 1), dtype=np.int32)
counter = 0
corpus_len = len(self.wiki_corpus)
window = random.randint(2,self.window_size)
# sliding window center go through the w_vec
idx_queue = [i for i in range(corpus_len)]
random.shuffle(idx_queue)
for mid_idx in idx_queue:
if self.wiki_corpus[mid_idx] == -1:
continue
subsampling_prob = 1. - np.sqrt(0.00001 / self.wid2feq_dict[self.wiki_corpus[mid_idx]])
if random.random() < subsampling_prob:
continue
start_idx = max(0,mid_idx-window)
end_idx = min(mid_idx+window, corpus_len-1)
# go through window
for target_idx in range(start_idx, end_idx+1):
if target_idx == mid_idx:
continue
if self.wiki_corpus[target_idx] == -1:
continue
if model_type == 'skipgram':
input_batch[counter] = self.wiki_corpus[mid_idx]
label_batch[counter] = self.wiki_corpus[target_idx]
elif model_type == 'cbow':
input_batch[counter] = self.wiki_corpus[target_idx]
label_batch[counter] = self.wiki_corpus[mid_idx]
if counter == batch_size - 1:
q.put((input_batch, label_batch))
input_batch = np.zeros(batch_size, dtype=np.int32)
label_batch = np.zeros((batch_size, 1), dtype=np.int32)
counter = 0
else:
counter += 1
window = random.randint(2, self.window_size)
q.put(QUEUE_END)
q = queue.Queue(maxsize=500)
t = threading.Thread(target=load, args=(q, batch_size))
t.daemon = True
t.start()
while True:
q_output = q.get()
if q_output == QUEUE_END:
break
yield q_output
def get_testdata(self):
vec_A = np.array([self.w2id_dict[w] for w in ['king', 'men', 'son', 'great', 'possibly']], dtype=np.int32)
vec_B = np.array([self.w2id_dict[w] for w in ['man', 'man', 'daughter', 'greater', 'impossibly']], dtype=np.int32)
vec_C = np.array([self.w2id_dict[w] for w in ['woman', 'woman', 'granddaughter', 'tougher', 'unethical']], dtype=np.int32)
vec_D = np.array([self.w2id_dict[w] for w in ['queen', 'car', 'grandson', 'tough', 'ethical']], dtype=np.int32)
#vec_A = np.array([self.w2id_dict[w] for w in [b'king', b'men', b'son', b'great', b'possibly']], dtype=np.int32)
#vec_B = np.array([self.w2id_dict[w] for w in [b'man', b'man', b'daughter', b'greater', b'impossibly']], dtype=np.int32)
#vec_C = np.array([self.w2id_dict[w] for w in [b'woman', b'woman', b'granddaughter', b'tougher', b'unethical']], dtype=np.int32)
#vec_D = np.array([self.w2id_dict[w] for w in [b'queen', b'car', b'grandson', b'tough', b'ethical']], dtype=np.int32)
return vec_A, vec_B, vec_C, vec_D