def eval_perplexity(model, corpus, batch_size=10, time_size=35):
    print('evaluating perplexity ...')
    corpus_size = len(corpus)
    total_loss = 0
    max_iters = (corpus_size - 1) // (batch_size * time_size)
    jump = (corpus_size - 1) // batch_size

    for iters in range(max_iters):
        xs = np.zeros((batch_size, time_size), dtype=np.int32)
        ts = np.zeros((batch_size, time_size), dtype=np.int32)
        time_offset = iters * time_size
        offsets = [time_offset + (i * jump) for i in range(batch_size)]
        for t in range(time_size):
            for i, offset in enumerate(offsets):
                xs[i, t] = corpus[(offset + t) % corpus_size]
                ts[i, t] = corpus[(offset + t + 1) % corpus_size]

        # Some models take a train_flg argument, others do not
        try:
            loss = model.forward(xs, ts, train_flg=False)
        except TypeError:
            loss = model.forward(xs, ts)
        total_loss += loss

        sys.stdout.write('\r%d / %d' % (iters, max_iters))
        sys.stdout.flush()

    print('')
    ppl = np.exp(total_loss / max_iters)
    return ppl
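# Minimal usage sketch for eval_perplexity (the DummyModel below is hypothetical
# and only stands in for a trained language model such as Rnnlm; numpy and sys
# are imported here because eval_perplexity itself relies on them):
import sys
import numpy as np

class DummyModel:
    # Returns a constant loss so the sketch runs without trained weights.
    def forward(self, xs, ts, train_flg=False):
        return 1.0

demo_corpus = np.arange(1000) % 50   # fake corpus of word IDs
ppl = eval_perplexity(DummyModel(), demo_corpus, batch_size=4, time_size=10)
print(ppl)   # exp(mean loss) == e ** 1 ≈ 2.718 for the constant-loss dummy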
def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    # Initialize weights
    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')
    affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
    affine_b = np.zeros(V).astype('f')

    # Create layers
    self.layers = [
        TimeEmbedding(embed_W),
        TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
        TimeAffine(affine_W, affine_b)
    ]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.lstm_layer = self.layers[1]

    # Collect all weights and gradients into lists
    self.params, self.grads = [], []
    for layer in self.layers:
        self.params += layer.params
        self.grads += layer.grads
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    '''Search for similar words.

    :param query: query word (text)
    :param word_to_id: dictionary mapping words to word IDs
    :param id_to_word: dictionary mapping word IDs to words
    :param word_matrix: matrix of word vectors; each row is assumed to hold the vector of the corresponding word
    :param top: how many top-ranked results to display
    '''
    # Retrieve the query vector
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    # Compute the cosine similarity to every word in the vocabulary
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)

    # Print the results in descending order of cosine similarity
    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))

        count += 1
        if count >= top:
            return
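# Minimal usage sketch for most_similar (assumes cos_similarity from the same
# utility module is in scope; the toy vocabulary and vectors below are made up):
import numpy as np

demo_id_to_word = {0: 'you', 1: 'i', 2: 'hello'}
demo_word_to_id = {w: i for i, w in demo_id_to_word.items()}
demo_matrix = np.array([[1.0, 0.0],
                        [0.9, 0.1],
                        [0.0, 1.0]])
most_similar('you', demo_word_to_id, demo_id_to_word, demo_matrix, top=2)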
def create_co_matrix(corpus, vocab_size, window_size=1):
    '''Create a co-occurrence matrix.

    :param corpus: corpus (list of word IDs)
    :param vocab_size: vocabulary size
    :param window_size: window size (a window size of 1 means one word to the left and right is the context)
    :return: co-occurrence matrix
    '''
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1

    return co_matrix
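# Self-contained example: co-occurrence counts for the word-ID sequence of
# "you say goodbye and i say hello ." (IDs 0..6, 'say' appearing twice as ID 1):
import numpy as np

demo_corpus = np.array([0, 1, 2, 3, 4, 1, 5, 6])
C = create_co_matrix(demo_corpus, vocab_size=7, window_size=1)
print(C[1])   # row for 'say': co-occurs once each with 'you', 'goodbye', 'i', 'hello'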
def __init__(self, vocab_size, wordvec_size, hidden_size):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')
    affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
    affine_b = np.zeros(V).astype('f')

    self.embed = TimeEmbedding(embed_W)
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
    self.affine = TimeAffine(affine_W, affine_b)

    self.params, self.grads = [], []
    for layer in (self.embed, self.lstm, self.affine):
        self.params += layer.params
        self.grads += layer.grads
def forward(self, xs):
    Wx, Wh, b = self.params
    N, T, _ = xs.shape
    H = Wh.shape[0]

    self.layers = []
    hs = np.empty((N, T, H), dtype='f')

    # Reset the hidden and cell states unless the layer is stateful
    if (not self.stateful) or (self.h is None):
        self.h = np.zeros((N, H), dtype='f')
    if (not self.stateful) or (self.c is None):
        self.c = np.zeros((N, H), dtype='f')

    # Run one LSTM step per time slice, carrying the state across steps
    for t in range(T):
        layer = LSTM(Wx, Wh, b)
        self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
        hs[:, t, :] = self.h

        self.layers.append(layer)

    return hs
def convert_one_hot(corpus, vocab_size):
    '''Convert to one-hot representation.

    :param corpus: list of word IDs (1-D or 2-D NumPy array)
    :param vocab_size: vocabulary size
    :return: one-hot representation (2-D or 3-D NumPy array)
    '''
    N = corpus.shape[0]

    if corpus.ndim == 1:
        one_hot = np.zeros((N, vocab_size), dtype=np.int32)
        for idx, word_id in enumerate(corpus):
            one_hot[idx, word_id] = 1

    elif corpus.ndim == 2:
        C = corpus.shape[1]
        one_hot = np.zeros((N, C, vocab_size), dtype=np.int32)
        for idx_0, word_ids in enumerate(corpus):
            for idx_1, word_id in enumerate(word_ids):
                one_hot[idx_0, idx_1, word_id] = 1

    return one_hot
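# Self-contained example: convert a 1-D array of target word IDs and a 2-D array
# of contexts to their one-hot representations:
import numpy as np

target = np.array([1, 2, 0])
contexts = np.array([[0, 2], [1, 0], [2, 1]])
print(convert_one_hot(target, vocab_size=3).shape)     # (3, 3)
print(convert_one_hot(contexts, vocab_size=3).shape)   # (3, 2, 3)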
def __init__(self, vocab_size, wordvec_size, hidden_size):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')

    self.embed = TimeEmbedding(embed_W)
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

    self.params = self.embed.params + self.lstm.params
    self.grads = self.embed.grads + self.lstm.grads
    self.hs = None
def forward(self, h, target):
    batch_size = target.shape[0]
    negative_sample = self.sampler.get_negative_sample(target)

    # Forward pass for the positive example
    score = self.embed_dot_layers[0].forward(h, target)
    correct_label = np.ones(batch_size, dtype=np.int32)
    loss = self.loss_layers[0].forward(score, correct_label)

    # Forward pass for the negative examples
    negative_label = np.zeros(batch_size, dtype=np.int32)
    for i in range(self.sample_size):
        negative_target = negative_sample[:, i]
        score = self.embed_dot_layers[1 + i].forward(h, negative_target)
        loss += self.loss_layers[1 + i].forward(score, negative_label)

    return loss
def __init__(self, corpus, power, sample_size):
    self.sample_size = sample_size
    self.vocab_size = None
    self.word_p = None

    # Count how often each word ID appears in the corpus
    counts = collections.Counter()
    for word_id in corpus:
        counts[word_id] += 1

    vocab_size = len(counts)
    self.vocab_size = vocab_size

    # Build the sampling distribution: raise the unigram counts to `power`
    # and normalize so the probabilities sum to 1
    self.word_p = np.zeros(vocab_size)
    for i in range(vocab_size):
        self.word_p[i] = counts[i]

    self.word_p = np.power(self.word_p, power)
    self.word_p /= np.sum(self.word_p)
def get_negative_sample(self, target):
    batch_size = target.shape[0]

    if not GPU:
        negative_sample = np.zeros((batch_size, self.sample_size), dtype=np.int32)

        for i in range(batch_size):
            p = self.word_p.copy()
            target_idx = target[i]
            p[target_idx] = 0
            p /= p.sum()
            negative_sample[i, :] = np.random.choice(self.vocab_size, size=self.sample_size,
                                                     replace=False, p=p)
    else:
        # When computing on the GPU (cupy), prioritize speed;
        # the negative samples may therefore include the target itself
        negative_sample = np.random.choice(self.vocab_size, size=(batch_size, self.sample_size),
                                           replace=True, p=self.word_p)

    return negative_sample
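# Minimal usage sketch for UnigramSampler (in the repository the GPU flag comes
# from the shared numpy/cupy config module; it is set explicitly here so the
# sketch runs standalone):
import collections
import numpy as np

GPU = False
demo_corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3, 1, 0])
sampler = UnigramSampler(demo_corpus, power=0.75, sample_size=2)
target = np.array([1, 3, 0])
print(sampler.get_negative_sample(target))   # (3, 2) array of negative word IDs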