def main():
    batch_size = 20
    wordvec_size = 100
    hidden_size = 100
    time_size = 35
    lr = 20.0
    # max_epoch = 4
    max_epoch = 1
    max_grad = 0.25

    corpus, word_to_id, _ = ptb.load_data('train')
    corpus_test, _, _ = ptb.load_data('test')
    vocab_size = len(word_to_id)
    xs = corpus[:-1]
    ts = corpus[1:]

    model = Rnnlm(vocab_size, wordvec_size, hidden_size)
    optimizer = SGD(lr)
    trainer = RnnlmTrainer(model, optimizer)

    trainer.fit(xs, ts, max_epoch, batch_size, time_size, max_grad,
                eval_interval=20)

    model.reset_state()
    ppl_test = eval_perplexity(model, corpus_test)
    print(f'test perplexity: {ppl_test}')

    model.save_params()
    print('DONE')
def read_data(self, path):
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_test, _, _ = ptb.load_data('test')
    self.vocab_size = len(word_to_id)
    self.corpus = corpus
    self.word_to_id = word_to_id
    self.id_to_word = id_to_word
    self.xs = self.corpus[:-1]
    self.ts = self.corpus[1:]
def main():
    batch_size = 20
    wordvec_size = 650
    hidden_size = 650
    time_size = 35
    lr = 20.0
    # max_epoch = 40
    max_epoch = 1
    max_grad = 0.25
    dropout = 0.5

    corpus, word_to_id, _ = ptb.load_data('train')
    corpus_val, _, _ = ptb.load_data('val')
    corpus_test, _, _ = ptb.load_data('test')
    vocab_size = len(word_to_id)
    xs = corpus[:-1]
    ts = corpus[1:]

    model = BetterRnnlm(vocab_size, wordvec_size, hidden_size, dropout)
    optimizer = SGD(lr)
    trainer = RnnlmTrainer(model, optimizer)

    best_ppl = float('inf')
    for _ in range(max_epoch):
        trainer.fit(xs, ts, 1, batch_size, time_size, max_grad,
                    eval_interval=20)

        model.reset_state()
        ppl = eval_perplexity(model, corpus_val)
        print(f'valid perplexity: {ppl}')

        if best_ppl > ppl:
            best_ppl = ppl
            model.save_params()
        else:
            # no improvement on the validation set: decay the learning rate
            lr /= 4.0
            optimizer.lr = lr

        model.reset_state()
        print('-' * 50)

    model.reset_state()
    ppl_test = eval_perplexity(model, corpus_test)
    print(f'test perplexity: {ppl_test}')
    print('DONE')
def main() -> None:
    window_size = 5
    hidden_size = 100
    batch_size = 100
    max_epoch = 10

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    contexts, target = create_context_target(corpus, window_size)

    model = CBOW(vocab_size, hidden_size, window_size, corpus)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)
    trainer.fit(contexts, target, max_epoch, batch_size)
    # trainer.plot()

    word_vecs = model.word_vecs
    params = {
        'word_vecs': word_vecs.astype(np.float16),
        'word_to_id': word_to_id,
        'id_to_word': id_to_word
    }
    with open('cbow_params.pkl', 'wb') as f:
        pickle.dump(params, f, -1)
def test_count_method_big():
    """Run the count-based pipeline on the PTB corpus: co-occurrence matrix,
    positive pointwise mutual information matrix, and SVD dimensionality reduction.
    Full SVD is O(N^3), so the fast scikit-learn version is used when available.
    """
    window_size = 2
    wordvec_size = 100

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)

    print('counting co-occurrence...')
    C = create_co_matrix(corpus, vocab_size, window_size)
    print('calculating PPMI...')
    W = ppmi(C, verbose=True)

    print('calculating SVD...')
    try:
        # truncated SVD (fast!)
        from sklearn.utils.extmath import randomized_svd
        U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                                 random_state=None)
    except ImportError:
        # SVD (slow)
        U, S, V = np.linalg.svd(W)

    word_vecs = U[:, :wordvec_size]
    queries = ['you', 'year', 'car', 'toyota']
    for query in queries:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
def main():
    # Hyperparameters
    window_size = 5
    hidden_size = 100
    batch_size = 100
    max_epoch = 10

    # Load the data
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    contexts, target = create_contexts_target(corpus, window_size)

    # Build the model, optimizer, and trainer
    model = CBOW(vocab_size, hidden_size, window_size, corpus)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    # Train
    trainer.fit(contexts, target, max_epoch, batch_size)
    trainer.plot()

    # Save what is needed for later use
    word_vecs = model.word_vecs
    params = {}
    params['word_vecs'] = word_vecs.astype(np.float16)
    params['word_to_id'] = word_to_id
    params['id_to_word'] = id_to_word
    pkl_file = 'cbow_params.pkl'
    with open(pkl_file, 'wb') as f:
        pickle.dump(params, f, -1)
def main():
    # Hyperparameters
    batch_size = 20
    wordvec_size = 650
    hidden_size = 650
    time_size = 35
    lr = 20.0
    max_epoch = 40
    max_grad = 0.25
    dropout = 0.5

    # Load the training data
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_val, _, _ = ptb.load_data('val')
    corpus_test, _, _ = ptb.load_data('test')
    vocab_size = len(word_to_id)
    xs = corpus[:-1]
    ts = corpus[1:]

    model = BetterRnnlm(vocab_size, wordvec_size, hidden_size, dropout)
    optimizer = SGD(lr)
    trainer = RnnlmTrainer(model, optimizer)

    best_ppl = float('inf')
    for epoch in range(max_epoch):
        trainer.fit(xs, ts, max_epoch=1, batch_size=batch_size,
                    time_size=time_size, max_grad=max_grad)

        model.reset_state()
        ppl = eval_perplexity(model, corpus_val)
        print('valid perplexity: ', ppl)

        if best_ppl > ppl:
            best_ppl = ppl
            model.save_params()
        else:
            # no improvement on the validation set: decay the learning rate
            lr /= 4.0
            optimizer.lr = lr

        model.reset_state()
        print('-' * 50)
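# Illustration only (not part of the original script): the lr /= 4.0 schedule
# above quarters the learning rate every epoch whose validation perplexity
# fails to improve, e.g. starting from lr = 20.0:
#
#   lr = 20.0
#   for _ in range(3):
#       lr /= 4.0   # 5.0, 1.25, 0.3125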
def main():
    # Hyperparameters
    batch_size = 20
    wordvec_size = 100
    hidden_size = 100  # number of elements in the RNN's hidden-state vector
    time_size = 35     # number of steps the RNN is unrolled for
    lr = 20.0
    max_epoch = 4
    max_grad = 0.25

    # Load the training data
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_test, _, _ = ptb.load_data('test')
    vocab_size = len(word_to_id)
    xs = corpus[:-1]
    ts = corpus[1:]

    # Build the model
    model = Rnnlm(vocab_size, wordvec_size, hidden_size)
    optimizer = SGD(lr)
    trainer = RnnlmTrainer(model, optimizer)

    # Train with gradient clipping applied;
    # eval_interval=20 evaluates perplexity every 20 iterations
    trainer.fit(xs, ts, max_epoch, batch_size, time_size, max_grad,
                eval_interval=20)
    trainer.plot(ylim=(0, 500))

    # Evaluate on the test data
    model.reset_state()
    ppl_test = eval_perplexity(model, corpus_test)
    print('test perplexity: ', ppl_test)

    # Save the parameters
    model.save_params()
def main():
    batch_size = 10
    wordvec_size = 100
    hidden_size = 100
    time_size = 5
    lr = 0.1
    max_epoch = 100

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_size = 1000
    corpus = corpus[:corpus_size]
    vocab_size = int(max(corpus) + 1)

    xs = corpus[:-1]
    ts = corpus[1:]
    data_size = len(xs)
    print(f'corpus size: {corpus_size}, vocabulary size: {vocab_size}')

    max_iters = data_size // (batch_size * time_size)
    time_idx = 0
    total_loss = 0
    loss_count = 0
    ppl_list = []

    model = SimpleRnnlm(vocab_size, wordvec_size, hidden_size)
    optimizer = SGD(lr)

    # starting offset of each sample in the mini-batch
    jump = (corpus_size - 1) // batch_size
    offsets = [i * jump for i in range(batch_size)]

    for epoch in range(1, max_epoch + 1):
        for iter_ in range(max_iters):
            batch_x = np.empty((batch_size, time_size), dtype=int)
            batch_t = np.empty((batch_size, time_size), dtype=int)
            for t in range(time_size):
                for i, offset in enumerate(offsets):
                    batch_x[i, t] = xs[(offset + time_idx) % data_size]
                    batch_t[i, t] = ts[(offset + time_idx) % data_size]
                time_idx += 1

            loss = model.forward(batch_x, batch_t)
            model.backward()
            optimizer.update(model.params, model.grads)
            total_loss += loss
            loss_count += 1

        ppl = np.exp(total_loss / loss_count)
        print(f'| epoch {epoch} | perplexity {ppl}')
        ppl_list.append(float(ppl))
        total_loss, loss_count = 0, 0

    print('DONE')
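# Worked example of the offset scheme above (for illustration, not in the
# original script): with corpus_size = 1000 and batch_size = 10,
# jump = (1000 - 1) // 10 = 99, so the samples start at
# offsets = [0, 99, 198, ..., 891] and each sample reads the corpus
# sequentially from its own starting position, wrapping around data_size.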
def main(word):
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    corpus_size = len(corpus)

    model = RnnlmGen()

    # Set the start word and the words to skip
    start_word = word
    start_id = word_to_id[start_word]
    skip_words = ['N', '<unk>', '$']
    skip_ids = [word_to_id[w] for w in skip_words]

    # Generate a sentence
    word_ids = model.generate(start_id, skip_ids=skip_ids)
    txt = ' '.join([id_to_word[i] for i in word_ids])
    txt = txt.replace(' <eos>', '.\n')
    print(txt)
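# Hypothetical usage sketch (assumes this module is run directly and that
# RnnlmGen either starts from random weights or loads pretrained parameters
# elsewhere):
#
#   if __name__ == '__main__':
#       main('you')   # generate a sentence starting from "you"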
def test_show_ptb():
    """Display a small part of the Penn Treebank corpus.
    A conveniently sized corpus that can be downloaded from the web page of
    Tomas Mikolov, the author of word2vec.
    """
    corpus, word_to_id, id_to_word = ptb.load_data('train')

    print('corpus size: ', len(corpus))
    print('corpus[:30]: ', corpus[:30])
    print()
    print('id_to_word[0]: ', id_to_word[0])
    print('id_to_word[1]: ', id_to_word[1])
    print('id_to_word[2]: ', id_to_word[2])
    print()
    print("word_to_id['car']: ", word_to_id['car'])
    print("word_to_id['happy']: ", word_to_id['happy'])
    print("word_to_id['lexus']: ", word_to_id['lexus'])
def main():
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    corpus_size = len(corpus)

    model = RnnlmGen()
    # model.load_params('Rnnlm.pkl')

    start_word = 'you'
    start_id = word_to_id[start_word]
    skip_words = ['N', '<unk>', '$']
    skip_ids = [word_to_id[w] for w in skip_words]

    word_ids = model.generate(start_id, skip_ids)
    txt = ' '.join([id_to_word[i] for i in word_ids])
    txt = txt.replace(' <eos>', '.\n')
    print(txt)
    print('DONE')
def main():
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    corpus_size = len(corpus)

    model = RnnlmGen()
    # model.load_params('../ch06_RNNWithGate/Rnnlm.pkl')

    # Set the start word and the words to skip
    start_word = 'you'
    start_id = word_to_id[start_word]
    skip_words = ['N', '<unk>', '$']
    skip_ids = [word_to_id[w] for w in skip_words]

    # Generate text (sample size 100)
    word_ids = model.generate(start_id, skip_ids)
    txt = ' '.join([id_to_word[i] for i in word_ids])
    txt = txt.replace(' <eos>', '.\n')
    print(txt)
def main():
    window_size = 2
    wordvec_size = 100

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)

    print('counting co-occurrence...')
    co_matrix = create_co_matrix(corpus, vocab_size, window_size)
    print('calculating PPMI...')
    W = ppmi(co_matrix, verbose=True)

    print('calculating SVD ...')
    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                             random_state=42)
    word_vecs = U[:, :wordvec_size]

    queries = ['you', 'year', 'car', 'toyota']
    for query in queries:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)

    print('DONE')
# ↑ when running on GPU (config.GPU is assumed to be set above this snippet)
import pickle

from common import config  # added: config.GPU is referenced below
from common.trainer import Trainer
from common.optimizer import Adam
from cbow import CBOW
from common.util import create_contexts_target, to_cpu, to_gpu
from dataset import ptb

# Hyperparameters
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Load the data
corpus, wordtoid, idtoword = ptb.load_data('train')
vocab_size = len(wordtoid)

contexts, target = create_contexts_target(corpus, window_size)
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)

# Build the model, optimizer, and trainer
model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Train
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()
import sys
sys.path.append("..")

import numpy as np

from common.util import most_similar, create_co_matrix, ppmi
from dataset import ptb

window_size = 2
wordvec_size = 100

corpus, word_to_id, id_to_word = ptb.load_data("train")
vocab_size = len(word_to_id)

print("counting co-occurrence ...")
C = create_co_matrix(corpus, vocab_size, window_size)
print("calculating PPMI ...")
W = ppmi(C, verbose=True)

print("calculating SVD ...")
try:
    # truncated SVD (fast edition)
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                             random_state=None)
except ImportError:
    # normal SVD (slow)
    U, S, V = np.linalg.svd(W)

word_vecs = U[:, :wordvec_size]
queries = ["you", "year", "car", "toyota", "ferrari"]
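# The snippet above stops after defining the query words. A minimal completion,
# mirroring the other count-based scripts in this collection, would be:
for query in queries:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)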
# coding: utf-8
import sys
sys.path.append('..')

from ch06.rnnlm import Rnnlm
from ch06.better_rnnlm import BetterRnnlm
from dataset import ptb
from common.util import eval_perplexity

if __name__ == '__main__':
    model = Rnnlm()
    # model = BetterRnnlm()

    # Load the trained parameters
    model.load_params()

    corpus, _, _ = ptb.load_data('test')

    model.reset_state()
    ppl_test = eval_perplexity(model, corpus)
    print('test perplexity: ', ppl_test)
def get_perplexity(model):
    from common.util import eval_perplexity

    corpus_val, _, _ = ptb.load_data('val')
    ppl = eval_perplexity(model, corpus_val)
    return ppl
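# Minimal usage sketch (assumes a trained Rnnlm-style model whose hidden state
# should be reset before evaluation, as in the other scripts here):
#
#   model.reset_state()
#   print('valid perplexity:', get_perplexity(model))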
from common.optimizer import SGD  # added: SGD is used below but was missing from the imports
from common.trainer import RnnlmTrainer
from common.util import eval_perplexity
from dataset import ptb
from rnnlm import Rnnlm

# Hyperparameters
batch_size = 20
wordvec_size = 100
hidden_size = 100
time_size = 35
lr = 20
max_epoch = 4
max_grad = 0.25

# Load the training data
corpus, word_to_id, id_to_word = ptb.load_data("train")
corpus_test, _, _ = ptb.load_data("test")
vocab_size = len(word_to_id)
xs = corpus[:-1]
ts = corpus[1:]

# Build the model
model = Rnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

# Train with gradient clipping applied
trainer.fit(xs, ts, max_epoch, batch_size,
            time_size, max_grad, eval_interval=20)
def main():
    # Hyperparameters
    batch_size = 10
    wordvec_size = 100
    hidden_size = 500
    time_size = 5  # number of time steps Truncated BPTT unrolls
    lr = 0.1
    max_epoch = 100

    # Load the training dataset
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_size = 1000
    corpus = corpus[:corpus_size]
    vocab_size = int(max(corpus) + 1)
    print("corpus: ", corpus)
    print("corpus size: ", len(corpus))

    xs = corpus[:-1]  # inputs
    ts = corpus[1:]   # teacher labels = ID of the word following each input
    print("corpus size: %d, vocabulary size: %d" % (corpus_size, vocab_size))

    data_size = len(xs)
    print("data_size: ", data_size)

    # Variables used during training
    max_iters = data_size // (batch_size * time_size)
    time_idx = 0
    total_loss = 0
    loss_count = 0
    ppl_list = []

    # Build the model
    model = SimpleRnnlm(vocab_size, wordvec_size, hidden_size)
    optimizer = SGD(lr)

    # (1) Compute the starting offset of each sample in the mini-batch
    jump = (corpus_size - 1) // batch_size  # e.g. corpus_size=1000, batch_size=20 -> (1000-1)//20 -> 49
    print("jump: ", jump)
    offsets = [i * jump for i in range(batch_size)]  # batch_size=20, jump=49 -> [0, 49, 98, ..., 931]
    print("offsets: ", offsets)

    # Training loop
    for epoch in range(max_epoch):
        for iter in range(max_iters):
            # (2) Fetch a mini-batch
            batch_x = np.empty((batch_size, time_size), dtype=np.int32)
            batch_t = np.empty((batch_size, time_size), dtype=np.int32)
            for t in range(time_size):
                for i, offset in enumerate(offsets):
                    print(f"offset: {offset}, time_idx: {time_idx} -> {(offset + time_idx) % data_size}")
                    batch_x[i, t] = xs[(offset + time_idx) % data_size]
                    batch_t[i, t] = ts[(offset + time_idx) % data_size]
                time_idx += 1  # never reset, so each iteration reads the corpus further from the offsets

            # Compute the gradients and update the parameters
            loss = model.forward(batch_x, batch_t)
            model.backward()
            optimizer.update(model.params, model.grads)
            total_loss += loss
            loss_count += 1

        # (3) Evaluate perplexity once per epoch
        ppl = np.exp(total_loss / loss_count)
        print('| epoch %d | perplexity %.2f' % (epoch + 1, ppl))
        ppl_list.append(float(ppl))
        total_loss, loss_count = 0, 0

    # Plot the results
    x = np.arange(len(ppl_list))
    plt.plot(x, ppl_list, label='train')
    plt.xlabel('epochs')
    plt.ylabel('perplexity')
    plt.show()
import pickle

import numpy as np
# Added imports, assuming the Keras API that the layer names below suggest:
from keras import backend as K
from keras.callbacks import TensorBoard
from keras.layers import Input, Embedding, Lambda

from util import create_contexts_target, most_similar
from dataset import ptb
from negative_sampling import generate_with_negative_sample

tensorboard_callback = TensorBoard(log_dir='logs/cbow')

# Hyperparameters
window_size = 10
hidden_size = 100
batch_size = 100
max_epoch = 15
sample_size = 5

corpus, word_to_id, id_to_word = ptb.load_data('train')
test_corpus = ptb.load_data('test')[0]
vocab_size = len(word_to_id)

contexts, target = create_contexts_target(corpus, window_size)
test_contexts, test_target = create_contexts_target(test_corpus, window_size)

# Model definition
contexts_input = Input(shape=(window_size * 2,), name='contexts_input')
target_input = Input(shape=(1,), name='target_input')
embed = Embedding(vocab_size, hidden_size, input_length=window_size * 2)
contexts_embed = embed(contexts_input)
contexts_hidden = Lambda(lambda arr: K.mean(arr, axis=1))(contexts_embed)
target_embed = Embedding(vocab_size, hidden_size, input_length=1)(target_input)
import pickle
import sys

import pandas as pd

from ivory.common.context import np
from ivory.common.dataset import TimeDataset
from ivory.core.trainer import sequential
from ivory.utils.repository import repo_directory

np.context = "gpu"
sys.path.append(repo_directory("scratch2"))
from dataset import ptb  # isort:skip

corpus, _, _ = ptb.load_data("train")
corpus_val, _, _ = ptb.load_data("val")
vocab_size = int(max(corpus) + 1)

x, t = corpus[:-1], corpus[1:]
data = TimeDataset((x, t), time_size=35, batch_size=20)
x, t = corpus_val[:-1], corpus_val[1:]
data_val = TimeDataset((x, t), time_size=35, batch_size=10)
print(data)
print(data_val)

# Set the hyperparameters
wordvec_size = 650
hidden_size = 650
lr = 20.0
max_grad = 0.25
dropout = 0.5
from common.optimizer import SGD  # added: SGD is used below but was missing from the imports
from common.trainer import RnnlmTrainer
from common.util import eval_perplexity
from dataset import ptb
from rnnlm import Rnnlm

# Hyperparameters
batch_size = 20
wordvec_size = 100
hidden_size = 100  # number of elements in the RNN's hidden-state vector
time_size = 35     # number of steps the RNN is unrolled for
lr = 20.0
max_epoch = 4
max_grad = 0.25

# Load the training data
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_test, _, _ = ptb.load_data('test')
vocab_size = len(word_to_id)
xs = corpus[:-1]
ts = corpus[1:]

# Build the model
model = Rnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

# Train with gradient clipping applied
trainer.fit(xs, ts, max_epoch, batch_size,
            time_size, max_grad, eval_interval=20)
trainer.plot(ylim=(0, 500))
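# For reference, max_grad=0.25 caps the global gradient norm during training.
# A minimal sketch of norm clipping (an assumption about how the trainer
# applies it, not necessarily the library's actual clip_grads implementation):
import numpy as np

def clip_grads_sketch(grads, max_norm):
    # rescale all gradients in place if their combined L2 norm exceeds max_norm
    total_norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    rate = max_norm / (total_norm + 1e-6)
    if rate < 1:
        for g in grads:
            g *= rate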
import sys
sys.path.append('..')

from common.optimizer import SGD
from common.trainer import RnnlmTrainer
from dataset import ptb
from ch05.simple_rnnlm import SimpleRnnlm

# Hyperparameters
batch_size = 10
wordvec_size = 100
hidden_size = 100  # number of elements in the RNN's hidden-state vector
time_size = 5      # number of steps the RNN is unrolled for
lr = 0.1
max_epoch = 100

# Load the training data
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_size = 1000  # shrink the dataset for testing
corpus = corpus[:corpus_size]
vocab_size = int(max(corpus) + 1)
xs = corpus[:-1]  # inputs
ts = corpus[1:]   # outputs (teacher labels)

# Build the model
model = SimpleRnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

trainer.fit(xs, ts, max_epoch, batch_size, time_size)
trainer.plot()
def setPTBData(self):
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    self.corpus = corpus
    self.word_to_id = word_to_id
    self.id_to_word = id_to_word
    self.n_words = len(word_to_id)
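# Hypothetical usage sketch (assumes setPTBData is a method of some holder
# class, here called PTBHolder purely for illustration):
#
#   holder = PTBHolder()
#   holder.setPTBData()
#   print(holder.n_words)  # vocabulary size of the PTB training set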