Example #1
def main():
    batch_size = 20
    wordvec_size = 100
    hidden_size = 100
    time_size = 35
    lr = 20.0
    #max_epoch = 4
    max_epoch = 1
    max_grad = 0.25

    corpus, word_to_id, _ = ptb.load_data('train')
    corpus_test, _, _ = ptb.load_data('test')
    vocab_size = len(word_to_id)
    xs = corpus[:-1]
    ts = corpus[1:]

    model = Rnnlm(vocab_size, wordvec_size, hidden_size)
    optimizer = SGD(lr)
    trainer = RnnlmTrainer(model, optimizer)

    trainer.fit(xs,
                ts,
                max_epoch,
                batch_size,
                time_size,
                max_grad,
                eval_interval=20)
    model.reset_state()

    ppl_test = eval_perplexity(model, corpus_test)
    print(f'test perplexity: {ppl_test}')

    model.save_params()
    print('DONE')
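
Note on eval_perplexity: the examples on this page call it from common/util without showing it. As a minimal sketch (not the library's exact code), assuming model.forward(xs, ts) returns the average cross-entropy loss of a batch, perplexity is the exponential of the mean loss over the evaluation corpus:

import numpy as np

def eval_perplexity_sketch(model, corpus, batch_size=10, time_size=35):
    # Feed the corpus through the model in (batch_size, time_size) chunks.
    corpus_size = len(corpus)
    total_loss, loss_count = 0.0, 0
    max_iters = (corpus_size - 1) // (batch_size * time_size)
    jump = (corpus_size - 1) // batch_size

    for iteration in range(max_iters):
        xs = np.zeros((batch_size, time_size), dtype=np.int32)
        ts = np.zeros((batch_size, time_size), dtype=np.int32)
        for t in range(time_size):
            for i in range(batch_size):
                idx = (i * jump + time_size * iteration + t) % (corpus_size - 1)
                xs[i, t] = corpus[idx]
                ts[i, t] = corpus[idx + 1]  # target = next word id
        total_loss += model.forward(xs, ts)
        loss_count += 1

    return np.exp(total_loss / loss_count)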
Example #2
def read_data(self, path):
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_test, _, _ = ptb.load_data('test')
    self.vocab_size = len(word_to_id)
    self.corpus = corpus
    self.word_to_id = word_to_id
    self.id_to_word = id_to_word
    self.xs = self.corpus[:-1]
    self.ts = self.corpus[1:]

def main():
    batch_size = 20
    wordvec_size = 650
    hidden_size = 650
    time_size = 35
    lr = 20.0
    #max_epoch = 40
    max_epoch = 1
    max_grad = 0.25
    dropout = 0.5

    corpus, word_to_id, _ = ptb.load_data('train')
    corpus_val, _, _ = ptb.load_data('val')
    corpus_test, _, _ = ptb.load_data('test')
    vocab_size = len(word_to_id)
    xs = corpus[:-1]
    ts = corpus[1:]

    model = BetterRnnlm(vocab_size, wordvec_size, hidden_size, dropout)
    optimizer = SGD(lr)
    trainer = RnnlmTrainer(model, optimizer)

    best_ppl = float('inf')
    for _ in range(max_epoch):
        trainer.fit(xs,
                    ts,
                    1,
                    batch_size,
                    time_size,
                    max_grad,
                    eval_interval=20)
        model.reset_state()

        ppl = eval_perplexity(model, corpus_val)
        print(f'valid perplexity: {ppl}')

        if best_ppl > ppl:
            best_ppl = ppl
            model.save_params()
        else:
            lr /= 4.0
            optimizer.lr = lr
        model.reset_state()
        print('-' * 50)

    model.reset_state()
    ppl_test = eval_perplexity(model, corpus_test)
    print(f'test perplexity: {ppl_test}')
    print('DONE')
Example #4
def main() -> None:
    window_size = 5
    hidden_size = 100
    batch_size = 100
    max_epoch = 10

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)

    contexts, target = create_context_target(corpus, window_size)

    model = CBOW(vocab_size, hidden_size, window_size, corpus)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    trainer.fit(contexts, target, max_epoch, batch_size)
    # trainer.plot()

    word_vecs = model.word_vecs
    params = {
        'word_vecs': word_vecs.astype(np.float16),
        'word_to_id': word_to_id,
        'id_to_word': id_to_word
    }
    with open('cbow_params.pkl', 'wb') as f:
        pickle.dump(params, f, -1)
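
A possible follow-up to Example #4 (sketched, assuming common.util.most_similar from the same repository is available): load cbow_params.pkl back in and query the learned word vectors.

import pickle
from common.util import most_similar  # assumed helper from the same repository

with open('cbow_params.pkl', 'rb') as f:
    params = pickle.load(f)

word_vecs = params['word_vecs']
word_to_id = params['word_to_id']
id_to_word = params['id_to_word']

for query in ['you', 'year', 'car', 'toyota']:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)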
Example #5
def test_count_method_big():
    """PTBコーパスデータセットに対して、
    共起行列、正の相互情報量行列、SVDによる次元削減を行う.

    SVDによる次元削減はO(N^3)なので、高速なscikit-learn版を使う.
    """
    window_size = 2
    wordvec_size = 100

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    print('counting co-occurrence...')
    C = create_co_matrix(corpus, vocab_size, window_size)
    print('calculating PPMI...')
    W = ppmi(C, verbose=True)

    print('calculating SVD ...')
    try:
        # truncated SVD (fast!)
        from sklearn.utils.extmath import randomized_svd
        U, S, V = randomized_svd(W,
                                 n_components=wordvec_size,
                                 n_iter=5,
                                 random_state=None)
    except ImportError:
        # SVD (slow)
        U, S, V = np.linalg.svd(W)

    word_vecs = U[:, :wordvec_size]

    querys = ['you', 'year', 'car', 'toyota']
    for query in querys:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
Example #6
def main():
    # Hyperparameter settings
    window_size = 5
    hidden_size = 100
    batch_size = 100
    max_epoch = 10

    # Load the data
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)

    contexts, target = create_contexts_target(corpus, window_size)

    # Create the model and related objects
    model = CBOW(vocab_size, hidden_size, window_size, corpus)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    # Start training
    trainer.fit(contexts, target, max_epoch, batch_size)
    trainer.plot()

    # Save the data needed so it can be used later
    word_vecs = model.word_vecs

    params = {}
    params['word_vecs'] = word_vecs.astype(np.float16)
    params['word_to_id'] = word_to_id
    params['id_to_word'] = id_to_word
    pkl_file = 'cbow_params.pkl'
    with open(pkl_file, 'wb') as f:
        pickle.dump(params, f, -1)
Example #7
def main():
    # Hyperparameter settings
    batch_size = 20
    wordvec_size = 650
    hidden_size = 650
    time_size = 35
    lr = 20.0
    max_epoch = 40
    max_grad = 0.25
    dropout = 0.5

    # Load the training data
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_val, _, _ = ptb.load_data('val')
    corpus_test, _, _ = ptb.load_data('test')

    vocab_size = len(word_to_id)
    xs = corpus[:-1]
    ts = corpus[1:]

    model = BetterRnnlm(vocab_size, wordvec_size, hidden_size, dropout)
    optimizer = SGD(lr)
    trainer = RnnlmTrainer(model, optimizer)

    best_ppl = float('inf')
    for epoch in range(max_epoch):
        trainer.fit(xs,
                    ts,
                    max_epoch=1,
                    batch_size=batch_size,
                    time_size=time_size,
                    max_grad=max_grad)

        model.reset_state()
        ppl = eval_perplexity(model, corpus_val)
        print('valid perplexity: ', ppl)

        if best_ppl > ppl:
            best_ppl = ppl
            model.save_params()
        else:
            lr /= 4.0
            optimizer.lr = lr

        model.reset_state()
        print('-' * 50)
Example #8
def main():
    # Hyperparameter settings
    batch_size = 20
    wordvec_size = 100
    hidden_size = 100  # number of elements in the RNN hidden-state vector
    time_size = 35  # number of time steps to unroll the RNN
    lr = 20.0
    max_epoch = 4
    max_grad = 0.25

    # Load the training data
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_test, _, _ = ptb.load_data('test')
    vocab_size = len(word_to_id)
    xs = corpus[:-1]
    ts = corpus[1:]

    # Create the model
    model = Rnnlm(vocab_size, wordvec_size, hidden_size)
    optimizer = SGD(lr)
    trainer = RnnlmTrainer(model, optimizer)

    # Train with gradient clipping applied
    trainer.fit(xs,
                ts,
                max_epoch,
                batch_size,
                time_size,
                max_grad,
                eval_interval=20)
    '''
    eval_interval=20
    evaluate the perplexity every 20 iterations
    '''
    trainer.plot(ylim=(0, 500))

    # Evaluate on the test data
    model.reset_state()
    ppl_test = eval_perplexity(model, corpus_test)
    print('test perplexity: ', ppl_test)

    # Save the parameters
    model.save_params()
Example #9
def main():
    batch_size = 10
    wordvec_size = 100
    hidden_size = 100
    time_size = 5
    lr = 0.1
    max_epoch = 100

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_size = 1000
    corpus = corpus[:1000]
    vocab_size = int(max(corpus) + 1)

    xs = corpus[:-1]
    ts = corpus[1:]
    data_size = len(xs)
    print(f'corpus size: {corpus_size}, vocabulary size: {vocab_size}')

    max_iters = data_size // (batch_size * time_size)
    time_idx = 0
    total_loss = 0
    loss_count = 0
    ppl_list = []

    model = SimpleRnnlm(vocab_size, wordvec_size, hidden_size)
    optimizer = SGD(lr)

    jump = (corpus_size - 1) // batch_size
    offsets = [i * jump for i in range(batch_size)]

    for epoch in range(1, max_epoch + 1):
        for iter_ in range(max_iters):
            batch_x = np.empty((batch_size, time_size), dtype=int)
            batch_t = np.empty((batch_size, time_size), dtype=int)
            for t in range(time_size):
                for i, offset in enumerate(offsets):
                    batch_x[i, t] = xs[(offset + time_idx) % data_size]
                    batch_t[i, t] = ts[(offset + time_idx) % data_size]
                time_idx += 1

            loss = model.forward(batch_x, batch_t)
            model.backward()
            optimizer.update(model.params, model.grads)
            total_loss += loss
            loss_count += 1

        ppl = np.exp(total_loss / loss_count)
        print(f'| epoch {epoch} | perplexity {ppl}')
        ppl_list.append(float(ppl))
        total_loss, loss_count = 0, 0
    print('DONE')
Example #10
def main(word):
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    corpus_size = len(corpus)

    model = RnnlmGen()

    # Set the start word and the words to skip
    start_word = word
    start_id = word_to_id[start_word]
    skip_words = ['N', '<unk>', '$']
    skip_ids = [word_to_id[w] for w in skip_words]

    # Generate text
    word_ids = model.generate(start_id, skip_ids=skip_ids)
    txt = ' '.join([id_to_word[i] for i in word_ids])
    txt = txt.replace(' <eos>', '.\n')
    print(txt)
def test_show_ptb():
    """Penn Treebank(ペン・ツリー・バンク)の一部を表示する.
    手頃なサイズのコーパス
    Word2Vecの作者(Tomas Mikolov)のWebページからダウンロードできる.
    """

    corpus, word_to_id, id_to_word = ptb.load_data('train')

    print('corpus size: ', len(corpus))
    print('corpus[:30]: ', corpus[:30])
    print()
    print('id_to_word[0]: ', id_to_word[0])
    print('id_to_word[1]: ', id_to_word[1])
    print('id_to_word[2]: ', id_to_word[2])
    print()
    print("word_to_id['car']: ", word_to_id['car'])
    print("word_to_id['happy']: ", word_to_id['happy'])
    print("word_to_id['lexus']: ", word_to_id['lexus'])
Example #12
def main():
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    corpus_size = len(corpus)

    model = RnnlmGen()
    # model.load_params('Rnnlm.pkl')

    start_word = 'you'
    start_id = word_to_id[start_word]
    skip_words = ['N', '<unk>', '$']
    skip_ids = [word_to_id[w] for w in skip_words]

    word_ids = model.generate(start_id, skip_ids)
    txt = ' '.join([id_to_word[i] for i in word_ids])
    txt = txt.replace(' <eos>', '.\n')
    print(txt)
    print('DONE')
def main():
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    corpus_size = len(corpus)

    model = RnnlmGen()
    # model.load_params('../ch06_RNNWithGate/Rnnlm.pkl')

    # Set the start word and the words to skip
    start_word = 'you'
    start_id = word_to_id[start_word]
    skip_words = ['N', '<unk>', '$']
    skip_ids = [word_to_id[w] for w in skip_words]

    # Generate text
    word_ids = model.generate(start_id, skip_ids)  # sample size 100
    txt = ' '.join([id_to_word[i] for i in word_ids])
    txt = txt.replace(' <eos>', '.\n')
    print(txt)
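
Examples #10 and #12 rely on RnnlmGen.generate without showing it. A hedged sketch of what such a method might do, assuming model.predict(x) returns scores over the vocabulary for a (1, 1) input: sample the next word id from the softmax distribution and skip any id listed in skip_ids.

import numpy as np

def generate_sketch(model, start_id, skip_ids=None, sample_size=100):
    # Not the repository's exact code: a simplified sampling loop.
    word_ids = [start_id]
    x = start_id
    while len(word_ids) < sample_size:
        scores = model.predict(np.array(x).reshape(1, 1)).flatten()
        scores = scores - scores.max()          # numerically stable softmax
        p = np.exp(scores) / np.exp(scores).sum()
        sampled = int(np.random.choice(len(p), p=p))
        if skip_ids is None or sampled not in skip_ids:
            x = sampled
            word_ids.append(sampled)
    return word_ids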
Example #14
def main():
    window_size = 2
    wordvec_size = 100

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)

    print('counting co-occurrence...')
    co_matrix = create_co_matrix(corpus, vocab_size, window_size)
    print('calculating PPMI...')
    W = ppmi(co_matrix, verbose=True)

    print('calculating SVD ...')
    U, S, V = randomized_svd(W,
                             n_components=wordvec_size,
                             n_iter=5,
                             random_state=42)

    word_vecs = U[:, :wordvec_size]

    queries = ['you', 'year', 'car', 'toyota']
    for query in queries:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
    print('DONE')
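
most_similar is another common/util helper these SVD examples assume. A hedged sketch of the idea, ranking the vocabulary by cosine similarity to the query's word vector:

import numpy as np

def most_similar_sketch(query, word_to_id, id_to_word, word_vecs, top=5):
    # Assumption: rows of word_vecs are word vectors indexed by word id.
    if query not in word_to_id:
        print(f'{query} is not found')
        return
    query_vec = word_vecs[word_to_id[query]]
    norms = np.linalg.norm(word_vecs, axis=1) * np.linalg.norm(query_vec) + 1e-8
    similarity = word_vecs @ query_vec / norms
    print(f'\n[query] {query}')
    for i in (-similarity).argsort()[1:top + 1]:  # index 0 is the query itself
        print(f' {id_to_word[int(i)]}: {similarity[int(i)]:.4f}')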
Example #15
# ↑ when using a GPU
import pickle
from common import config  # provides the config.GPU flag checked below
from common.trainer import Trainer
from common.optimizer import Adam
from cbow import CBOW
from common.util import create_contexts_target, to_cpu, to_gpu
from dataset import ptb

# Hyperparameter settings
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Load the data
corpus, wordtoid, idtoword = ptb.load_data('train')
vocab_size = len(wordtoid)

contexts, target = create_contexts_target(corpus, window_size)
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)

# Create the model and related objects
model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Start training
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()
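
Example #15 moves the data with to_gpu when config.GPU is set. A hypothetical sketch of what such transfer helpers could look like, assuming CuPy is installed (not the repository's exact code):

import numpy as np

def to_gpu(x):
    import cupy
    return cupy.asarray(x)   # copy a NumPy array to GPU memory

def to_cpu(x):
    if isinstance(x, np.ndarray):
        return x
    import cupy
    return cupy.asnumpy(x)   # copy a CuPy array back to host memory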
Example #16
import sys
sys.path.append("..")
import numpy as np
from common.util import most_similar, create_co_matrix, ppmi
from dataset import ptb

window_size = 2
wordvec_size = 100

corpus, word_to_id, id_to_word = ptb.load_data("train")
vocab_size = len(word_to_id)
print("counting co-occurrence ...")
C = create_co_matrix(corpus, vocab_size, window_size)
print("calculating PPMI ...")
W = ppmi(C, verbose=True)

print("calculating SVD ...")
try:
    # truncated SVD (fast edition)
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W,
                             n_components=wordvec_size,
                             n_iter=5,
                             random_state=None)
except ImportError:
    # normal SVD (slow)
    U, S, V = np.linalg.svd(W)

word_vecs = U[:, :wordvec_size]

querys = ["you", "year", "car", "toyota", "ferrari"]
Example #17
# coding: utf-8
import sys
sys.path.append('..')
from ch06.rnnlm import Rnnlm
from ch06.better_rnnlm import BetterRnnlm
from dataset import ptb
from common.util import eval_perplexity


if __name__ == '__main__':
    model = Rnnlm()
    #model = BetterRnnlm()

    # Load the trained parameters
    model.load_params()

    corpus, _, _ = ptb.load_data('test')

    model.reset_state()
    ppl_test = eval_perplexity(model, corpus)
    print('test perplexity: ', ppl_test)
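
Several examples above call model.save_params() and model.load_params() without showing them. A hedged sketch of the idea (the repository's file format may differ): pickle the list of parameter arrays, and copy them back in place on load.

import pickle
import numpy as np

def save_params_sketch(model, file_name='Rnnlm.pkl'):
    with open(file_name, 'wb') as f:
        pickle.dump([p.astype(np.float16) for p in model.params], f)

def load_params_sketch(model, file_name='Rnnlm.pkl'):
    with open(file_name, 'rb') as f:
        params = pickle.load(f)
    for dst, src in zip(model.params, params):
        dst[...] = src.astype(dst.dtype)  # copy in place so layer views stay valid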
Example #18
def get_perplexity(model):
    from common.util import eval_perplexity
    corpus_val, _, _ = ptb.load_data('val')
    ppl = eval_perplexity(model, corpus_val)
    return ppl
from common.optimizer import SGD
from common.trainer import RnnlmTrainer
from common.util import eval_perplexity
from dataset import ptb
from rnnlm import Rnnlm

# Hyperparameter settings
batch_size = 20
wordvec_size = 100
hidden_size = 100
time_size = 35
lr = 20
max_epoch = 4
max_grad = 0.25

# Load the training data
corpus, word_to_id, id_to_word = ptb.load_data("train")
corpus_test, _, _ = ptb.load_data("test")
vocab_size = len(word_to_id)
xs = corpus[:-1]
ts = corpus[1:]

# Create the model
model = Rnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

# Apply gradient clipping during training
trainer.fit(xs,
            ts,
            max_epoch,
            batch_size,
            time_size,
            max_grad)


def main():
    # Hyperparameter settings
    batch_size = 10
    wordvec_size = 100
    hidden_size = 500
    time_size = 5  # number of time steps to unroll for truncated BPTT
    lr = 0.1
    max_epoch = 100

    # Load the training dataset
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    corpus_size = 1000
    corpus = corpus[:corpus_size]
    vocab_size = int(max(corpus) + 1)

    print("corpus: ", corpus)
    print("corpus size: ", len(corpus))

    # Inputs
    xs = corpus[:-1]
    ts = corpus[1:]  # teacher labels = the next word id after each input
    print("corpus size: %d, vocabulary size: %d" % (corpus_size, vocab_size))
    data_size = len(xs)
    print("data_size: ", data_size)

    # Variables used during training
    max_iters = data_size // (batch_size * time_size)
    time_idx = 0
    total_loss = 0
    loss_count = 0
    ppl_list = []

    # Create the model
    model = SimpleRnnlm(vocab_size, wordvec_size, hidden_size)
    optimizer = SGD(lr)

    # (1) Compute the starting read offset for each sample in the mini-batch
    jump = (corpus_size - 1) // batch_size  # e.g. corpus_size=1000, batch_size=10 -> (1000-1)//10 -> 99
    print("jump: ", jump)
    offsets = [i * jump for i in range(batch_size)]  # batch_size=10, jump=99 -> [0, 99, 198, ..., 891]
    print("offsets: ", offsets)

    # Training
    for epoch in range(max_epoch):
        for iter in range(max_iters):

            # (2) Fetch the mini-batch
            batch_x = np.empty((batch_size, time_size), dtype=np.int32)
            batch_t = np.empty((batch_size, time_size), dtype=np.int32)
            for t in range(time_size):
                for i, offset in enumerate(offsets):
                    print(f"offset: {offset}, time_idx: {time_idx} -> {(offset + time_idx) % data_size}")
                    batch_x[i, t] = xs[(offset + time_idx) % data_size]
                    batch_t[i, t] = ts[(offset + time_idx) % data_size]
                time_idx += 1  # never reset to 0, so each iteration shifts the read position relative to the corpus offsets
            
            # Compute the gradients and update the parameters
            loss = model.forward(batch_x, batch_t)
            model.backward()
            optimizer.update(model.params, model.grads)
            total_loss += loss
            loss_count += 1

        # (3) Evaluate perplexity after each epoch
        ppl = np.exp(total_loss / loss_count)
        print('| epoch %d | perplexity %.2f' % (epoch+1, ppl))
        ppl_list.append(float(ppl))
        total_loss, loss_count = 0, 0

    # Plot the graph
    x = np.arange(len(ppl_list))
    plt.plot(x, ppl_list, label='train')
    plt.xlabel('epochs')
    plt.ylabel('perplexity')
    plt.show()     
Example #21
import pickle
import numpy as np

# Keras imports assumed for the callback and layers used below
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Input, Embedding, Lambda
from tensorflow.keras import backend as K

from util import create_contexts_target, most_similar
from dataset import ptb
from negative_sampling import generate_with_negative_sample

tensorboard_callback = TensorBoard(log_dir='logs/cbow')

window_size = 10
hidden_size = 100
batch_size = 100
max_epoch = 15
sample_size = 5

corpus, word_to_id, id_to_word = ptb.load_data('train')
test_corpus = ptb.load_data('test')[0]

vocab_size = len(word_to_id)
contexts, target = create_contexts_target(corpus, window_size)
test_contexts, test_target = create_contexts_target(test_corpus, window_size)

contexts_input = Input(shape=(window_size * 2, ), name='contexts_input')
target_input = Input(shape=(1, ), name='target_input')

embed = Embedding(vocab_size, hidden_size, input_length=window_size * 2)

contexts_embed = embed(contexts_input)
contexts_hidden = Lambda(lambda arr: K.mean(arr, axis=1))(contexts_embed)

target_embed = Embedding(vocab_size, hidden_size, input_length=1)(target_input)
import pickle
import sys

import pandas as pd

from ivory.common.context import np
from ivory.common.dataset import TimeDataset
from ivory.core.trainer import sequential
from ivory.utils.repository import repo_directory

np.context = "gpu"

sys.path.append(repo_directory("scratch2"))
from dataset import ptb  # isort:skip

corpus, _, _ = ptb.load_data("train")
corpus_val, _, _ = ptb.load_data("val")
vocab_size = int(max(corpus) + 1)
x, t = corpus[:-1], corpus[1:]
data = TimeDataset((x, t), time_size=35, batch_size=20)
x, t = corpus_val[:-1], corpus_val[1:]
data_val = TimeDataset((x, t), time_size=35, batch_size=10)
print(data)
print(data_val)

# Set the hyperparameters.
wordvec_size = 650
hidden_size = 650
lr = 20.0
max_grad = 0.25
dropout = 0.5
from common.optimizer import SGD
from common.trainer import RnnlmTrainer
from common.util import eval_perplexity
from dataset import ptb
from rnnlm import Rnnlm

# Hyperparameter settings
batch_size = 20
wordvec_size = 100
hidden_size = 100  # number of elements in the RNN hidden-state vector
time_size = 35  # number of time steps to unroll the RNN
lr = 20.0
max_epoch = 4
max_grad = 0.25

# Load the training data
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_test, _, _ = ptb.load_data('test')
vocab_size = len(word_to_id)
xs = corpus[:-1]
ts = corpus[1:]

# Create the model
model = Rnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

# Train with gradient clipping applied
trainer.fit(xs, ts, max_epoch, batch_size, time_size, max_grad,
            eval_interval=20)
trainer.plot(ylim=(0, 500))
Example #24
import sys
sys.path.append('..')
from common.optimizer import SGD
from common.trainer import RnnlmTrainer
from dataset import ptb
from ch05.simple_rnnlm import SimpleRnnlm


# Hyperparameter settings
batch_size = 10
wordvec_size = 100
hidden_size = 100  # number of elements in the RNN hidden-state vector
time_size = 5  # number of time steps to unroll the RNN
lr = 0.1
max_epoch = 100

# Load the training data
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_size = 1000  # shrink the dataset for testing
corpus = corpus[:corpus_size]
vocab_size = int(max(corpus) + 1)
xs = corpus[:-1]  # inputs
ts = corpus[1:]  # outputs (teacher labels)

# Create the model
model = SimpleRnnlm(vocab_size, wordvec_size, hidden_size)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

trainer.fit(xs, ts, max_epoch, batch_size, time_size)
trainer.plot()
Example #25
def setPTBData(self):
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    self.corpus = corpus
    self.word_to_id = word_to_id
    self.id_to_word = id_to_word
    self.n_words = len(word_to_id)