Example #1
def main():
    # Set hyperparameters
    window_size = 5
    hidden_size = 100
    batch_size = 100
    max_epoch = 10

    # Load the data
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)

    contexts, target = create_contexts_target(corpus, window_size)

    # Create the model, optimizer, and trainer
    model = CBOW(vocab_size, hidden_size, window_size, corpus)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    # Start training
    trainer.fit(contexts, target, max_epoch, batch_size)
    trainer.plot()

    # Save the data needed for later use
    word_vecs = model.word_vecs

    params = {}
    params['word_vecs'] = word_vecs.astype(np.float16)
    params['word_to_id'] = word_to_id
    params['id_to_word'] = id_to_word
    pkl_file = 'cbow_params.pkl'
    with open(pkl_file, 'wb') as f:
        pickle.dump(params, f, -1)
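
Once training finishes, the pickled parameters can be reloaded and queried in a separate script. A minimal sketch, assuming the most_similar helper from the common.util module used in the other examples on this page (its exact signature is assumed here as most_similar(query, word_to_id, id_to_word, word_matrix, top)):

import pickle
from common.util import most_similar  # assumed helper for nearest-neighbour lookup

with open('cbow_params.pkl', 'rb') as f:
    params = pickle.load(f)

word_vecs = params['word_vecs']
word_to_id = params['word_to_id']
id_to_word = params['id_to_word']

# Print the words whose vectors are closest to each query word.
for query in ['you', 'year', 'car', 'toyota']:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)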
    def test_convert_one_hot_contexts(self):
        text = 'you say goodbye and I say hello.'
        corpus, w2id, id2w = preprocess(text)
        contexts, target = create_contexts_target(corpus, 1)

        contexts = convert_one_hot(contexts, len(w2id))

        self.assertEqual(contexts.shape, (6, 2, 7))
    def test_create_contexts_target(self):
        text = 'You say goodbye and I say hello.'
        corpus, word_to_id, id_to_word = preprocess(text)
        contexts, target = create_contexts_target(corpus, window_size=1)

        expected_contexts = np.array([[0, 2], [1, 3], [2, 4], [3, 1], [4, 5], [1, 6]])
        expected_target = np.array([1, 2, 3, 4, 1, 5])

        npt.assert_array_equal(contexts, expected_contexts)
        npt.assert_array_equal(target, expected_target)
Example #4
def main():
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)

    contexts, target = create_contexts_target(corpus, window_size=1)

    vocab_size = len(word_to_id)
    target = convert_one_hot(target, vocab_size)

    contexts = convert_one_hot(contexts, vocab_size)

    print(vocab_size)
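
    # For the 7-word vocabulary and window_size=1, the one-hot arrays produced
    # above have the following shapes (the tests further down assert the same):
    print(target.shape)    # (6, 7)    -- one one-hot row per target word
    print(contexts.shape)  # (6, 2, 7) -- two one-hot context rows per target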
    def test_create_contexts_target(self):
        text = 'you say goodbye and I say hello.'
        corpus, w2id, id2w = preprocess(text)

        contexts, target = create_contexts_target(corpus)

        expected_contexts = np.array([[0, 2], [1, 3], [2, 4], [3, 1], [4, 5],
                                      [1, 6]])
        expected_target = [1, 2, 3, 4, 1, 5]

        np.testing.assert_array_almost_equal(contexts, expected_contexts)
        np.testing.assert_array_almost_equal(target, expected_target)
    def test_convert_one_hot(self):
        text = 'You say goodbye and I say hello.'
        corpus, word_to_id, id_to_word = preprocess(text)
        contexts, target = create_contexts_target(corpus, window_size=1)
        vocab_size = len(word_to_id)
        target = convert_one_hot(target, vocab_size)
        contexts = convert_one_hot(contexts, vocab_size)

        expected_target = np.array([
            [0, 1, 0, 0, 0, 0, 0],
            [0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 1, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0],
            [0, 1, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 1, 0]])
        expected_contexts = np.array([
            [
                [1, 0, 0, 0, 0, 0, 0],
                [0, 0, 1, 0, 0, 0, 0]
            ],
            [
                [0, 1, 0, 0, 0, 0, 0],
                [0, 0, 0, 1, 0, 0, 0]
            ],
            [
                [0, 0, 1, 0, 0, 0, 0],
                [0, 0, 0, 0, 1, 0, 0]
            ],
            [
                [0, 0, 0, 1, 0, 0, 0],
                [0, 1, 0, 0, 0, 0, 0]
            ],
            [
                [0, 0, 0, 0, 1, 0, 0],
                [0, 0, 0, 0, 0, 1, 0]
            ],
            [
                [0, 1, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 1]
            ]])

        npt.assert_array_equal(target, expected_target)
        npt.assert_array_equal(contexts, expected_contexts)
Example #7
def main():
    window_size = 1
    hidden_size = 5
    batch_size = 3
    max_epoch = 1000

    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)

    vocab_size = len(word_to_id)
    contexts, target = create_contexts_target(corpus, window_size)
    target = convert_one_hot(target, vocab_size)
    contexts = convert_one_hot(contexts, vocab_size)

    model = SimpleCBOW(vocab_size, hidden_size)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    trainer.fit(contexts, target, max_epoch, batch_size)
    trainer.plot()
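
    # Inspect the learned distributed representations. This assumes SimpleCBOW
    # exposes its input-side weight matrix as word_vecs, as the CBOW model in
    # the other examples on this page does.
    word_vecs = model.word_vecs
    for word_id, word in id_to_word.items():
        print(word, word_vecs[word_id])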
Example #8
import sys
sys.path.append('..')
import numpy as np
from common.util import preprocess


def create_contexts_target(corpus, window_size=1):
    # Target words are every word except the first/last window_size words.
    target = corpus[window_size:-window_size]
    contexts = []

    # For each target position, collect the window_size words on either side.
    for idx in range(window_size, len(corpus) - window_size):
        cs = []
        for t in range(-window_size, window_size + 1):
            if t == 0:
                continue  # skip the target word itself
            cs.append(corpus[idx + t])
        contexts.append(cs)

    return np.array(contexts), np.array(target)


text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

contexts, target = create_contexts_target(corpus, window_size=1)

print(contexts)
print(target)
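
For this sentence, preprocess assigns the IDs you=0, say=1, goodbye=2, and=3, i=4, hello=5, .=6 (the same mapping the tests above rely on), so the two prints should show:

[[0 2]
 [1 3]
 [2 4]
 [3 1]
 [4 5]
 [1 6]]
[1 2 3 4 1 5]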

import sys
sys.path.append('..')
from common.util import preprocess, create_contexts_target, convert_one_hot

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

contexts, target = create_contexts_target(corpus, window_size=1)

vocab_size = len(word_to_id)
target = convert_one_hot(target, vocab_size)
Example #9
from common import config                # assumed: provides the config.GPU flag used below
from common.optimizer import Adam        # assumed module paths, following Example #10
from common.trainer import Trainer
from common.util import create_contexts_target, to_cpu, to_gpu
from cbow import CBOW                    # assumed local module defining the CBOW model
from dataset import ptb

# Set hyperparameters
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Load the data
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)  # 10000

contexts, target = create_contexts_target(corpus, window_size)  # context word IDs and target (center) word IDs
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)

# Create the model, optimizer, and trainer
model = CBOW(vocab_size, hidden_size, window_size, corpus)
# model = SkipGram(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Start training
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# Save the data needed for later use (pickle can write Python objects to a file and read them back)
word_vecs = model.word_vecs
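
The snippet stops here; the saving step that the comment above describes mirrors Example #1. A minimal sketch (the to_cpu call only matters when config.GPU is enabled and the arrays live on the GPU):

import pickle
import numpy as np

if config.GPU:
    word_vecs = to_cpu(word_vecs)  # move the embeddings back to host memory before pickling

params = {}
params['word_vecs'] = word_vecs.astype(np.float16)
params['word_to_id'] = word_to_id
params['id_to_word'] = id_to_word
pkl_file = 'cbow_params.pkl'
with open(pkl_file, 'wb') as f:
    pickle.dump(params, f, -1)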
Example #10
from common.optimizer import Adam
from common.trainer import Trainer  # assumed module path for the Trainer used below
from simple_cbow import SimpleCBOW
from simple_skip_gram import SimpleSkipGram
from common.util import preprocess, create_contexts_target, convert_one_hot

window_size = 1
hidden_size = 5
batch_size = 3
max_epoch = 1000

text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)  # corpus: word IDs for the sentence (including '.')

vocab_size = len(word_to_id)  # number of distinct words in the sentence: 7
contexts, target = create_contexts_target(corpus, window_size)  # context word IDs and target (center) word IDs

target = convert_one_hot(target, vocab_size)      # (6, 7): one one-hot row per target word
contexts = convert_one_hot(contexts, vocab_size)  # (6, 2, 7): two one-hot context rows per target

#model = SimpleSkipGram(vocab_size, hidden_size)
model = SimpleCBOW(vocab_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()