import sys
sys.path.append('..')
import pickle
import numpy as np
from common.trainer import Trainer
from common.optimizer import Adam
from common.util import create_contexts_target
from cbow import CBOW  # model definition from the same chapter
from dataset import ptb


def main():
    # Hyperparameter settings
    window_size = 5
    hidden_size = 100
    batch_size = 100
    max_epoch = 10

    # Load the data
    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)
    contexts, target = create_contexts_target(corpus, window_size)

    # Build the model, optimizer, and trainer
    model = CBOW(vocab_size, hidden_size, window_size, corpus)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    # Start training
    trainer.fit(contexts, target, max_epoch, batch_size)
    trainer.plot()

    # Save the data needed for later use
    word_vecs = model.word_vecs
    params = {}
    params['word_vecs'] = word_vecs.astype(np.float16)
    params['word_to_id'] = word_to_id
    params['id_to_word'] = id_to_word
    pkl_file = 'cbow_params.pkl'
    with open(pkl_file, 'wb') as f:
        pickle.dump(params, f, -1)


if __name__ == '__main__':
    main()
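# Usage sketch (assumed, not part of the original script): reload the saved
# parameters later and query the learned vectors. This assumes common.util
# also provides most_similar, as in this codebase's evaluation script.
import pickle
from common.util import most_similar

with open('cbow_params.pkl', 'rb') as f:
    params = pickle.load(f)
word_vecs = params['word_vecs']    # (vocab_size, hidden_size), float16
word_to_id = params['word_to_id']
id_to_word = params['id_to_word']

most_similar('you', word_to_id, id_to_word, word_vecs, top=5)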
def test_convert_one_hot_contexts(self):
    text = 'you say goodbye and I say hello.'
    corpus, w2id, id2w = preprocess(text)
    contexts, target = create_contexts_target(corpus, 1)
    contexts = convert_one_hot(contexts, len(w2id))
    self.assertEqual(contexts.shape, (6, 2, 7))
def test_create_contexts_target(self):
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    contexts, target = create_contexts_target(corpus, window_size=1)
    expected_contexts = np.array([[0, 2], [1, 3], [2, 4],
                                  [3, 1], [4, 5], [1, 6]])
    expected_target = np.array([1, 2, 3, 4, 1, 5])
    npt.assert_array_equal(contexts, expected_contexts)
    npt.assert_array_equal(target, expected_target)
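# For reference (derived from the test above): preprocess lowercases the text
# and assigns IDs in order of first appearance, so
#     corpus = [0, 1, 2, 3, 4, 1, 5, 6]   # you say goodbye and i say hello .
# With window_size=1 every target in corpus[1:-1] is paired with its immediate
# left and right neighbors, which yields expected_contexts and expected_target.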
import sys
sys.path.append('..')
from common.util import preprocess, create_contexts_target, convert_one_hot


def main():
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    contexts, target = create_contexts_target(corpus, window_size=1)
    vocab_size = len(word_to_id)
    target = convert_one_hot(target, vocab_size)
    contexts = convert_one_hot(contexts, vocab_size)
    print(vocab_size)  # 7


if __name__ == '__main__':
    main()
def test_create_contexts_target(self):
    text = 'you say goodbye and I say hello.'
    corpus, w2id, id2w = preprocess(text)
    contexts, target = create_contexts_target(corpus)
    expected_contexts = np.array([[0, 2], [1, 3], [2, 4],
                                  [3, 1], [4, 5], [1, 6]])
    expected_target = [1, 2, 3, 4, 1, 5]
    np.testing.assert_array_almost_equal(contexts, expected_contexts)
    np.testing.assert_array_almost_equal(target, expected_target)
def test_convert_one_hot(self):
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    contexts, target = create_contexts_target(corpus, window_size=1)
    vocab_size = len(word_to_id)
    target = convert_one_hot(target, vocab_size)
    contexts = convert_one_hot(contexts, vocab_size)
    expected_target = np.array([
        [0, 1, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0]])
    expected_contexts = np.array([
        [[1, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0]],
        [[0, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0, 0]],
        [[0, 0, 1, 0, 0, 0, 0],
         [0, 0, 0, 0, 1, 0, 0]],
        [[0, 0, 0, 1, 0, 0, 0],
         [0, 1, 0, 0, 0, 0, 0]],
        [[0, 0, 0, 0, 1, 0, 0],
         [0, 0, 0, 0, 0, 1, 0]],
        [[0, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1]]])
    npt.assert_array_equal(target, expected_target)
    npt.assert_array_equal(contexts, expected_contexts)
import sys
sys.path.append('..')
from common.trainer import Trainer
from common.optimizer import Adam
from common.util import preprocess, create_contexts_target, convert_one_hot
from simple_cbow import SimpleCBOW


def main():
    window_size = 1
    hidden_size = 5
    batch_size = 3
    max_epoch = 1000

    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    vocab_size = len(word_to_id)

    contexts, target = create_contexts_target(corpus, window_size)
    target = convert_one_hot(target, vocab_size)
    contexts = convert_one_hot(contexts, vocab_size)

    model = SimpleCBOW(vocab_size, hidden_size)
    optimizer = Adam()
    trainer = Trainer(model, optimizer)

    trainer.fit(contexts, target, max_epoch, batch_size)
    trainer.plot()


if __name__ == '__main__':
    main()
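# Usage sketch (an assumed follow-up at the end of main(), not in the original
# snippet): SimpleCBOW stores the learned input weights as model.word_vecs,
# so each row can be printed as the distributed representation of one word.
word_vecs = model.word_vecs
for word_id, word in id_to_word.items():
    print(word, word_vecs[word_id])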
import sys
sys.path.append('..')
import numpy as np
from common.util import preprocess, convert_one_hot


def create_contexts_target(corpus, window_size=1):
    # Targets are every word that has a full window on both sides.
    target = corpus[window_size:-window_size]
    contexts = []
    for idx in range(window_size, len(corpus) - window_size):
        cs = []
        for t in range(-window_size, window_size + 1):
            if t == 0:
                continue  # skip the target word itself
            cs.append(corpus[idx + t])
        contexts.append(cs)
    return np.array(contexts), np.array(target)


# common.util provides the same helper; the local definition above is used here.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

contexts, target = create_contexts_target(corpus, window_size=1)
print(contexts)
print(target)

vocab_size = len(word_to_id)
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)
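# Minimal sketch of convert_one_hot (an assumption about common.util's helper,
# matching the shapes the tests above expect): a 1-D array of word IDs becomes
# (N, vocab_size); a 2-D contexts array becomes (N, context_size, vocab_size).
import numpy as np

def convert_one_hot(corpus, vocab_size):
    N = corpus.shape[0]
    if corpus.ndim == 1:  # target IDs: (N,) -> (N, vocab_size)
        one_hot = np.zeros((N, vocab_size), dtype=np.int32)
        for idx, word_id in enumerate(corpus):
            one_hot[idx, word_id] = 1
    elif corpus.ndim == 2:  # context IDs: (N, C) -> (N, C, vocab_size)
        C = corpus.shape[1]
        one_hot = np.zeros((N, C, vocab_size), dtype=np.int32)
        for idx_0, word_ids in enumerate(corpus):
            for idx_1, word_id in enumerate(word_ids):
                one_hot[idx_0, idx_1, word_id] = 1
    return one_hot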
import sys
sys.path.append('..')
import pickle
import numpy as np
from common import config
from common.trainer import Trainer
from common.optimizer import Adam
from cbow import CBOW
from common.util import create_contexts_target, to_cpu, to_gpu
from dataset import ptb

# Hyperparameter settings
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Read the data
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)  # 10000
contexts, target = create_contexts_target(corpus, window_size)  # contexts: neighboring words; target: center words
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)

# Create the model, optimizer, and trainer
model = CBOW(vocab_size, hidden_size, window_size, corpus)
# model = SkipGram(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Start training
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# Save the data needed for later use (pickle can write/read Python objects to/from a file)
word_vecs = model.word_vecs
if config.GPU:
    word_vecs = to_cpu(word_vecs)
params = {}
params['word_vecs'] = word_vecs.astype(np.float16)
params['word_to_id'] = word_to_id
params['id_to_word'] = id_to_word
pkl_file = 'cbow_params.pkl'
with open(pkl_file, 'wb') as f:
    pickle.dump(params, f, -1)
import sys
sys.path.append('..')
from common.trainer import Trainer
from common.optimizer import Adam
from simple_cbow import SimpleCBOW
from simple_skip_gram import SimpleSkipGram
from common.util import preprocess, create_contexts_target, convert_one_hot

window_size = 1
hidden_size = 5
batch_size = 3
max_epoch = 1000

text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)  # corpus: word IDs for the sentence (including '.')
vocab_size = len(word_to_id)  # number of distinct words in the sentence: 7

contexts, target = create_contexts_target(corpus, window_size)  # contexts within window_size; target word IDs
target = convert_one_hot(target, vocab_size)  # (6, 7): one-hot targets over vocab_size
contexts = convert_one_hot(contexts, vocab_size)  # (6, 2, 7)

# model = SimpleSkipGram(vocab_size, hidden_size)
model = SimpleCBOW(vocab_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()