def train_augmented(network_size, dataset_kind, tying=False, epochs=40, stride=0):
    """Train an AugmentedModel on the chosen dataset and save the weights.

    Args:
        network_size: model-size key used to look up the ProposedSetting.
        dataset_kind: dataset key ("ptb" or wiki2) passed to prepare_dataset.
        tying: enable input/output embedding weight tying.
        epochs: number of training epochs.
        stride: window stride forwarded to the batch iterators.
    """
    setting = ProposedSetting(network_size, dataset_kind)
    dataset = prepare_dataset(dataset_kind)
    vocab_size = len(dataset.vocab_data())
    sequence_size = 20

    processor = DataProcessor()
    train_steps, train_generator = processor.make_batch_iter(
        dataset, sequence_size=sequence_size, stride=stride)
    valid_steps, valid_generator = processor.make_batch_iter(
        dataset, kind="valid", sequence_size=sequence_size, stride=stride)

    # Build, train, and persist the (optionally weight-tied) augmented model.
    model = AugmentedModel(vocab_size, sequence_size, setting, tying=tying,
                           checkpoint_path=LOG_ROOT)
    model.compile()
    model.fit_generator(train_generator, train_steps,
                        valid_generator, valid_steps, epochs=epochs)
    model.save(MODEL_ROOT)
def prepare_dataset(dataset_kind):
    """Return an indexed dataset: PTB (10k vocab) for "ptb", else WikiText-2 (30k vocab)."""
    processor = DataProcessor()
    if dataset_kind == "ptb":
        return processor.get_ptb(DATA_ROOT, vocab_size=10000)
    return processor.get_wiki2(DATA_ROOT, vocab_size=30000)
def test_model_tying(self):
    """Smoke-train the weight-tied AugmentedModel on a repeated random sequence."""
    vocab_size = 10
    sequence_size = 20
    processor = DataProcessor()
    # One random sequence tiled 10 times so the model has something learnable.
    base_sequence = np.array(np.random.randint(vocab_size, size=sequence_size))
    samples = np.tile(base_sequence, 10)
    x, y = processor.format(samples, vocab_size, sequence_size)
    x_t, y_t = processor.format(samples, vocab_size, sequence_size)
    model = AugmentedModel(vocab_size, sequence_size, tying=True)
    model.compile()
    print("tying model ---------------")
    model.fit(x, y, x_t, y_t, epochs=20)
def test_format(self):
    """format() should slice samples into (batch, seq) inputs and one-hot shifted targets."""
    processor = DataProcessor()
    samples = np.array([-1] + list(range(10)))
    x, y = processor.format(samples, 10, 5)
    # Expected layout:
    #        x       |     y
    # ---------------+------------
    #  -1 0 1 2 3    | 0 1 2 3 4
    #   4 5 6 7 8    | 5 6 7 8 9
    self.assertEqual(x.shape, (2, 5))
    self.assertEqual(y.shape, (2, 5, 10))
    for row in range(x.shape[0]):
        # Each target row is the input row shifted left by one position.
        decoded_targets = np.argmax(y[row][:-1], axis=1).flatten().tolist()
        self.assertEqual(x[row][1:].tolist(), decoded_targets)
def test_generator(self):
    """Verify make_batch_iter yields (X, y) batches matching the raw valid-split tokens."""
    data_root = os.path.join(os.path.dirname(__file__), "data")
    if not os.path.exists(data_root):
        os.mkdir(data_root)
    # Download PTB and build an indexed view with a 10k vocabulary.
    r = chazutsu.datasets.PTB().download(data_root)
    r_idx = r.to_indexed().make_vocab(vocab_size=10000)
    dp = DataProcessor()
    batch_size = 10
    sequence_size = 15
    vocab_size = len(r_idx.vocab_data())
    steps_per_epoch, generator = dp.make_batch_iter(
        r_idx, "valid", batch_size, sequence_size)
    words_in_batch = sequence_size * batch_size
    check_count = 5
    max_count = words_in_batch * check_count
    # Read just enough raw token ids from the valid file to cover check_count batches.
    words = []
    with open(r_idx.valid_file_path, encoding="utf-8") as f:
        for line in f:
            words += r_idx.str_to_ids(line.strip())
            if len(words) > max_count:
                break
    for i in range(check_count):
        X, y = next(generator)
        self.assertEqual(X.shape, (batch_size, sequence_size))
        self.assertEqual(y.shape, (batch_size, sequence_size, vocab_size))
        # NOTE: this r shadows the dataset handle above; it is a batch-row index here.
        for r in range(X.shape[0]):
            # Row r of batch i should start at word offset
            # i * words_in_batch + r * sequence_size, and y is x shifted by one.
            index = i * words_in_batch
            seq = words[index + r * sequence_size:][:sequence_size]
            next_seq = words[index + r * sequence_size + 1:][:sequence_size]
            self.assertEqual(X[r].tolist(), seq)
            self.assertEqual(
                np.argmax(y[r], axis=1).flatten().tolist(), next_seq)
    # Drop the generator reference before deleting the files it reads from.
    generator = None
    shutil.rmtree(data_root)
def test_one_hot_forward(self):
    """Overfit OneHotModel on a tiled random sequence and print its predictions."""
    vocab_size = 10
    sequence_size = 20
    checkpoint_path = os.path.join(os.path.dirname(__file__), "checkpoints")
    processor = DataProcessor()
    # Memorizable data: one random sequence repeated 10 times.
    test_seq = np.random.randint(vocab_size, size=sequence_size)
    samples = np.tile(test_seq, 10)
    x, y = processor.format(samples, vocab_size, sequence_size)
    x_t, y_t = processor.format(samples, vocab_size, sequence_size)

    model = OneHotModel(vocab_size, sequence_size,
                        checkpoint_path=checkpoint_path)
    model.compile()
    model.fit(x, y, x_t, y_t, epochs=20)
    print(model.model.optimizer.get_config())

    # After overfitting, predictions should echo the memorized sequence.
    pred_seq = np.random.choice(test_seq, 3)
    pred = model.predict(pred_seq)
    print(test_seq)
    for s, p in zip(pred_seq, pred):
        print("{} -> {}".format(s, p))
    shutil.rmtree(checkpoint_path)
def main(kind, epoch):
    """Train the model selected by ``kind`` on the sentence corpus and print predictions.

    Args:
        kind: model selector — 0: OneHotModel, 1: AugmentedModel,
            2: AugmentedModel with weight tying.
        epoch: number of training epochs.

    Raises:
        ValueError: if ``kind`` is not 0, 1, or 2.
    """
    if not os.path.exists(EVAL_ROOT):
        os.mkdir(EVAL_ROOT)
    sequence_size = 20

    words, vocab = read_sentences()
    vocab_size = len(vocab)
    # Last quarter of the corpus is held out; its final 20 words are the demo set.
    valid_size = int(len(words) / 4)
    train_seq = words[:-valid_size]
    valid_seq = words[-valid_size:-20]
    test_seq = words[-20:]
    print("{} train, {} valid ({} vocab)".format(
        len(train_seq), len(valid_seq), len(vocab)))

    dp = DataProcessor()
    # Build overlapping windows by shifting the start offset one word at a time,
    # so every window alignment appears in the data. Accumulate the pieces in
    # lists and stack once at the end — the original np.vstack inside the loop
    # copied the whole accumulated array on every iteration (quadratic).
    xs, ys, xs_t, ys_t = [], [], [], []
    for offset in range(sequence_size):
        _x, _y = dp.format(train_seq[offset:], vocab_size, sequence_size)
        xs.append(_x)
        ys.append(_y)
        _x_t, _y_t = dp.format(valid_seq[offset:], vocab_size, sequence_size)
        xs_t.append(_x_t)
        ys_t.append(_y_t)
    x = np.vstack(xs)
    y = np.vstack(ys)
    x_t = np.vstack(xs_t)
    y_t = np.vstack(ys_t)

    if kind == 0:
        print("Build OneHot Model")
        model = OneHotModel(vocab_size, sequence_size,
                            checkpoint_path=EVAL_ROOT)
    elif kind == 1:
        print("Build Augmented Model")
        model = AugmentedModel(vocab_size, sequence_size,
                               checkpoint_path=EVAL_ROOT)
    elif kind == 2:
        print("Build Augmented(Tying) Model")
        model = AugmentedModel(vocab_size, sequence_size, tying=True,
                               checkpoint_path=EVAL_ROOT)
    else:
        # ValueError is a subclass of Exception, so existing broad handlers still work.
        raise ValueError("Model kind is not specified!")

    model.compile()
    model.fit(x, y, x_t, y_t, epochs=epoch)
    model_pred = model.predict(test_seq)

    # Map ids back to words for a human-readable prediction dump.
    rev_vocab = {v: k for k, v in vocab.items()}
    print([rev_vocab[i] for i in test_seq])
    for s, p in zip(test_seq, model_pred):
        print("{} -> {}".format(rev_vocab[s], rev_vocab[p]))