Code example #1: train_augmented, the training routine for the AugmentedModel
def train_augmented(network_size,
                    dataset_kind,
                    tying=False,
                    epochs=40,
                    stride=0):
    # prepare the data
    setting = ProposedSetting(network_size, dataset_kind)
    dataset = prepare_dataset(dataset_kind)
    vocab_size = len(dataset.vocab_data())
    sequence_size = 20

    dp = DataProcessor()
    train_steps, train_generator = dp.make_batch_iter(
        dataset, sequence_size=sequence_size, stride=stride)
    valid_steps, valid_generator = dp.make_batch_iter(
        dataset, kind="valid", sequence_size=sequence_size, stride=stride)

    # build the augmented model (optionally with weight tying)
    model = AugmentedModel(vocab_size,
                           sequence_size,
                           setting,
                           tying=tying,
                           checkpoint_path=LOG_ROOT)
    model.compile()
    model.fit_generator(train_generator,
                        train_steps,
                        valid_generator,
                        valid_steps,
                        epochs=epochs)
    model.save(MODEL_ROOT)
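
As a usage sketch (the argument values below are hypothetical and must match whatever ProposedSetting and prepare_dataset accept):

# Hypothetical invocation; "small" and "ptb" are assumed valid inputs.
train_augmented("small", "ptb", tying=True, epochs=40)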
Code example #2: prepare_dataset, the dataset preparation helper
def prepare_dataset(dataset_kind):
    dp = DataProcessor()
    if dataset_kind == "ptb":
        dataset = dp.get_ptb(DATA_ROOT, vocab_size=10000)
    else:
        dataset = dp.get_wiki2(DATA_ROOT, vocab_size=30000)

    return dataset
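
Code example #1 consumes this helper roughly as follows (a minimal sketch; note that any dataset_kind other than "ptb" falls through to WikiText-2):

dataset = prepare_dataset("ptb")
vocab_size = len(dataset.vocab_data())  # the vocab was built with size 10000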
Code example #3: a unit test for the weight-tying model
    def test_model_tying(self):
        vocab_size = 10
        sequence_size = 20

        dp = DataProcessor()
        samples = np.tile(
            np.random.randint(vocab_size, size=sequence_size), 10)
        x, y = dp.format(samples, vocab_size, sequence_size)
        x_t, y_t = dp.format(samples, vocab_size, sequence_size)

        model = AugmentedModel(vocab_size, sequence_size, tying=True)
        model.compile()
        print("tying model ---------------")
        model.fit(x, y, x_t, y_t, epochs=20)
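
Because samples is the same 20-step random sequence tiled 10 times, the tied model should fit it almost perfectly; the test mainly verifies that the weight-tying code path compiles and trains without errors.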
Code example #4: a unit test for DataProcessor.format
    def test_format(self):
        dp = DataProcessor()
        samples = np.array([-1] + list(range(10)))
        x, y = dp.format(samples, 10, 5)

        # x          | y
        # ----------------------
        # -1 0 1 2 3 | 0 1 2 3 4
        #  4 5 6 7 8 | 5 6 7 8 9

        self.assertEqual(x.shape, (2, 5))
        self.assertEqual(y.shape, (2, 5, 10))
        for i in range(x.shape[0]):
            self.assertEqual(x[i][1:].tolist(),
                             np.argmax(y[i][:-1], axis=1).flatten().tolist())
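
From these assertions the behavior of DataProcessor.format can be inferred: cut the id sequence into non-overlapping windows of sequence_size, keep the raw ids as x, and one-hot encode the ids shifted one step ahead as y. Below is a minimal NumPy sketch of that behavior; format_sketch is a hypothetical stand-in, not the project's implementation:

import numpy as np

def format_sketch(samples, vocab_size, sequence_size):
    # Non-overlapping windows; y[t] is the one-hot of the token after x[t].
    samples = np.asarray(samples)
    n = (len(samples) - 1) // sequence_size  # number of complete windows
    x = np.zeros((n, sequence_size), dtype=int)
    y = np.zeros((n, sequence_size, vocab_size), dtype=int)
    for w in range(n):
        begin = w * sequence_size
        x[w] = samples[begin:begin + sequence_size]
        targets = samples[begin + 1:begin + sequence_size + 1]
        y[w, np.arange(sequence_size), targets] = 1
    return x, y

# format_sketch(np.array([-1] + list(range(10))), 10, 5) reproduces
# the table in the test above.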
Code example #5: a unit test for the batch generator
    def test_generator(self):
        data_root = os.path.join(os.path.dirname(__file__), "data")
        if not os.path.exists(data_root):
            os.mkdir(data_root)

        r = chazutsu.datasets.PTB().download(data_root)
        r_idx = r.to_indexed().make_vocab(vocab_size=10000)

        dp = DataProcessor()
        batch_size = 10
        sequence_size = 15
        vocab_size = len(r_idx.vocab_data())
        steps_per_epoch, generator = dp.make_batch_iter(
            r_idx, "valid", batch_size, sequence_size)

        words_in_batch = sequence_size * batch_size
        check_count = 5
        max_count = words_in_batch * check_count
        words = []
        with open(r_idx.valid_file_path, encoding="utf-8") as f:
            for line in f:
                words += r_idx.str_to_ids(line.strip())
                if len(words) > max_count:
                    break

        for i in range(check_count):
            X, y = next(generator)
            self.assertEqual(X.shape, (batch_size, sequence_size))
            self.assertEqual(y.shape, (batch_size, sequence_size, vocab_size))
            # "row" avoids shadowing the chazutsu resource "r" above
            for row in range(X.shape[0]):
                index = i * words_in_batch
                seq = words[index + row * sequence_size:][:sequence_size]
                next_seq = words[index + row * sequence_size +
                                 1:][:sequence_size]
                self.assertEqual(X[row].tolist(), seq)
                self.assertEqual(
                    np.argmax(y[row], axis=1).flatten().tolist(), next_seq)

        # drop the reference to the generator before deleting its data
        generator = None
        shutil.rmtree(data_root)
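
The index arithmetic above assumes each batch consumes batch_size * sequence_size consecutive words, with each row starting sequence_size words after the previous one. A small sketch of the resulting word offsets under that assumption:

batch_size, sequence_size = 10, 15
words_in_batch = batch_size * sequence_size  # 150 words per batch
for i in range(2):  # first two batches
    for row in range(batch_size):
        start = i * words_in_batch + row * sequence_size
        print("batch {}, row {}: words[{}:{}]".format(
            i, row, start, start + sequence_size))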
Code example #6: a unit test for OneHotModel training and prediction
    def test_one_hot_forward(self):
        vocab_size = 10
        sequence_size = 20
        checkpoint_path = os.path.join(os.path.dirname(__file__), "checkpoints")

        dp = DataProcessor()
        test_seq = np.random.randint(vocab_size, size=sequence_size)
        samples = np.tile(test_seq, 10)
        x, y = dp.format(samples, vocab_size, sequence_size)
        x_t, y_t = dp.format(samples, vocab_size, sequence_size)

        model = OneHotModel(vocab_size, sequence_size, checkpoint_path=checkpoint_path)
        model.compile()

        model.fit(x, y, x_t, y_t, epochs=20)
        print(model.model.optimizer.get_config())
        pred_seq = np.random.choice(test_seq, 3)
        pred = model.predict(pred_seq)
        # pred should emulate test_seq, since the model trained on its repetitions
        print(test_seq)
        for s, p in zip(pred_seq, pred):
            print("{} -> {}".format(s, p))

        shutil.rmtree(checkpoint_path)
Code example #7: an evaluation script that trains and compares the three model kinds
def main(kind, epoch):
    if not os.path.exists(EVAL_ROOT):
        os.mkdir(EVAL_ROOT)

    sequence_size = 20

    # (alternative: synthetic data for debugging)
    # vocab_size = 100
    # train_seq = sample_generator(vocab_size, 10000)
    # valid_seq = sample_generator(vocab_size, 2000)
    # test_seq = sample_generator(vocab_size, 20)

    words, vocab = read_sentences()
    vocab_size = len(vocab)
    valid_size = len(words) // 4  # last quarter, minus the final 20 test words
    train_seq = words[:-valid_size]
    valid_seq = words[-valid_size:-20]
    test_seq = words[-20:]
    print("{} train, {} valid ({} vocab)".format(len(train_seq),
                                                 len(valid_seq), len(vocab)))

    dp = DataProcessor()
    # Augment the data: format the corpus at every start offset from 0 to
    # sequence_size - 1 so every shifted window appears in the batches.
    x = None
    y = None
    x_t = None
    y_t = None
    for i in range(sequence_size):
        tseq = train_seq[i:]
        vseq = valid_seq[i:]
        _x, _y = dp.format(tseq, vocab_size, sequence_size)
        if x is None:
            x = _x
            y = _y
        else:
            x = np.vstack((x, _x))
            y = np.vstack((y, _y))

        _x_t, _y_t = dp.format(vseq, vocab_size, sequence_size)
        if x_t is None:
            x_t = _x_t
            y_t = _y_t
        else:
            x_t = np.vstack((x_t, _x_t))
            y_t = np.vstack((y_t, _y_t))

    if kind == 0:
        print("Build OneHot Model")
        model = OneHotModel(vocab_size,
                            sequence_size,
                            checkpoint_path=EVAL_ROOT)
    elif kind == 1:
        print("Build Augmented Model")
        model = AugmentedModel(vocab_size,
                               sequence_size,
                               checkpoint_path=EVAL_ROOT)
    elif kind == 2:
        print("Build Augmented(Tying) Model")
        model = AugmentedModel(vocab_size,
                               sequence_size,
                               tying=True,
                               checkpoint_path=EVAL_ROOT)
    else:
        raise ValueError("Unknown model kind: {}".format(kind))

    model.compile()
    model.fit(x, y, x_t, y_t, epochs=epoch)
    model_pred = model.predict(test_seq)

    rev_vocab = {v: k for k, v in vocab.items()}
    print([rev_vocab[i] for i in test_seq])
    for s, p in zip(test_seq, model_pred):
        print("{} -> {}".format(rev_vocab[s], rev_vocab[p]))