def test_load_vocab():
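    # load_vocab returns (words, vocab): words maps index -> token, vocab maps token -> index.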
    words, vocab = util.load_vocab("testdata/test_vocab.txt")
    assert len(words) == 6
    assert len(vocab) == 5
    assert words[1] == "hello"
    assert vocab["hello"] == 1
    assert vocab["HI"] == 5
    assert words[5] == "HI"

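    # An optional second argument caps the vocabulary size.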
    words, vocab = util.load_vocab("testdata/test_vocab.txt", 2)
    assert len(vocab) == 2
    assert "HI" not in vocab.keys()


def test_load_data():
    words, vocab = util.load_vocab("testdata/test_vocab_lower.txt")
    X, Xu = util.load_data("testdata/dir1", vocab, pad=4, lowercase=True)

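    # X is the token-id stream for the files in dir1, padded with index 0;
    # Xu carries the per-token [unknown, capitalized] flags.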
    assert_equal(X, [0, 1, 2, 2, 0, 5, 4, 3, 0, 0, 0, 0, 0])
    assert_equal(Xu, [[1, 1], [0, 0], [0, 1], [0, 0], [1, 0],
                      [0, 1], [0, 0], [0, 0], [1, 0],
                      [0, 0], [0, 0], [0, 0], [0, 0]])


def test_text2seq():
    words, vocab = util.load_vocab("testdata/test_vocab.txt")
    t2s = util.Text2Seq(vocab)
    text = "    Ahoy hello world hey HI 2 1 \n meow"
    tokens, unknown = t2s.toseq(text)

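    # Out-of-vocabulary tokens map to index 0; the first aux channel flags them.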
    assert tokens == [0, 1, 2, 0, 5, 4, 3, 0]
    assert unknown == [[1, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0], [0, 0], [1, 0]]


def test_text2seq_lower():
    words, vocab = util.load_vocab("testdata/test_vocab_lower.txt")
    t2s = util.Text2Seq(vocab, vocab_is_lowercase=True)
    text = "    Ahoy hello World world hey HI 2 1 \n meow"
    tokens, aux = t2s.toseq(text)

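    # With a lowercased vocabulary, the second aux channel records the original capitalization.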
    assert_equal(tokens, [0, 1, 2, 2, 0, 5, 4, 3, 0])
    assert_equal(aux, [[1, 1], [0, 0], [0, 1], [0, 0], [1, 0],
                       [0, 1], [0, 0], [0, 0], [1, 0]])


def test_load_embeddings():
    words, vocab = util.load_vocab("testdata/test_vocab.txt")
    em = util.load_embeddings(vocab, 3, "testdata/test_embedding.txt")

    assert_equal(em, array([[0.0, 0.0, 0.0],
                            [4.1, 4.2, 4.3],
                            [5.1, -5.2, 5.3],
                            [-2.1, 2.2, -2.3],
                            [-3.1, 3.2, 3.333],
                            [10.0, 20.0, 30.0]], dtype="float32"))


def sample(**kwargs):
    p = Box(kwargs)
    cols = 80
    info("Loading vocabulary")
    words, vocab = util.load_vocab(p.vocab_file, p.vocab_size)

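    # Tokenize the seed text and left-pad both the token ids and the aux flags to seqlen.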
    t2s = util.Text2Seq(vocab, vocab_is_lowercase=p.vocab_is_lowercase)
    X, Xu = t2s.toseq(p.init_text)
    gen = util.padleft(X, p.seqlen).tolist()
    genu = util.padleft(Xu, p.seqlen).tolist()

    model_train = keras.models.load_model(p.model_file)
    model = make_predict_model(model_train)

    info("=" * 100)
    s = ""
    for i, idx in enumerate(gen):
        word = "<UNK>"
        if genu[i][0] < 0.1:
            word = words[idx]
        if genu[i][1] > 0.9:
            word = util.capitalize(word)
        s += word + " "
    info(textwrap.fill(s, cols))

    print()
    info("=" * 100)
    UNK_IDX = len(words)

    punct = ":-;.,!?'\")"
    punct2 = "-/'(\""

    prev_word = words[gen[-1]]
    word = ""

    chars = 0  # chars printed out on this line so far
    tX = np.zeros((1, p.seqlen), dtype="int32")
    tXu = np.zeros((1, p.seqlen, 2), dtype="float32")
    results = []

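    # Autoregressive sampling: feed the last seqlen tokens and aux flags, draw the next
    # word id from the predicted distribution, and append it to the running sequence.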
    for j in range(p.num_words_to_sample):
        tX[0] = np.array(gen[-p.seqlen:], "int32")
        tXu[0] = np.array(genu[-p.seqlen:], "float32")
        z = model.predict([tX, tXu])
        scores, aux = z[0][:-2], z[0][-2:]
        idx = UNK_IDX
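        # Redraw until the sample is a real vocabulary word (skip padding index 0 and UNK).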
        while idx == UNK_IDX or idx == 0:
            idx = np.random.choice(range(len(vocab) + 2), p=scores)

        gen.append(idx)
        genu.append([0.0, aux[1]])
        word = words[idx]
        if aux[1] > 0.5:
            word = util.capitalize(word)
        results.append(word)

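        # Crude console word wrap: break the line when the next word will not fit, and
        # drop the space before closing punctuation or after an opening one.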
        if cols - chars < len(word) + 1:
            sys.stdout.write("\n")
            chars = 0
        if punct.find(word) < 0 and punct2.find(prev_word) < 0:
            sys.stdout.write(" ")
            chars += 1
        sys.stdout.write(word)
        chars += len(word)
        sys.stdout.flush()

        prev_word = word

    print()


def train(vocab_file, vocab_is_lowercase, glove_file, glove_dims,
          training_data_dir, training_max_files, checkpoint_dir,
          starting_model_file, seqlen, vocab_size, lstm_size, lstm_layers,
          dense_size, dense_layers, dropout_rate, sample_size,
          learning_rate_initial, learning_rate_decay_rate,
          learning_rate_decay_period, learning_rate_floor, batch_size,
          num_epochs, starting_epoch, epochs_per_dataset, cache_dir):
    if cache_dir:
        pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

    info("Loading vocabulary from ", vocab_file)
    words, vocab = util.load_vocab(vocab_file, vocab_size)
    info("Loaded", len(vocab), "words")

    if starting_model_file:
        info("Loading model from ", starting_model_file)
        model = keras.models.load_model(starting_model_file)
    else:
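        # Build the GloVe embedding matrix, using a pickle cache in cache_dir when available.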
        emb_matrix_cache_path = (os.path.join(cache_dir, "emb_matrix")
                                 if cache_dir else None)
        if cache_dir and os.path.exists(emb_matrix_cache_path):
            info("Loading embedding matrix from cache")
            with FileIO(emb_matrix_cache_path, "rb") as f:
                emb_matrix = pickle.Unpickler(f).load()
        else:
            info("Loading embedding matrix")
            emb_matrix = util.load_embeddings(vocab, glove_dims, glove_file)
            if cache_dir:
                info("Writing embedding matrix cache")
                with FileIO(emb_matrix_cache_path, "wb") as f:
                    pickle.Pickler(
                        f, protocol=pickle.HIGHEST_PROTOCOL).dump(emb_matrix)
        info("Creating model")
        model = make_model(emb_matrix=emb_matrix,
                           vocab=vocab,
                           seqlen=seqlen,
                           sample_size=sample_size,
                           lstm_size=lstm_size,
                           dense_size=dense_size,
                           dense_layers=dense_layers,
                           lstm_layers=lstm_layers,
                           dropout_rate=dropout_rate)

    training_data_cache_path = (os.path.join(cache_dir, "training_data")
                                if cache_dir else None)
    if cache_dir and os.path.exists(training_data_cache_path):
        info("Loading training data from cache:", training_data_cache_path)
        with FileIO(training_data_cache_path, "rb") as f:
            unpickler = pickle.Unpickler(f)
            X, Xu = unpickler.load()
    else:
        info("Loading training data from", training_data_dir)
        X, Xu = util.load_data(training_data_dir,
                               vocab,
                               pad=seqlen,
                               numfiles=training_max_files,
                               lowercase=vocab_is_lowercase)
        if cache_dir:
            info("Saving prepared training data to cache:",
                 training_data_cache_path)
            with FileIO(training_data_cache_path, "wb") as f:
                pickler = pickle.Pickler(f, pickle.HIGHEST_PROTOCOL)
                pickler.dump([X, Xu])

    info("Unknown words in training data: %.2f%%" %
         util.unknown_word_percentage(Xu))

    checkpoint_filepath = "weights.lstm%d.batch%d.glove%d.sample%d.vocab%d.%s.hdf5" % (
        lstm_size, batch_size, glove_dims, sample_size, vocab_size, "default")
    checkpoint_filepath = os.path.join(checkpoint_dir, checkpoint_filepath)
    info("Will write checkpoints to:", checkpoint_filepath)
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_filepath,
                                                 verbose=2,
                                                 save_best_only=True,
                                                 monitor='loss')

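    # Step decay: multiply the initial rate by decay_rate every decay_period epochs,
    # never dropping below learning_rate_floor.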
    def decay(epoch):
        lr = learning_rate_initial * math.pow(
            learning_rate_decay_rate,
            math.floor(epoch / learning_rate_decay_period))
        lr = max(lr, learning_rate_floor)
        return lr

    decay_scheduler = LearningRateScheduler(decay, verbose=1)

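    # lr=0.0 is a placeholder; the LearningRateScheduler callback sets the real rate each epoch.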
    optimizer = keras.optimizers.Adam(lr=0.0)
    model.compile(optimizer,
                  loss='categorical_crossentropy',
                  metrics=["accuracy"])

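    # Batches come from a negative-sampling, permuted Sequence over (X, Xu);
    # vocab_size is len(vocab) + 1 so the reserved index 0 gets a slot.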
    train_seq = util.NegativeSamplingPermutedSequence(data_x=X,
                                                      data_xu=Xu,
                                                      seqlen=seqlen,
                                                      batch_size=batch_size,
                                                      sample_size=sample_size,
                                                      vocab_size=len(vocab) + 1)
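    # Each Keras epoch covers roughly 1/epochs_per_dataset of the data, so one full
    # pass over the dataset spans epochs_per_dataset epochs.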
    steps_per_epoch = int(
        math.floor(len(X) / (batch_size * epochs_per_dataset)))

    model.fit_generator(train_seq,
                        steps_per_epoch=steps_per_epoch,
                        epochs=num_epochs,
                        callbacks=[checkpoint, decay_scheduler],
                        initial_epoch=starting_epoch,
                        verbose=1,
                        use_multiprocessing=False,
                        workers=8,
                        max_queue_size=64)


def train(vocab_file, vocab_is_lowercase, glove_file, glove_dims,
          training_data_dir, training_max_files, checkpoint_dir,
          starting_model_file, seqlen, vocab_size, lstm_size, lstm_layers,
          dense_size, dense_layers, dropout_rate, sample_size,
          learning_rate_initial, learning_rate_decay_rate,
          learning_rate_decay_period, learning_rate_floor, batch_size,
          num_epochs, starting_epoch, epochs_per_dataset, cache_dir):
    if cache_dir:
        pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

    info("Loading vocabulary from ", vocab_file)
    words, vocab = util.load_vocab(vocab_file, vocab_size)
    info("Loaded", len(vocab), "words")

    if starting_model_file:
        info("Loading model from ", starting_model_file)
        model = keras.models.load_model(starting_model_file)
    else:
        emb_matrix = load_embedding_matrix(cache_dir, glove_dims, glove_file,
                                           vocab)
        info("Creating model")
        model = make_model(emb_matrix=emb_matrix,
                           vocab=vocab,
                           seqlen=seqlen,
                           sample_size=sample_size,
                           lstm_size=lstm_size,
                           dense_size=dense_size,
                           dense_layers=dense_layers,
                           lstm_layers=lstm_layers,
                           dropout_rate=dropout_rate)

    X, Xu = load_training_data(cache_dir, seqlen, training_data_dir,
                               training_max_files, vocab, vocab_is_lowercase)

    checkpoint_filepath = "weights.lstm%d.batch%d.glove%d.sample%d.vocab%d.%s.hdf5" % (
        lstm_size, batch_size, glove_dims, sample_size, vocab_size, "default")
    checkpoint_filepath = os.path.join(checkpoint_dir, checkpoint_filepath)
    info("Will write checkpoints to:", checkpoint_filepath)
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_filepath,
                                                 verbose=2,
                                                 save_best_only=True,
                                                 monitor='loss')

    def decay(epoch):
        lr = learning_rate_initial * math.pow(
            learning_rate_decay_rate,
            math.floor(epoch / learning_rate_decay_period))
        lr = max(lr, learning_rate_floor)
        return lr

    decay_scheduler = LearningRateScheduler(decay, verbose=1)
    optimizer = keras.optimizers.Adam(lr=0.0)
    model.compile(optimizer,
                  loss='categorical_crossentropy',
                  metrics=["accuracy"])
    train_seq = util.NegativeSamplingPermutedSequence(data_x=X,
                                                      data_xu=Xu,
                                                      seqlen=seqlen,
                                                      batch_size=batch_size,
                                                      sample_size=sample_size,
                                                      vocab_size=len(vocab) + 1)
    steps_per_epoch = int(
        math.floor(len(X) / (batch_size * epochs_per_dataset)))

    model.fit_generator(train_seq,
                        steps_per_epoch=steps_per_epoch,
                        epochs=num_epochs,
                        callbacks=[checkpoint, decay_scheduler],
                        initial_epoch=starting_epoch,
                        verbose=1,
                        use_multiprocessing=False,
                        workers=8,
                        max_queue_size=64)