def test_load_vocab():
    # Six word slots but only five vocabulary entries: index 0 appears to be
    # reserved (e.g. for padding / out-of-vocabulary tokens).
    words, vocab = util.load_vocab("testdata/test_vocab.txt")
    assert len(words) == 6
    assert len(vocab) == 5
    assert words[1] == "hello"
    assert vocab["hello"] == 1
    assert vocab["HI"] == 5
    assert words[5] == "HI"

    # The optional second argument caps the vocabulary size.
    words, vocab = util.load_vocab("testdata/test_vocab.txt", 2)
    assert len(vocab) == 2
    assert "HI" not in vocab


def test_load_data():
    words, vocab = util.load_vocab("testdata/test_vocab_lower.txt")
    X, Xu = util.load_data("testdata/dir1", vocab, pad=4, lowercase=True)
    assert_equal(X, [0, 1, 2, 2, 0, 5, 4, 3, 0, 0, 0, 0, 0])
    # Xu carries two auxiliary flags per token: [unknown-word, capitalized].
    assert_equal(Xu, [[1, 1], [0, 0], [0, 1], [0, 0], [1, 0], [0, 1], [0, 0],
                      [0, 0], [1, 0], [0, 0], [0, 0], [0, 0], [0, 0]])


def test_text2seq():
    words, vocab = util.load_vocab("testdata/test_vocab.txt")
    t2s = util.Text2Seq(vocab)
    text = " Ahoy hello world hey HI 2 1 \n meow"
    tokens, unknown = t2s.toseq(text)
    # Out-of-vocabulary tokens ("Ahoy", "hey", "meow") map to index 0 and set
    # the unknown-word flag.
    assert tokens == [0, 1, 2, 0, 5, 4, 3, 0]
    assert unknown == [[1, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0], [0, 0],
                       [1, 0]]


def test_text2seq_lower():
    words, vocab = util.load_vocab("testdata/test_vocab_lower.txt")
    t2s = util.Text2Seq(vocab, vocab_is_lowercase=True)
    text = " Ahoy hello World world hey HI 2 1 \n meow"
    tokens, aux = t2s.toseq(text)
    # With a lowercase vocabulary, "World" and "world" map to the same index;
    # the second aux flag records the original capitalization.
    assert_equal(tokens, [0, 1, 2, 2, 0, 5, 4, 3, 0])
    assert_equal(aux, [[1, 1], [0, 0], [0, 1], [0, 0], [1, 0], [0, 1], [0, 0],
                       [0, 0], [1, 0]])


def test_load_embeddings():
    words, vocab = util.load_vocab("testdata/test_vocab.txt")
    em = util.load_embeddings(vocab, 3, "testdata/test_embedding.txt")
    # Row 0 (the reserved index) stays all zeros; the remaining rows hold the
    # 3-dimensional vectors for the five vocabulary words.
    assert_equal(em, array([[0.0, 0.0, 0.0],
                            [4.1, 4.2, 4.3],
                            [5.1, -5.2, 5.3],
                            [-2.1, 2.2, -2.3],
                            [-3.1, 3.2, 3.333],
                            [10.0, 20.0, 30.0]], dtype="float32"))


def sample(**kwargs):
    p = Box(kwargs)
    cols = 80
    info("Loading vocabulary")
    words, vocab = util.load_vocab(p.vocab_file, p.vocab_size)
    t2s = Text2Seq(vocab, vocab_is_lowercase=p.vocab_is_lowercase)
    X, Xu = t2s.toseq(p.init_text)
    gen = util.padleft(X, p.seqlen).tolist()
    genu = util.padleft(Xu, p.seqlen).tolist()
    model_train = keras.models.load_model(p.model_file)
    model = make_predict_model(model_train)

    # Echo the seed text, restoring capitalization and marking unknown words.
    info("=" * 100)
    s = ""
    for i, idx in enumerate(gen):
        word = "<UNK>"
        if genu[i][0] < 0.1:
            word = words[idx]
        if genu[i][1] > 0.9:
            word = util.capitalize(word)
        s += word + " "
    info(textwrap.fill(s, 80))
    print()
    info("=" * 100)

    UNK_IDX = len(words)
    punct = ":-;.,!?'\")"
    punct2 = "-/'(\""
    prev_word = words[gen[-1]]
    word = ""
    chars = 0  # chars printed out on this line so far
    tX = np.zeros((1, p.seqlen), dtype="int32")
    tXu = np.zeros((1, p.seqlen, 2), dtype="float32")
    results = []
    for j in range(p.num_words_to_sample):
        # Feed the last seqlen tokens (and their aux flags) back into the model.
        tX[0] = np.array(gen[-p.seqlen:], "int32")
        tXu[0] = np.array(genu[-p.seqlen:], "float32")
        z = model.predict([tX, tXu])
        scores, aux = z[0][:-2], z[0][-2:]
        # Resample until we draw something other than padding or <UNK>.
        idx = UNK_IDX
        while idx == UNK_IDX or idx == 0:
            idx = np.random.choice(range(len(vocab) + 2), p=scores)
        gen.append(idx)
        genu.append([0.0, aux[1]])
        word = words[idx]
        if aux[1] > 0.5:
            word = capitalize(word)
        results.append(word)
        # Word-wrap the output at `cols` characters, skipping the space before
        # closing punctuation and after opening punctuation.
        if cols - chars < len(word) + 1:
            sys.stdout.write("\n")
            chars = 0
        if punct.find(word) < 0 and punct2.find(prev_word) < 0:
            sys.stdout.write(" ")
            chars += 1
        sys.stdout.write(word)
        chars += len(word)
        sys.stdout.flush()
        prev_word = word
    print()


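# A minimal sketch of how sample() might be invoked; every path and parameter
# value below is a hypothetical placeholder, not taken from the repository.
if __name__ == "__main__":
    sample(vocab_file="data/vocab.txt",           # assumed path
           vocab_size=50000,                      # assumed value
           vocab_is_lowercase=True,
           model_file="checkpoints/model.hdf5",   # assumed path
           init_text="Once upon a time",
           seqlen=64,
           num_words_to_sample=200)

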
def train(vocab_file, vocab_is_lowercase, glove_file, glove_dims,
          training_data_dir, training_max_files, checkpoint_dir,
          starting_model_file, seqlen, vocab_size, lstm_size, lstm_layers,
          dense_size, dense_layers, dropout_rate, sample_size,
          learning_rate_initial, learning_rate_decay_rate,
          learning_rate_decay_period, learning_rate_floor, batch_size,
          num_epochs, starting_epoch, epochs_per_dataset, cache_dir):
    if cache_dir:
        pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

    info("Loading vocabulary from", vocab_file)
    words, vocab = util.load_vocab(vocab_file, vocab_size)
    info("Loaded", len(vocab), "words")

    if starting_model_file:
        info("Loading model from", starting_model_file)
        model = keras.models.load_model(starting_model_file)
    else:
        # Build the embedding matrix, using the on-disk cache when available.
        emb_matrix_cache_path = os.path.join(cache_dir, "emb_matrix")
        if cache_dir and os.path.exists(emb_matrix_cache_path):
            info("Loading embedding matrix from cache")
            with FileIO(emb_matrix_cache_path, "rb") as f:
                emb_matrix = pickle.Unpickler(f).load()
        else:
            info("Loading embedding matrix")
            emb_matrix = util.load_embeddings(vocab, glove_dims, glove_file)
            if cache_dir:
                info("Writing embedding matrix cache")
                with FileIO(emb_matrix_cache_path, "wb") as f:
                    pickle.Pickler(
                        f, protocol=pickle.HIGHEST_PROTOCOL).dump(emb_matrix)
        info("Creating model")
        model = make_model(emb_matrix=emb_matrix, vocab=vocab, seqlen=seqlen,
                           sample_size=sample_size, lstm_size=lstm_size,
                           dense_size=dense_size, dense_layers=dense_layers,
                           lstm_layers=lstm_layers, dropout_rate=dropout_rate)

    # Load the tokenized training data, again using the cache when available.
    training_data_cache_path = os.path.join(cache_dir, "training_data")
    if cache_dir and os.path.exists(training_data_cache_path):
        info("Loading training data from cache:", training_data_cache_path)
        with FileIO(training_data_cache_path, "rb") as f:
            unpickler = pickle.Unpickler(f)
            X, Xu = unpickler.load()
    else:
        info("Loading training data from", training_data_dir)
        X, Xu = util.load_data(training_data_dir, vocab, pad=seqlen,
                               numfiles=training_max_files,
                               lowercase=vocab_is_lowercase)
        if cache_dir:
            info("Saving prepared training data to cache:",
                 training_data_cache_path)
            with FileIO(training_data_cache_path, "wb") as f:
                pickler = pickle.Pickler(f, pickle.HIGHEST_PROTOCOL)
                pickler.dump([X, Xu])
    info("Unknown words in training data: %.2f%%" %
         util.unknown_word_percentage(Xu))

    checkpoint_filepath = "weights.lstm%d.batch%d.glove%d.sample%d.vocab%d.%s.hdf5" % (
        lstm_size, batch_size, glove_dims, sample_size, vocab_size, "default")
    checkpoint_filepath = os.path.join(checkpoint_dir, checkpoint_filepath)
    info("Will write checkpoints to:", checkpoint_filepath)
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_filepath,
                                                 verbose=2,
                                                 save_best_only=True,
                                                 monitor='loss')

    def decay(epoch):
        # Step decay: multiply by decay_rate every decay_period epochs,
        # but never drop below the configured floor.
        lr = learning_rate_initial * math.pow(
            learning_rate_decay_rate,
            math.floor(epoch / learning_rate_decay_period))
        lr = max(lr, learning_rate_floor)
        return lr

    decay_scheduler = LearningRateScheduler(decay, verbose=1)

    # The scheduler overrides the learning rate each epoch, so Adam's own
    # initial lr is a placeholder.
    optimizer = keras.optimizers.Adam(lr=0.0)
    model.compile(optimizer, loss='categorical_crossentropy',
                  metrics=["accuracy"])

    train_seq = util.NegativeSamplingPermutedSequence(
        data_x=X, data_xu=Xu, seqlen=seqlen, batch_size=batch_size,
        sample_size=sample_size, vocab_size=len(vocab) + 1)
    steps_per_epoch = int(
        math.floor(len(X) / (batch_size * epochs_per_dataset)))
    model.fit_generator(train_seq,
                        steps_per_epoch=steps_per_epoch,
                        epochs=num_epochs,
                        callbacks=[checkpoint, decay_scheduler],
                        initial_epoch=starting_epoch,
                        verbose=1,
                        use_multiprocessing=False,
                        workers=8,
                        max_queue_size=64)


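# The refactored train() below calls two helpers that are not defined in this
# section. The sketches here are reconstructed from the inline logic of the
# train() above, assuming the same cache layout and util API; the actual
# implementations in the repository may differ in detail.


def load_embedding_matrix(cache_dir, glove_dims, glove_file, vocab):
    # Restore the embedding matrix from the on-disk cache if present,
    # otherwise build it from the GloVe file and cache the result.
    emb_matrix_cache_path = os.path.join(cache_dir, "emb_matrix")
    if cache_dir and os.path.exists(emb_matrix_cache_path):
        info("Loading embedding matrix from cache")
        with FileIO(emb_matrix_cache_path, "rb") as f:
            return pickle.Unpickler(f).load()
    info("Loading embedding matrix")
    emb_matrix = util.load_embeddings(vocab, glove_dims, glove_file)
    if cache_dir:
        info("Writing embedding matrix cache")
        with FileIO(emb_matrix_cache_path, "wb") as f:
            pickle.Pickler(f, protocol=pickle.HIGHEST_PROTOCOL).dump(emb_matrix)
    return emb_matrix


def load_training_data(cache_dir, seqlen, training_data_dir,
                       training_max_files, vocab, vocab_is_lowercase):
    # Same caching pattern for the tokenized training data.
    training_data_cache_path = os.path.join(cache_dir, "training_data")
    if cache_dir and os.path.exists(training_data_cache_path):
        info("Loading training data from cache:", training_data_cache_path)
        with FileIO(training_data_cache_path, "rb") as f:
            X, Xu = pickle.Unpickler(f).load()
    else:
        info("Loading training data from", training_data_dir)
        X, Xu = util.load_data(training_data_dir, vocab, pad=seqlen,
                               numfiles=training_max_files,
                               lowercase=vocab_is_lowercase)
        if cache_dir:
            info("Saving prepared training data to cache:",
                 training_data_cache_path)
            with FileIO(training_data_cache_path, "wb") as f:
                pickle.Pickler(f, pickle.HIGHEST_PROTOCOL).dump([X, Xu])
    info("Unknown words in training data: %.2f%%" %
         util.unknown_word_percentage(Xu))
    return X, Xu

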
def train(vocab_file, vocab_is_lowercase, glove_file, glove_dims,
          training_data_dir, training_max_files, checkpoint_dir,
          starting_model_file, seqlen, vocab_size, lstm_size, lstm_layers,
          dense_size, dense_layers, dropout_rate, sample_size,
          learning_rate_initial, learning_rate_decay_rate,
          learning_rate_decay_period, learning_rate_floor, batch_size,
          num_epochs, starting_epoch, epochs_per_dataset, cache_dir):
    if cache_dir:
        pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

    info("Loading vocabulary from", vocab_file)
    words, vocab = util.load_vocab(vocab_file, vocab_size)
    info("Loaded", len(vocab), "words")

    if starting_model_file:
        info("Loading model from", starting_model_file)
        model = keras.models.load_model(starting_model_file)
    else:
        emb_matrix = load_embedding_matrix(cache_dir, glove_dims, glove_file,
                                           vocab)
        info("Creating model")
        model = make_model(emb_matrix=emb_matrix, vocab=vocab, seqlen=seqlen,
                           sample_size=sample_size, lstm_size=lstm_size,
                           dense_size=dense_size, dense_layers=dense_layers,
                           lstm_layers=lstm_layers, dropout_rate=dropout_rate)

    X, Xu = load_training_data(cache_dir, seqlen, training_data_dir,
                               training_max_files, vocab, vocab_is_lowercase)

    checkpoint_filepath = "weights.lstm%d.batch%d.glove%d.sample%d.vocab%d.%s.hdf5" % (
        lstm_size, batch_size, glove_dims, sample_size, vocab_size, "default")
    checkpoint_filepath = os.path.join(checkpoint_dir, checkpoint_filepath)
    info("Will write checkpoints to:", checkpoint_filepath)
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_filepath,
                                                 verbose=2,
                                                 save_best_only=True,
                                                 monitor='loss')

    def decay(epoch):
        # Step decay with a lower bound, applied via the scheduler below.
        lr = learning_rate_initial * math.pow(
            learning_rate_decay_rate,
            math.floor(epoch / learning_rate_decay_period))
        lr = max(lr, learning_rate_floor)
        return lr

    decay_scheduler = LearningRateScheduler(decay, verbose=1)

    # The scheduler sets the learning rate each epoch, so Adam's own initial
    # lr is a placeholder.
    optimizer = keras.optimizers.Adam(lr=0.0)
    model.compile(optimizer, loss='categorical_crossentropy',
                  metrics=["accuracy"])

    train_seq = util.NegativeSamplingPermutedSequence(
        data_x=X, data_xu=Xu, seqlen=seqlen, batch_size=batch_size,
        sample_size=sample_size, vocab_size=len(vocab) + 1)
    steps_per_epoch = int(
        math.floor(len(X) / (batch_size * epochs_per_dataset)))
    model.fit_generator(train_seq,
                        steps_per_epoch=steps_per_epoch,
                        epochs=num_epochs,
                        callbacks=[checkpoint, decay_scheduler],
                        initial_epoch=starting_epoch,
                        verbose=1,
                        use_multiprocessing=False,
                        workers=8,
                        max_queue_size=64)
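

# A minimal sketch of a training invocation; every path and hyperparameter
# value below is a hypothetical placeholder, not a value from the repository.
if __name__ == "__main__":
    train(vocab_file="data/vocab.txt",            # assumed path
          vocab_is_lowercase=True,
          glove_file="data/glove.6B.100d.txt",    # assumed path
          glove_dims=100,
          training_data_dir="data/corpus",        # assumed path
          training_max_files=1000,
          checkpoint_dir="checkpoints",
          starting_model_file=None,
          seqlen=64, vocab_size=50000,
          lstm_size=512, lstm_layers=2,
          dense_size=512, dense_layers=1,
          dropout_rate=0.2, sample_size=1000,
          learning_rate_initial=1e-3,
          learning_rate_decay_rate=0.7,
          learning_rate_decay_period=10,
          learning_rate_floor=1e-5,
          batch_size=128, num_epochs=100,
          starting_epoch=0, epochs_per_dataset=10,
          cache_dir="cache")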