def main(in_trainset_file, in_devset_file, in_testset_file, in_model_folder, in_config_file, in_custom_vocab):
    """Train a CompatibleRNNAutoencoder on plain-text corpora.

    Reads the JSON training config, builds the vocabulary (either from a
    user-supplied vocab file or from the training utterances), converts all
    three corpora into encoder-input / decoder-output pairs, and runs
    training with checkpoints written to ``in_model_folder``.
    """
    with open(in_config_file) as config_in:
        config = json.load(config_in)

    train_utterances = load_txt(in_trainset_file)
    dev_utterances = load_txt(in_devset_file)
    test_utterances = load_txt(in_testset_file)

    if in_custom_vocab is not None:
        # Pre-built vocabulary: one token per line, line number == token id.
        with open(in_custom_vocab) as vocab_in:
            rev_vocab = [line.rstrip() for line in vocab_in]
        vocab = {token: index for index, token in enumerate(rev_vocab)}
    else:
        vocab, rev_vocab = make_vocabulary(
            train_utterances,
            config['max_vocabulary_size'],
            special_tokens=[PAD, START, UNK, EOS])
    config['vocabulary_size'] = len(vocab)

    max_len = config['max_sequence_length']
    train_enc_inp, _, train_dec_out, _ = make_variational_autoencoder_dataset(
        train_utterances, vocab, max_len)
    dev_enc_inp, _, dev_dec_out, _ = make_variational_autoencoder_dataset(
        dev_utterances, vocab, max_len)
    # NOTE(review): the testset is vectorized here but never used below —
    # confirm whether a final evaluation step was intended.
    test_enc_inp, _, test_dec_out, _ = make_variational_autoencoder_dataset(
        test_utterances, vocab, max_len)

    with tf.Session() as sess:
        ae = CompatibleRNNAutoencoder(config, rev_vocab)
        sess.run(tf.global_variables_initializer())
        train(sess,
              ae,
              (train_enc_inp, train_dec_out),
              (dev_enc_inp, dev_dec_out),
              in_model_folder,
              **config)
def main(in_trainset_file, in_devset_file, in_testset_file, in_config, in_model_folder):
    """Train a VRAE on plain-text corpora.

    Builds a unigram vocabulary from the training utterances, converts the
    three corpora into variational-autoencoder datasets, and trains the
    model, saving into ``in_model_folder``.

    Args:
        in_trainset_file / in_devset_file / in_testset_file: paths to
            plain-text corpora readable by ``load_txt``.
        in_config: mutable config mapping; ``'vocabulary_size'`` is written
            back into it.
        in_model_folder: checkpoint/output directory passed to ``train``.
    """
    train_utterances, dev_utterances, test_utterances = load_txt(
        in_trainset_file), load_txt(in_devset_file), load_txt(in_testset_file)
    vocab, rev_vocab = make_vocabulary(train_utterances,
                                       in_config['max_vocabulary_size'],
                                       frequency_threshold=0,
                                       ngram_sizes=(1, ))
    # BUG FIX: the original body referenced an undefined name `config`
    # (NameError at runtime); the parameter is called `in_config`.
    in_config['vocabulary_size'] = len(vocab)
    train_X = make_variational_autoencoder_dataset(
        train_utterances, vocab, in_config['max_sequence_length'])
    dev_X = make_variational_autoencoder_dataset(
        dev_utterances, vocab, in_config['max_sequence_length'])
    # NOTE(review): test_X is built but never used — confirm whether a final
    # evaluation on the testset was intended here.
    test_X = make_variational_autoencoder_dataset(
        test_utterances, vocab, in_config['max_sequence_length'])
    # save_model(vocab, in_config, in_model_folder)
    with tf.Session() as sess:
        model = VRAE(in_config, rev_vocab, sess, standalone=True)
        train(model, train_X, dev_X, in_config, in_model_folder)
def main(in_trainset_file, in_devset_file, in_testset_file, in_model_folder, in_config_file):
    """Train an RNNVAE on plain-text corpora.

    Loads the JSON config, derives the vocabulary from the training
    utterances, vectorizes all three corpora, and runs training with
    checkpoints written to ``in_model_folder``.
    """
    with open(in_config_file) as config_in:
        config = json.load(config_in)

    corpora = [load_txt(path)
               for path in (in_trainset_file, in_devset_file, in_testset_file)]
    train_utterances, dev_utterances, test_utterances = corpora

    vocab, rev_vocab = make_vocabulary(train_utterances,
                                       config['max_vocabulary_size'])
    config['vocabulary_size'] = len(vocab)

    max_len = config['max_sequence_length']
    train_data = make_autoencoder_dataset(train_utterances, vocab, max_len)
    dev_data = make_autoencoder_dataset(dev_utterances, vocab, max_len)
    # NOTE(review): test_data is built but never consumed — confirm whether
    # a testset evaluation was intended.
    test_data = make_autoencoder_dataset(test_utterances, vocab, max_len)

    with tf.Session() as sess:
        ae = RNNVAE(config, rev_vocab)
        sess.run(tf.global_variables_initializer())
        train(sess, ae, train_data, dev_data, in_model_folder, **config)
def main(in_model_folder, in_devset_file, in_evalset_file, in_decision_type):
    """Tune and evaluate an autoencoder-based OOD detector.

    Restores a CompatibleRNNAutoencoder from ``in_model_folder``, tunes the
    detector's decision threshold on the devset, and prints accuracy on the
    labeled evalset (a JSON file with ``utterance`` and ``label`` columns).
    """
    dev_utterances = load_txt(in_devset_file)
    evalset = pd.read_json(in_evalset_file)
    # NOTE(review): this variant lowercases utterances while the sibling
    # eval scripts do not — confirm the intended casing convention.
    eval_utterances = [utt.lower().split() for utt in evalset.utterance]

    with tf.Session() as sess:
        ae = CompatibleRNNAutoencoder.load(in_model_folder, sess)
        rev_vocab, config = ae.vocab, ae.config
        vocab = {token: index for index, token in enumerate(rev_vocab)}

        max_len = config['max_sequence_length']
        dev_enc_inp, _, dev_dec_out, _ = make_variational_autoencoder_dataset(
            dev_utterances, vocab, max_len)
        eval_enc_inp, _, eval_dec_out, _ = make_variational_autoencoder_dataset(
            eval_utterances, vocab, max_len)

        ae_ood = AEOODDetector(ae)
        ae_ood.tune_threshold((dev_enc_inp, dev_dec_out), sess, in_decision_type)
        accuracy = evaluate(sess, ae_ood,
                            (eval_enc_inp, eval_dec_out, evalset.label))
        print('Detector accuracy on the evalset: {:.3f}'.format(accuracy))
def main(in_model_folder, in_devset_file, in_evalset_file, in_decision_type):
    """Tune and evaluate a VAE-based OOD detector.

    Restores an RNNVAE from ``in_model_folder``, tunes the detector's
    decision threshold on the devset, and prints accuracy on the labeled
    evalset (a JSON file with ``utterance`` and ``label`` columns).
    """
    dev_utterances = load_txt(in_devset_file)
    evalset = pd.read_json(in_evalset_file)
    eval_utterances = [utt.split() for utt in evalset.utterance]

    with tf.Session() as sess:
        vae = RNNVAE.load(in_model_folder, sess)
        rev_vocab, config = vae.vocab, vae.config
        vocab = {token: index for index, token in enumerate(rev_vocab)}

        max_len = config['max_sequence_length']
        dev_data = make_autoencoder_dataset(dev_utterances, vocab, max_len)
        eval_data = make_autoencoder_dataset(eval_utterances, vocab, max_len)

        vae_ood = VAEOODDetector(vae)
        vae_ood.tune_threshold(dev_data, sess, in_decision_type)
        accuracy = evaluate(sess, vae_ood, (eval_data, evalset.label))
        print('Detector accuracy on the evalset: {:.3f}'.format(accuracy))
def main(in_model_folder, in_devset_file, in_testset_file, in_decision_type):
    """Run a tuned AE-based OOD detector and print per-utterance results.

    Restores an RNNAutoencoder from ``in_model_folder``, tunes the
    detector's decision threshold on the devset, then prints the threshold
    followed by a TSV of (utterance, reconstruction loss, prediction) for
    every testset utterance.
    """
    dev_utterances = load_txt(in_devset_file)
    testset = pd.read_json(in_testset_file)
    test_utterances = [utt.split() for utt in testset.utterance]

    with tf.Session() as sess:
        ae = RNNAutoencoder.load(in_model_folder, sess)
        rev_vocab, config = ae.vocab, ae.config
        vocab = {token: index for index, token in enumerate(rev_vocab)}

        max_len = config['max_sequence_length']
        dev_X, dev_masks = make_dataset(dev_utterances, vocab, max_len)
        test_X, test_masks = make_dataset(test_utterances, vocab, max_len)

        ae_ood = AEOODDetector(ae)
        ae_ood.tune_threshold((dev_X, dev_masks), sess, in_decision_type)
        print('Decision threshold: {:.3f}'.format(ae_ood.threshold))

        print('Utterance\tloss\tprediction')
        losses, predictions = predict(sess, ae_ood,
                                      (test_X, test_masks, testset.label))
        for utterance, loss, prediction in zip(test_utterances, losses, predictions):
            print('{}\t{:.3f}\t{}'.format(' '.join(utterance), loss, prediction))