def main(in_trainset_file, in_devset_file, in_testset_file, in_model_folder,
         in_config_file, in_custom_vocab):
    with open(in_config_file) as config_in:
        config = json.load(config_in)
    train_utterances = load_txt(in_trainset_file)
    dev_utterances = load_txt(in_devset_file)
    test_utterances = load_txt(in_testset_file)
    if in_custom_vocab is not None:
        with open(in_custom_vocab) as vocab_in:
            rev_vocab = [line.rstrip() for line in vocab_in]
        vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    else:
        vocab, rev_vocab = make_vocabulary(
            train_utterances,
            config['max_vocabulary_size'],
            special_tokens=[PAD, START, UNK, EOS])
    config['vocabulary_size'] = len(vocab)

    train_enc_inp, _, train_dec_out, _ = make_variational_autoencoder_dataset(
        train_utterances, vocab, config['max_sequence_length'])
    dev_enc_inp, _, dev_dec_out, _ = make_variational_autoencoder_dataset(
        dev_utterances, vocab, config['max_sequence_length'])
    test_enc_inp, _, test_dec_out, _ = make_variational_autoencoder_dataset(
        test_utterances, vocab, config['max_sequence_length'])

    with tf.Session() as sess:
        ae = CompatibleRNNAutoencoder(config, rev_vocab)
        sess.run(tf.global_variables_initializer())
        train(sess,
              ae,
              (train_enc_inp, train_dec_out),
              (dev_enc_inp, dev_dec_out),
              in_model_folder,
              **config)
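# A minimal config for the script above might look like the following. The key
# names are exactly the ones read in the code; the values are illustrative
# assumptions, not the original settings:
#
# {
#     "max_vocabulary_size": 20000,
#     "max_sequence_length": 30
# }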
def main(in_dataset_folder, in_noisy_dataset_folder, in_custom_vocab_file,
         in_model_folder, in_config):
    with open(in_config, encoding='utf-8') as config_in:
        config = json.load(config_in)
    train_dialogs, train_indices = read_dialogs(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-trn.txt'),
        with_indices=True)
    dev_dialogs, dev_indices = read_dialogs(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-dev.txt'),
        with_indices=True)
    test_dialogs, test_indices = read_dialogs(
        os.path.join(in_noisy_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
        with_indices=True)
    kb = make_augmented_knowledge_base(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-kb.txt'),
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-candidates.txt'))
    max_noisy_dialog_length = max(
        [item['end'] - item['start'] + 1 for item in test_indices])
    config['max_input_length'] = max_noisy_dialog_length
    post_ood_turns_clean, post_ood_turns_noisy = mark_post_ood_turns(
        test_dialogs, BABI_CONFIG['backoff_utterance'].lower())
    et = EntityTracker(kb)
    at = ActionTracker(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-candidates.txt'),
        et)
    if in_custom_vocab_file is not None:
        with open(in_custom_vocab_file) as vocab_in:
            rev_vocab = [line.rstrip() for line in vocab_in]
        vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    else:
        utterances_tokenized = []
        for dialog in train_dialogs:
            utterances_tokenized += [utterance.split() for utterance in dialog]
        vocab, rev_vocab = make_vocabulary(
            utterances_tokenized,
            config['max_vocabulary_size'],
            special_tokens=[PAD, START, UNK, EOS] + list(kb.keys()))
    config['vocabulary_size'] = len(vocab)
    data_train = make_dataset_for_hierarchical_hcn(
        train_dialogs, train_indices, vocab, et, at, **config)
    data_dev = make_dataset_for_hierarchical_hcn(
        dev_dialogs, dev_indices, vocab, et, at, **config)
    data_test = make_dataset_for_hierarchical_hcn(
        test_dialogs, test_indices, vocab, et, at, **config)
    random_input = generate_dropout_turns_for_hierarchical_hcn(
        10000,
        config['max_sequence_length'],
        [dialog[0] for dialog in train_dialogs],
        vocab,
        config['turn_word_dropout_prob'])
    save_model(rev_vocab, config, kb, at.action_templates, in_model_folder)
    trainer = Trainer(data_train, data_dev, data_test, at.action_templates,
                      random_input, post_ood_turns_noisy, config, vocab,
                      in_model_folder)
    trainer.train()
def main(in_dataset_folder, in_custom_vocab_file, in_model_folder, in_config):
    with open(in_config, encoding='utf-8') as config_in:
        config = json.load(config_in)
    train_dialogs, train_indices = read_dialogs(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-trn.txt'),
        with_indices=True)
    dev_dialogs, dev_indices = read_dialogs(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-dev.txt'),
        with_indices=True)
    test_dialogs, test_indices = read_dialogs(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
        with_indices=True)
    kb = make_augmented_knowledge_base(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-kb.txt'),
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-candidates.txt'))
    et = EntityTracker(kb)
    at = ActionTracker(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-candidates.txt'),
        et)
    if in_custom_vocab_file is not None:
        with open(in_custom_vocab_file) as vocab_in:
            rev_vocab = [line.rstrip() for line in vocab_in]
        vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    else:
        utterances_tokenized = []
        for dialog in train_dialogs:
            utterances_tokenized += [utterance.split() for utterance in dialog]
        vocab, rev_vocab = make_vocabulary(
            utterances_tokenized,
            config['max_vocabulary_size'],
            special_tokens=[PAD, START, UNK, EOS] + list(kb.keys()))
    config['vocabulary_size'] = len(vocab)
    data_train = make_dataset_for_variational_hcn(
        train_dialogs, train_indices, vocab, et, at, **config)
    data_dev = make_dataset_for_variational_hcn(
        dev_dialogs, dev_indices, vocab, et, at, **config)
    data_test = make_dataset_for_variational_hcn(
        test_dialogs, test_indices, vocab, et, at, **config)
    random_input = generate_random_input_for_variational_hcn(
        10000, config['max_sequence_length'], vocab, rev_vocab)
    save_model(rev_vocab, config, kb, at.action_templates, in_model_folder)
    trainer = Trainer(data_train, data_dev, data_test, at.action_templates,
                      random_input, rev_vocab, config, in_model_folder)
    trainer.train()
def main(in_dataset_folder, in_result_file, in_config):
    with open(in_config, encoding='utf-8') as config_in:
        config = json.load(config_in)
    # *_ood.json files contain both IND and OOD words
    train_dialogs, train_indices = read_dialogs(
        os.path.join(in_dataset_folder, 'train_ood.json'))
    dev_dialogs, dev_indices = read_dialogs(
        os.path.join(in_dataset_folder, 'dev_ood.json'))
    test_dialogs, test_indices = read_dialogs(
        os.path.join(in_dataset_folder, 'test_ood.json'))
    utterances_tokenized = []
    for input_utterance, action_id in train_dialogs + dev_dialogs + test_dialogs:
        utterances_tokenized.append(input_utterance.split())
    print(utterances_tokenized[:10])  # quick sanity check of the tokenization
    vocab, rev_vocab = make_vocabulary(
        utterances_tokenized,
        config['max_vocabulary_size'],
        special_tokens=[PAD, START, UNK, EOS])
    # write one token per line; the line number doubles as the token id
    with open(in_result_file, 'w') as vocab_out:
        for word in rev_vocab:
            print(word, file=vocab_out)
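# The file produced above is what the custom-vocab branches of the training
# scripts read back in: one token per line, with the line number serving as
# the token id. Assuming the special-token constants render as the usual
# "<PAD>"/"<START>"/"<UNK>"/"<EOS>" strings (their actual values live
# elsewhere in the repo), the head of the file would look something like:
#
#   <PAD>
#   <START>
#   <UNK>
#   <EOS>
#   the
#   i
#   ...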
def main(in_trainset_file, in_devset_file, in_testset_file, in_config,
         in_model_folder):
    # unlike the other scripts, this one receives the already-loaded config
    # dict; alias it so the body reads uniformly (the original mixed
    # `in_config` and an undefined `config`)
    config = in_config
    train_utterances = load_txt(in_trainset_file)
    dev_utterances = load_txt(in_devset_file)
    test_utterances = load_txt(in_testset_file)
    vocab, rev_vocab = make_vocabulary(train_utterances,
                                       config['max_vocabulary_size'],
                                       frequency_threshold=0,
                                       ngram_sizes=(1,))
    config['vocabulary_size'] = len(vocab)
    train_X = make_variational_autoencoder_dataset(
        train_utterances, vocab, config['max_sequence_length'])
    dev_X = make_variational_autoencoder_dataset(
        dev_utterances, vocab, config['max_sequence_length'])
    test_X = make_variational_autoencoder_dataset(
        test_utterances, vocab, config['max_sequence_length'])
    # save_model(vocab, config, in_model_folder)
    with tf.Session() as sess:
        model = VRAE(config, rev_vocab, sess, standalone=True)
        train(model, train_X, dev_X, config, in_model_folder)
def main(in_trainset_file, in_devset_file, in_testset_file, in_model_folder,
         in_config_file):
    with open(in_config_file) as config_in:
        config = json.load(config_in)
    train_utterances = load_txt(in_trainset_file)
    dev_utterances = load_txt(in_devset_file)
    test_utterances = load_txt(in_testset_file)
    vocab, rev_vocab = make_vocabulary(train_utterances,
                                       config['max_vocabulary_size'])
    config['vocabulary_size'] = len(vocab)
    train_data = make_autoencoder_dataset(train_utterances, vocab,
                                          config['max_sequence_length'])
    dev_data = make_autoencoder_dataset(dev_utterances, vocab,
                                        config['max_sequence_length'])
    test_data = make_autoencoder_dataset(test_utterances, vocab,
                                         config['max_sequence_length'])
    with tf.Session() as sess:
        ae = RNNVAE(config, rev_vocab)
        sess.run(tf.global_variables_initializer())
        train(sess, ae, train_data, dev_data, in_model_folder, **config)
def main(in_dataset_folder, in_noisy_dataset_folder, in_custom_vocab_file,
         in_model_folder, in_config):
    with open(in_config, encoding='utf-8') as config_in:
        config = json.load(config_in)
    train_json = load_hcn_json(os.path.join(in_dataset_folder, 'train.json'))
    dev_json = load_hcn_json(os.path.join(in_dataset_folder, 'dev.json'))
    # test_json = load_hcn_json(os.path.join(in_dataset_folder, 'test.json'))
    test_ood_json = load_hcn_json(
        os.path.join(in_noisy_dataset_folder, 'test_ood.json'))
    kb = make_augmented_knowledge_base(
        os.path.join(BABI_FOLDER, 'dialog-babi-task6-dstc2-kb.txt'),
        os.path.join(BABI_FOLDER, 'dialog-babi-task6-dstc2-candidates.txt'))
    action_templates = train_json['actions']
    max_noisy_dialog_length = max(
        [len(dialog['turns']) for dialog in test_ood_json['dialogs']])
    config['max_input_length'] = max_noisy_dialog_length
    et = EntityTracker(kb)
    post_ood_turns_clean, post_ood_turns_noisy = mark_post_ood_turns(
        test_ood_json)
    if in_custom_vocab_file is not None:
        with open(in_custom_vocab_file) as vocab_in:
            rev_vocab = [line.rstrip() for line in vocab_in]
        vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    else:
        utterances_tokenized = []
        for dialog in train_json['dialogs']:
            for utterance in dialog['turns']:
                utterances_tokenized.append(utterance['input'].split())
        vocab, rev_vocab = make_vocabulary(
            utterances_tokenized,
            config['max_vocabulary_size'],
            special_tokens=[PAD, START, UNK, EOS] + list(kb.keys()))
    ctx_features = []
    for dialog in train_json['dialogs']:
        for utterance in dialog['turns']:
            if 'context_features' in utterance:
                ctx_features.append(utterance['context_features'])
    ctx_features_vocab, ctx_features_rev_vocab = make_vocabulary(
        ctx_features, config['max_vocabulary_size'], special_tokens=[])
    config['vocabulary_size'] = len(vocab)
    print('Training with config: {}'.format(json.dumps(config)))
    data_preparation_function = getattr(utils.preprocessing,
                                        config['data_preparation_function'])
    data_train = data_preparation_function(train_json, vocab,
                                           ctx_features_vocab, et, **config)
    data_dev = data_preparation_function(dev_json, vocab, ctx_features_vocab,
                                         et, **config)
    # data_test = data_preparation_function(test_json, vocab, ctx_features_vocab, et, **config)
    data_test_ood = data_preparation_function(test_ood_json, vocab,
                                              ctx_features_vocab, et, **config)
    dropout_turn_generation_function = getattr(
        utils.preprocessing, config['dropout_turn_generation_function'])
    random_input = dropout_turn_generation_function(
        10000, 3, config['max_sequence_length'], train_json, vocab,
        config['turn_word_dropout_prob'])
    save_model(rev_vocab, config, kb, action_templates, in_model_folder)
    net = getattr(modules, config['model_name'])(vocab, config,
                                                 len(ctx_features_vocab),
                                                 len(action_templates))
    trainer = Trainer(data_train, data_dev, data_test_ood, action_templates,
                      random_input, post_ood_turns_noisy, config, net,
                      in_model_folder)
    trainer.train()
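# The script above resolves its model class, data-preparation function and
# dropout-turn-generation function by name via getattr, so the config has to
# name all three. A fragment might look like this; the function names are
# drawn from the other scripts in this file, while the class name and values
# are illustrative assumptions:
#
# {
#     "model_name": "HierarchicalHCN",
#     "data_preparation_function": "make_dataset_for_hierarchical_hcn",
#     "dropout_turn_generation_function": "generate_dropout_turns_for_hierarchical_hcn",
#     "max_vocabulary_size": 20000,
#     "max_sequence_length": 30,
#     "turn_word_dropout_prob": 0.3
# }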
def main(in_clean_dataset_folder, in_noisy_dataset_folder, in_model_folder,
         in_mode, in_runs_number):
    rev_vocab, kb, action_templates, config = load_model(in_model_folder)
    train_json = load_hcn_json(
        os.path.join(in_clean_dataset_folder, 'train.json'))
    test_json = load_hcn_json(
        os.path.join(in_clean_dataset_folder, 'test.json'))
    test_ood_json = load_hcn_json(
        os.path.join(in_noisy_dataset_folder, 'test_ood.json'))
    max_noisy_dialog_length = max(
        [len(dialog['turns']) for dialog in test_ood_json['dialogs']])
    config['max_input_length'] = max_noisy_dialog_length
    post_ood_turns_clean, post_ood_turns_noisy = mark_post_ood_turns(
        test_ood_json)
    et = EntityTracker(kb)
    # weight all actions equally except action id 0, which is masked out
    action_weights = defaultdict(lambda: 1.0)
    action_weights[0] = 0.0
    action_weighting = np.vectorize(action_weights.__getitem__)
    vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    ctx_features = []
    for dialog in train_json['dialogs']:
        for utterance in dialog['turns']:
            if 'context_features' in utterance:
                ctx_features.append(utterance['context_features'])
    ctx_features_vocab, ctx_features_rev_vocab = make_vocabulary(
        ctx_features, config['max_vocabulary_size'], special_tokens=[])
    data_preparation_function = getattr(utils.preprocessing,
                                        config['data_preparation_function'])
    data_clean = data_preparation_function(test_json, vocab,
                                           ctx_features_vocab, et, **config)
    data_noisy = data_preparation_function(test_ood_json, vocab,
                                           ctx_features_vocab, et, **config)
    data_clean_with_weights = *data_clean, action_weighting(data_clean[-1])
    data_noisy_with_weights = *data_noisy, action_weighting(data_noisy[-1])
    net = getattr(modules, config['model_name'])(vocab, config,
                                                 len(ctx_features_vocab),
                                                 len(action_templates))
    net.restore(in_model_folder)
    if in_mode == 'clean':
        eval_stats_clean = evaluate_advanced(
            net,
            data_clean_with_weights,
            action_templates,
            BABI_CONFIG['backoff_utterance'],
            post_ood_turns=post_ood_turns_clean,
            runs_number=in_runs_number)
        print('Clean dataset: {} turns overall'.format(
            eval_stats_clean['total_turns']))
        print('Accuracy:')
        accuracy = (eval_stats_clean['correct_turns'] /
                    eval_stats_clean['total_turns'])
        accuracy_continuous = (eval_stats_clean['correct_continuous_turns'] /
                               eval_stats_clean['total_turns'])
        accuracy_post_ood = (
            eval_stats_clean['correct_post_ood_turns'] /
            eval_stats_clean['total_post_ood_turns']
            if eval_stats_clean['total_post_ood_turns'] != 0 else 0)
        ood_f1 = eval_stats_clean['ood_f1']
        print('overall acc: {:.3f}; continuous acc: {:.3f}; '
              'directly post-OOD acc: {:.3f}; OOD F1: {:.3f}'.format(
                  accuracy, accuracy_continuous, accuracy_post_ood, ood_f1))
        print('Loss: {:.3f}'.format(eval_stats_clean['avg_loss']))
    elif in_mode == 'noisy':
        eval_stats_noisy = evaluate_advanced(
            net,
            data_noisy_with_weights,
            action_templates,
            BABI_CONFIG['backoff_utterance'],
            post_ood_turns=post_ood_turns_noisy,
            runs_number=in_runs_number)
        print('\n\n')
        print('Noisy dataset: {} turns overall'.format(
            eval_stats_noisy['total_turns']))
        print('Accuracy:')
        accuracy = (eval_stats_noisy['correct_turns'] /
                    eval_stats_noisy['total_turns'])
        accuracy_continuous = (eval_stats_noisy['correct_continuous_turns'] /
                               eval_stats_noisy['total_turns'])
        accuracy_post_ood = (
            eval_stats_noisy['correct_post_ood_turns'] /
            eval_stats_noisy['total_post_ood_turns']
            if eval_stats_noisy['total_post_ood_turns'] != 0 else 0)
        accuracy_ood = (
            eval_stats_noisy['correct_ood_turns'] /
            eval_stats_noisy['total_ood_turns']
            if eval_stats_noisy['total_ood_turns'] != 0 else 0)
        ood_f1 = eval_stats_noisy['ood_f1']
        print('overall acc: {:.3f}; continuous acc: {:.3f}; '
              'directly post-OOD acc: {:.3f}; OOD acc: {:.3f}; '
              'OOD F1: {:.3f}'.format(accuracy, accuracy_continuous,
                                      accuracy_post_ood, accuracy_ood,
                                      ood_f1))
        print('Loss: {:.3f}'.format(eval_stats_noisy['avg_loss']))
    elif in_mode == 'noisy_ignore_ood':
        eval_stats_no_ood = evaluate_advanced(
            net,
            data_noisy_with_weights,
            action_templates,
            BABI_CONFIG['backoff_utterance'],
            post_ood_turns=post_ood_turns_noisy,
            ignore_ood_accuracy=True,
            runs_number=in_runs_number)
        print('Accuracy (OOD turns ignored):')
        accuracy = (eval_stats_no_ood['correct_turns'] /
                    eval_stats_no_ood['total_turns'])
        accuracy_after_ood = (
            eval_stats_no_ood['correct_turns_after_ood'] /
            eval_stats_no_ood['total_turns_after_ood']
            if eval_stats_no_ood['total_turns_after_ood'] != 0 else 0)
        accuracy_post_ood = (
            eval_stats_no_ood['correct_post_ood_turns'] /
            eval_stats_no_ood['total_post_ood_turns']
            if eval_stats_no_ood['total_post_ood_turns'] != 0 else 0)
        print('overall: {:.3f}; after first OOD: {:.3f}; '
              'directly post-OOD: {:.3f}'.format(accuracy, accuracy_after_ood,
                                                 accuracy_post_ood))
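# The evaluation branches above repeat the same guarded division for every
# accuracy figure. A small helper (hypothetical, not part of the original
# scripts) would express that pattern once:

def safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is zero."""
    return numerator / denominator if denominator != 0 else 0

# Usage, e.g.:
#   accuracy_post_ood = safe_ratio(eval_stats['correct_post_ood_turns'],
#                                  eval_stats['total_post_ood_turns'])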