Code Example #1
    def prepare_data(self, type=None):
        # Read dialogs (with per-dialog start/end indices) from the bAbI Task 6 (DSTC2) split.
        if type == 'Train':
            dialogs, dialog_indices = util.read_dialogs(
                with_indices=True,
                file_name='/root/jude/data/dialog-bAbI-tasks/dialog-babi-task6-dstc2-trn.txt',
                babi_num=6)
        elif type == 'Test':
            dialogs, dialog_indices = util.read_dialogs(
                with_indices=True,
                file_name='/root/jude/data/dialog-bAbI-tasks/dialog-babi-task6-dstc2-tst.txt',
                babi_num=6)
        else:
            # Guard against `dialogs` being unbound below when no known split is requested.
            raise ValueError("type must be 'Train' or 'Test', got {!r}".format(type))
        # get utterances
        utterances = util.get_utterances(dialogs)
        # get responses
        responses = util.get_responses(dialogs)
        responses = [self.get_template_id(response) for response in responses]

        trainset = []
        for u, r in zip(utterances, responses):
            trainset.append((u, r))

        return trainset, dialog_indices
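A minimal usage sketch for this method (the class name Dataset is a hypothetical stand-in; only prepare_data's return shape is taken from the code above):

# Hypothetical caller; the Dataset class name is an assumption.
dataset = Dataset()
trainset, dialog_indices = dataset.prepare_data(type='Train')
for span in dialog_indices:
    # 'end' is inclusive, per the `end - start + 1` length computation in Code Example #2.
    dialog_turns = trainset[span['start']:span['end'] + 1]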
Code Example #2
def main(in_dataset_folder, in_noisy_dataset_folder, in_custom_vocab_file, in_model_folder, in_config):
    with open(in_config, encoding='utf-8') as config_in:
        config = json.load(config_in)
    train_dialogs, train_indices = read_dialogs(os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-trn.txt'),
                                                with_indices=True)
    dev_dialogs, dev_indices = read_dialogs(os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-dev.txt'),
                                            with_indices=True)
    test_dialogs, test_indices = read_dialogs(os.path.join(in_noisy_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
                                              with_indices=True)

    kb = make_augmented_knowledge_base(os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-kb.txt'),
                                       os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-candidates.txt'))

    max_noisy_dialog_length = max([item['end'] - item['start'] + 1 for item in test_indices])
    config['max_input_length'] = max_noisy_dialog_length
    post_ood_turns_clean, post_ood_turns_noisy = mark_post_ood_turns(test_dialogs, BABI_CONFIG['backoff_utterance'].lower())

    et = EntityTracker(kb)
    at = ActionTracker(os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-candidates.txt'), et)

    if in_custom_vocab_file is not None:
        with open(in_custom_vocab_file) as vocab_in:
            rev_vocab = [line.rstrip() for line in vocab_in]
            vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    else:
        utterances_tokenized = []
        for dialog in train_dialogs:
            utterances_tokenized += list(map(lambda x: x.split(), dialog))

        vocab, rev_vocab = make_vocabulary(utterances_tokenized,
                                           config['max_vocabulary_size'],
                                           special_tokens=[PAD, START, UNK, EOS] + list(kb.keys()))
    config['vocabulary_size'] = len(vocab)

    data_train = make_dataset_for_hierarchical_hcn(train_dialogs, train_indices, vocab, et, at, **config)
    data_dev = make_dataset_for_hierarchical_hcn(dev_dialogs, dev_indices, vocab, et, at, **config)
    data_test = make_dataset_for_hierarchical_hcn(test_dialogs, test_indices, vocab, et, at, **config)

    random_input = generate_dropout_turns_for_hierarchical_hcn(10000,
                                                               config['max_sequence_length'],
                                                               [utterance[0] for utterance in train_dialogs],
                                                               vocab,
                                                               config['turn_word_dropout_prob'])

    save_model(rev_vocab, config, kb, at.action_templates, in_model_folder)
    trainer = Trainer(data_train,
                      data_dev,
                      data_test,
                      at.action_templates,
                      random_input,
                      post_ood_turns_noisy,
                      config,
                      vocab,
                      in_model_folder)
    trainer.train()
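generate_dropout_turns_for_hierarchical_hcn is not shown in these listings. Judging by its turn_word_dropout_prob argument, it likely applies word-level dropout to user turns; a sketch of that technique under this assumption (not the function's actual code):

import random

def dropout_turn(token_ids, unk_id, dropout_prob):
    # Replace each token id with the UNK id with probability dropout_prob.
    return [unk_id if random.random() < dropout_prob else t for t in token_ids]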
Code Example #3
def main(in_dataset_folder, in_custom_vocab_file, in_model_folder, in_config):
    with open(in_config, encoding='utf-8') as config_in:
        config = json.load(config_in)

    train_dialogs, train_indices = read_dialogs(os.path.join(
        in_dataset_folder, 'dialog-babi-task6-dstc2-trn.txt'),
                                                with_indices=True)
    dev_dialogs, dev_indices = read_dialogs(os.path.join(
        in_dataset_folder, 'dialog-babi-task6-dstc2-dev.txt'),
                                            with_indices=True)
    test_dialogs, test_indices = read_dialogs(os.path.join(
        in_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
                                              with_indices=True)

    kb = make_augmented_knowledge_base(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-kb.txt'),
        os.path.join(in_dataset_folder,
                     'dialog-babi-task6-dstc2-candidates.txt'))

    et = EntityTracker(kb)
    at = ActionTracker(
        os.path.join(in_dataset_folder,
                     'dialog-babi-task6-dstc2-candidates.txt'), et)

    if in_custom_vocab_file is not None:
        with open(in_custom_vocab_file) as vocab_in:
            rev_vocab = [line.rstrip() for line in vocab_in]
            vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    else:
        utterances_tokenized = []
        for dialog in train_dialogs:
            utterances_tokenized += list(map(lambda x: x.split(), dialog))

        vocab, rev_vocab = make_vocabulary(
            utterances_tokenized,
            config['max_vocabulary_size'],
            special_tokens=[PAD, START, UNK, EOS] + list(kb.keys()))
    config['vocabulary_size'] = len(vocab)

    data_train = make_dataset_for_variational_hcn(train_dialogs, train_indices,
                                                  vocab, et, at, **config)
    data_dev = make_dataset_for_variational_hcn(dev_dialogs, dev_indices,
                                                vocab, et, at, **config)
    data_test = make_dataset_for_variational_hcn(test_dialogs, test_indices,
                                                 vocab, et, at, **config)

    random_input = generate_random_input_for_variational_hcn(
        10000, config['max_sequence_length'], vocab, rev_vocab)

    save_model(rev_vocab, config, kb, at.action_templates, in_model_folder)
    trainer = Trainer(data_train, data_dev, data_test, at.action_templates,
                      random_input, rev_vocab, config, in_model_folder)
    trainer.train()
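make_vocabulary is also not listed. A plausible minimal implementation, inferred from its call sites in Examples #2 and #3: count token frequencies, keep the most frequent words up to the size limit, and put the special tokens first so their ids stay fixed. This is a sketch, not the project's actual code:

from collections import Counter

def make_vocabulary(tokenized_utterances, max_size, special_tokens=()):
    # Count word frequencies over all tokenized utterances.
    counts = Counter(token for utterance in tokenized_utterances for token in utterance)
    # Special tokens come first so their ids are stable (e.g. PAD == 0).
    rev_vocab = list(special_tokens)
    for word, _ in counts.most_common():
        if len(rev_vocab) >= max_size:
            break
        if word not in special_tokens:
            rev_vocab.append(word)
    vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    return vocab, rev_vocab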
Code Example #4
def main(in_dataset_folder, in_model_folder, in_no_ood_evaluation):
    rev_vocab, kb, action_templates, config = load_model(in_model_folder)
    test_dialogs, test_indices = read_dialogs(os.path.join(
        in_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
                                              with_indices=True)
    et = EntityTracker(kb)
    at = ActionTracker(None, et)
    at.set_action_templates(action_templates)

    vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    X, context_features, action_masks, y = make_dataset_for_hierarchical_lstm(
        test_dialogs, test_indices, vocab, et, at, **config)

    net = HierarchicalLSTM(config, context_features.shape[-1],
                           action_masks.shape[-1])
    net.restore(in_model_folder)
    eval_stats_full_dataset = evaluate_advanced(
        net, (X, context_features, action_masks, y), test_dialogs,
        at.action_templates)
    print(
        'Full dataset: {} turns overall, {} turns after the first OOD'.format(
            eval_stats_full_dataset['total_turns'],
            eval_stats_full_dataset['total_turns_after_ood']))
    print('Accuracy:')
    accuracy = eval_stats_full_dataset[
        'correct_turns'] / eval_stats_full_dataset['total_turns']
    accuracy_after_ood = eval_stats_full_dataset['correct_turns_after_ood'] / eval_stats_full_dataset['total_turns_after_ood'] \
        if eval_stats_full_dataset['total_turns_after_ood'] != 0 \
        else 0
    accuracy_post_ood = eval_stats_full_dataset['correct_post_ood_turns'] / eval_stats_full_dataset['total_post_ood_turns'] \
        if eval_stats_full_dataset['total_post_ood_turns'] != 0 \
        else 0
    print(
        'overall: {:.3f}; after first OOD: {:.3f}, directly post-OOD: {:.3f}'.
        format(accuracy, accuracy_after_ood, accuracy_post_ood))
    print('Loss : {:.3f}'.format(eval_stats_full_dataset['avg_loss']))

    if in_no_ood_evaluation:
        eval_stats_no_ood = evaluate_advanced(
            net, (X, context_features, action_masks, y),
            test_dialogs,  # matches the positional dialogs argument in the call above
            at.action_templates,
            ignore_ood_accuracy=True)
        print('Accuracy (OOD turns ignored):')
        accuracy = eval_stats_no_ood['correct_turns'] / eval_stats_no_ood[
            'total_turns']
        accuracy_after_ood = eval_stats_no_ood['correct_turns_after_ood'] / eval_stats_no_ood['total_turns_after_ood'] \
            if eval_stats_no_ood['total_turns_after_ood'] != 0 \
            else 0
        accuracy_post_ood = eval_stats_no_ood['correct_post_ood_turns'] / eval_stats_no_ood['total_post_ood_turns'] \
            if eval_stats_no_ood['total_post_ood_turns'] != 0 \
            else 0
        print(
            'overall: {:.3f}; after first OOD: {:.3f}, directly post-OOD: {:.3f}'
            .format(accuracy, accuracy_after_ood, accuracy_post_ood))
        print('Loss : {:.3f}'.format(eval_stats_no_ood['avg_loss']))
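The guarded divisions above recur in nearly every evaluation function in these examples; they could be factored into a small helper (a hypothetical refactoring, not present in any of the listed projects):

def safe_ratio(numerator, denominator):
    # Avoid ZeroDivisionError when a category has no turns at all.
    return numerator / denominator if denominator != 0 else 0

# e.g.: accuracy_post_ood = safe_ratio(stats['correct_post_ood_turns'],
#                                      stats['total_post_ood_turns'])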
Code Example #5
File: data_utils.py  Project: cp4011/SL_Hospital
    def prepare_data(self):
        # get dialogs from file
        dialogs, dialog_indices = util.read_dialogs(with_indices=True)
        # get utterances
        utterances = util.get_utterances(dialogs)
        # get responses
        responses_id = util.get_responses()

        trainset = []
        for u, r in zip(utterances, responses_id):
            trainset.append((u, int(r) - 1))

        return trainset, dialog_indices     # [(utterance_1, action_template_id_1),..]  [{'start':0, 'end':20},...]
Code Example #6
    def prepare_data(self):
        # get dialogs from file
        dialogs, dialog_indices = util.read_dialogs(with_indices=True)
        # get utterances
        utterances = util.get_utterances(dialogs)
        # get responses
        responses = util.get_responses(dialogs)
        responses = [self.get_template_id(response) for response in responses]

        trainset = []
        for u, r in zip(utterances, responses):
            trainset.append((u, r))

        return trainset, dialog_indices
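get_template_id, called in Code Examples #1 and #6, is not shown. A plausible sketch, assuming the class stores its action templates as a list and the id is simply the template's index (hypothetical):

def get_template_id(self, response):
    # Hypothetical: the class is assumed to hold a list of action templates;
    # the template id is the response's position in that list.
    return self.action_templates.index(response)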
Code Example #7
def main(in_clean_dataset_folder, in_noisy_dataset_folder, in_model_folder, in_mode, in_runs_number):
    rev_vocab, kb, action_templates, config = load_model(in_model_folder)
    clean_dialogs, clean_indices = read_dialogs(os.path.join(in_clean_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
                                                with_indices=True)
    noisy_dialogs, noisy_indices = read_dialogs(os.path.join(in_noisy_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
                                                with_indices=True)

    max_noisy_dialog_length = max([item['end'] - item['start'] + 1 for item in noisy_indices])
    config['max_input_length'] = max_noisy_dialog_length
    post_ood_turns_clean, post_ood_turns_noisy = mark_post_ood_turns(noisy_dialogs, BABI_CONFIG['backoff_utterance'].lower())

    et = EntityTracker(kb)
    at = ActionTracker(None, et)
    at.set_action_templates(action_templates)

    vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    data_clean = make_dataset_for_vhcn_v2(clean_dialogs, clean_indices, vocab, et, at, **config)
    data_noisy = make_dataset_for_vhcn_v2(noisy_dialogs, noisy_indices, vocab, et, at, **config)

    context_features_clean, action_masks_clean = data_clean[2:4]
    net = VariationalHierarchicalLSTMv3(rev_vocab, config, context_features_clean.shape[-1], action_masks_clean.shape[-1])
    net.restore(in_model_folder)

    if in_mode == 'clean':
        eval_stats_clean = evaluate_advanced(net,
                                             data_clean,
                                             at.action_templates,
                                             BABI_CONFIG['backoff_utterance'].lower(),
                                             post_ood_turns=post_ood_turns_clean,
                                             runs_number=in_runs_number)
        print('Clean dataset: {} turns overall'.format(eval_stats_clean['total_turns']))
        print('Accuracy:')
        accuracy = eval_stats_clean['correct_turns'] / eval_stats_clean['total_turns']
        accuracy_continuous = eval_stats_clean['correct_continuous_turns'] / eval_stats_clean['total_turns']
        accuracy_post_ood = eval_stats_clean['correct_post_ood_turns'] / eval_stats_clean['total_post_ood_turns'] \
            if eval_stats_clean['total_post_ood_turns'] != 0 \
            else 0
        print('overall: {:.3f}; continuous: {:.3f}; directly post-OOD: {:.3f}'.format(accuracy, accuracy_continuous, accuracy_post_ood))
        print('Loss : {:.3f}'.format(eval_stats_clean['avg_loss']))
    elif in_mode == 'noisy':
        eval_stats_noisy = evaluate_advanced(net,
                                             data_noisy, 
                                             at.action_templates,
                                             BABI_CONFIG['backoff_utterance'].lower(),
                                             post_ood_turns=post_ood_turns_noisy,
                                             runs_number=in_runs_number)
        print('\n\n')
        print('Noisy dataset: {} turns overall'.format(eval_stats_noisy['total_turns']))
        print('Accuracy:')
        accuracy = eval_stats_noisy['correct_turns'] / eval_stats_noisy['total_turns']
        accuracy_continuous = eval_stats_noisy['correct_continuous_turns'] / eval_stats_noisy['total_turns']
        accuracy_post_ood = eval_stats_noisy['correct_post_ood_turns'] / eval_stats_noisy['total_post_ood_turns'] \
            if eval_stats_noisy['total_post_ood_turns'] != 0 \
            else 0
        accuracy_ood = eval_stats_noisy['correct_ood_turns'] / eval_stats_noisy['total_ood_turns'] \
            if eval_stats_noisy['total_ood_turns'] != 0 \
            else 0
        print('overall: {:.3f}; continuous: {:.3f}; directly post-OOD: {:.3f}; OOD: {:.3f}'.format(accuracy,
                                                                                                   accuracy_continuous,
                                                                                                   accuracy_post_ood,
                                                                                                   accuracy_ood))
        print('Loss : {:.3f}'.format(eval_stats_noisy['avg_loss']))
    elif in_mode == 'noisy_ignore_ood':
        eval_stats_no_ood = evaluate_advanced(net,
                                              data_noisy,
                                              at.action_templates,
                                              BABI_CONFIG['backoff_utterance'].lower(),
                                              post_ood_turns=post_ood_turns_noisy,
                                              ignore_ood_accuracy=True,
                                              runs_number=in_runs_number)
        print('Accuracy (OOD turns ignored):')
        accuracy = eval_stats_no_ood['correct_turns'] / eval_stats_no_ood['total_turns']
        accuracy_after_ood = eval_stats_no_ood['correct_turns_after_ood'] / eval_stats_no_ood['total_turns_after_ood'] \
            if eval_stats_no_ood['total_turns_after_ood'] != 0 \
            else 0
        accuracy_post_ood = eval_stats_no_ood['correct_post_ood_turns'] / eval_stats_no_ood['total_post_ood_turns'] \
            if eval_stats_no_ood['total_post_ood_turns'] != 0 \
            else 0
        print('overall: {:.3f}; after first OOD: {:.3f}, directly post-OOD: {:.3f}'.format(accuracy, accuracy_after_ood, accuracy_post_ood))
Code Example #8
def main(in_clean_dataset_folder, in_noisy_dataset_folder, in_model_folder,
         in_no_ood_evaluation):
    rev_vocab, kb, action_templates, config = load_model(in_model_folder)
    clean_dialogs, clean_indices = read_dialogs(os.path.join(
        in_clean_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
                                                with_indices=True)
    noisy_dialogs, noisy_indices = read_dialogs(os.path.join(
        in_noisy_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
                                                with_indices=True)

    post_ood_turns_clean, post_ood_turns_noisy = mark_post_ood_turns(
        noisy_dialogs)

    assert len(post_ood_turns_clean) == len(post_ood_turns_noisy)

    for post_ood_turn_clean, post_ood_turn_noisy in zip(
            sorted(post_ood_turns_clean), sorted(post_ood_turns_noisy)):
        noisy_dialogs[post_ood_turn_noisy][0] = clean_dialogs[
            post_ood_turn_clean][0]
    et = EntityTracker(kb)
    at = ActionTracker(None, et)
    at.set_action_templates(action_templates)

    vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    X_clean, context_features_clean, action_masks_clean, y_clean = make_dataset_for_hierarchical_lstm(
        clean_dialogs, clean_indices, vocab, et, at, **config)
    X_noisy, context_features_noisy, action_masks_noisy, y_noisy = make_dataset_for_hierarchical_lstm(
        noisy_dialogs, noisy_indices, vocab, et, at, **config)

    net = HierarchicalLSTM(config, context_features_clean.shape[-1],
                           action_masks_clean.shape[-1])
    net.restore(in_model_folder)
    eval_stats_clean = evaluate_advanced(
        net, (X_clean, context_features_clean, action_masks_clean, y_clean),
        at.action_templates,
        post_ood_turns=post_ood_turns_clean)
    print('Clean dataset: {} turns overall'.format(
        eval_stats_clean['total_turns']))
    print('Accuracy:')
    accuracy = eval_stats_clean['correct_turns'] / eval_stats_clean[
        'total_turns']
    accuracy_post_ood = eval_stats_clean['correct_post_ood_turns'] / eval_stats_clean['total_post_ood_turns'] \
        if eval_stats_clean['total_post_ood_turns'] != 0 \
        else 0
    print('overall: {:.3f}; directly post-OOD: {:.3f}'.format(
        accuracy, accuracy_post_ood))
    print('Loss : {:.3f}'.format(eval_stats_clean['avg_loss']))

    eval_stats_noisy = evaluate_advanced(
        net, (X_noisy, context_features_noisy, action_masks_noisy, y_noisy),
        at.action_templates,
        post_ood_turns=post_ood_turns_noisy)
    print('\n\n')
    print(
        'Noisy dataset: {} turns overall, {} turns after the first OOD'.format(
            eval_stats_noisy['total_turns'],
            eval_stats_noisy['total_turns_after_ood']))
    print('Accuracy:')
    accuracy = eval_stats_noisy['correct_turns'] / eval_stats_noisy[
        'total_turns']
    accuracy_after_ood = eval_stats_noisy['correct_turns_after_ood'] / eval_stats_noisy['total_turns_after_ood'] \
        if eval_stats_noisy['total_turns_after_ood'] != 0 \
        else 0
    accuracy_post_ood = eval_stats_noisy['correct_post_ood_turns'] / eval_stats_noisy['total_post_ood_turns'] \
        if eval_stats_noisy['total_post_ood_turns'] != 0 \
        else 0
    print(
        'overall: {:.3f}; after first OOD: {:.3f}, directly post-OOD: {:.3f}'.
        format(accuracy, accuracy_after_ood, accuracy_post_ood))
    print('Loss : {:.3f}'.format(eval_stats_noisy['avg_loss']))

    if in_no_ood_evaluation:
        eval_stats_no_ood = evaluate_advanced(
            net,
            (X_noisy, context_features_noisy, action_masks_noisy, y_noisy),
            at.action_templates,
            post_ood_turns=post_ood_turns_noisy,
            ignore_ood_accuracy=True)
        print('Accuracy (OOD turns ignored):')
        accuracy = eval_stats_no_ood['correct_turns'] / eval_stats_no_ood[
            'total_turns']
        accuracy_after_ood = eval_stats_no_ood['correct_turns_after_ood'] / eval_stats_no_ood['total_turns_after_ood'] \
            if eval_stats_no_ood['total_turns_after_ood'] != 0 \
            else 0
        accuracy_post_ood = eval_stats_no_ood['correct_post_ood_turns'] / eval_stats_no_ood['total_post_ood_turns'] \
            if eval_stats_no_ood['total_post_ood_turns'] != 0 \
            else 0
        print(
            'overall: {:.3f}; after first OOD: {:.3f}, directly post-OOD: {:.3f}'
            .format(accuracy, accuracy_after_ood, accuracy_post_ood))
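Finally, read_dialogs itself never appears in these listings. A minimal sketch of a reader for the dialog-bAbI text format, under two assumptions inferred from the call sites: dialogs are blank-line-separated lists of "<n> user<TAB>system" turns, and with_indices=True additionally returns per-dialog spans like {'start': 0, 'end': 20} (the shape shown in Code Example #5, with an inclusive 'end' per the `end - start + 1` length computation in Examples #2 and #7):

def read_dialogs(file_name, with_indices=False):
    # Sketch under assumptions inferred from the call sites above: returns a flat
    # list of [user_utterance, system_response] turn pairs and, when
    # with_indices=True, per-dialog spans with an inclusive 'end'.
    turns, dialog_indices = [], []
    start = 0
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line closes the current dialog.
                if len(turns) > start:
                    dialog_indices.append({'start': start, 'end': len(turns) - 1})
                    start = len(turns)
                continue
            # Drop the leading turn number, then split user/system on the tab.
            _, _, turn = line.partition(' ')
            if '\t' in turn:
                user, system = turn.split('\t', 1)
                turns.append([user, system])
    if len(turns) > start:
        # Close the last dialog, in case the file lacks a trailing blank line.
        dialog_indices.append({'start': start, 'end': len(turns) - 1})
    return (turns, dialog_indices) if with_indices else turns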