def main(in_dataset_folder, in_noisy_dataset_folder, in_custom_vocab_file, in_model_folder, in_config):
    with open(in_config, encoding='utf-8') as config_in:
        config = json.load(config_in)
    train_dialogs, train_indices = read_dialogs(os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-trn.txt'),
                                                with_indices=True)
    dev_dialogs, dev_indices = read_dialogs(os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-dev.txt'),
                                            with_indices=True)
    test_dialogs, test_indices = read_dialogs(os.path.join(in_noisy_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
                                              with_indices=True)

    kb = make_augmented_knowledge_base(os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-kb.txt'),
                                       os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-candidates.txt'))

    max_noisy_dialog_length = max([item['end'] - item['start'] + 1 for item in test_indices])
    config['max_input_length'] = max_noisy_dialog_length
    post_ood_turns_clean, post_ood_turns_noisy = mark_post_ood_turns(test_dialogs, BABI_CONFIG['backoff_utterance'].lower())

    et = EntityTracker(kb)
    at = ActionTracker(os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-candidates.txt'), et)

    if in_custom_vocab_file is not None:
        with open(in_custom_vocab_file) as vocab_in:
            rev_vocab = [line.rstrip() for line in vocab_in]
            vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    else:
        utterances_tokenized = []
        for dialog in train_dialogs:
            utterances_tokenized += list(map(lambda x: x.split(), dialog))

        vocab, rev_vocab = make_vocabulary(utterances_tokenized,
                                           config['max_vocabulary_size'],
                                           special_tokens=[PAD, START, UNK, EOS] + list(kb.keys()))
    config['vocabulary_size'] = len(vocab)

    data_train = make_dataset_for_hierarchical_hcn(train_dialogs, train_indices, vocab, et, at, **config)
    data_dev = make_dataset_for_hierarchical_hcn(dev_dialogs, dev_indices, vocab, et, at, **config)
    data_test = make_dataset_for_hierarchical_hcn(test_dialogs, test_indices, vocab, et, at, **config)

    random_input = generate_dropout_turns_for_hierarchical_hcn(10000,
                                                               config['max_sequence_length'],
                                                               [utterance[0] for utterance in train_dialogs],
                                                               vocab,
                                                               config['turn_word_dropout_prob'])

    save_model(rev_vocab, config, kb, at.action_templates, in_model_folder)
    trainer = Trainer(data_train,
                      data_dev,
                      data_test,
                      at.action_templates,
                      random_input,
                      post_ood_turns_noisy,
                      config,
                      vocab,
                      in_model_folder)
    trainer.train()
def main(in_dataset_folder, in_custom_vocab_file, in_model_folder, in_config):
    with open(in_config, encoding='utf-8') as config_in:
        config = json.load(config_in)

    train_dialogs, train_indices = read_dialogs(os.path.join(
        in_dataset_folder, 'dialog-babi-task6-dstc2-trn.txt'),
                                                with_indices=True)
    dev_dialogs, dev_indices = read_dialogs(os.path.join(
        in_dataset_folder, 'dialog-babi-task6-dstc2-dev.txt'),
                                            with_indices=True)
    test_dialogs, test_indices = read_dialogs(os.path.join(
        in_dataset_folder, 'dialog-babi-task6-dstc2-tst.txt'),
                                              with_indices=True)

    kb = make_augmented_knowledge_base(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-kb.txt'),
        os.path.join(in_dataset_folder,
                     'dialog-babi-task6-dstc2-candidates.txt'))

    et = EntityTracker(kb)
    at = ActionTracker(
        os.path.join(in_dataset_folder,
                     'dialog-babi-task6-dstc2-candidates.txt'), et)

    if in_custom_vocab_file is not None:
        with open(in_custom_vocab_file) as vocab_in:
            rev_vocab = [line.rstrip() for line in vocab_in]
            vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    else:
        utterances_tokenized = []
        for dialog in train_dialogs:
            utterances_tokenized += list(map(lambda x: x.split(), dialog))

        vocab, rev_vocab = make_vocabulary(
            utterances_tokenized,
            config['max_vocabulary_size'],
            special_tokens=[PAD, START, UNK, EOS] + list(kb.keys()))
    config['vocabulary_size'] = len(vocab)

    data_train = make_dataset_for_variational_hcn(train_dialogs, train_indices,
                                                  vocab, et, at, **config)
    data_dev = make_dataset_for_variational_hcn(dev_dialogs, dev_indices,
                                                vocab, et, at, **config)
    data_test = make_dataset_for_variational_hcn(test_dialogs, test_indices,
                                                 vocab, et, at, **config)

    random_input = generate_random_input_for_variational_hcn(
        10000, config['max_sequence_length'], vocab, rev_vocab)

    save_model(rev_vocab, config, kb, at.action_templates, in_model_folder)
    trainer = Trainer(data_train, data_dev, data_test, at.action_templates,
                      random_input, rev_vocab, config, in_model_folder)
    trainer.train()
def main(in_dataset_folder, in_result_file, in_max_size, in_config):
    with open(in_config, encoding='utf-8') as config_in:
        config = json.load(config_in)
    w2v_model = KeyedVectors.load_word2vec_format(
        UtteranceEmbed.DEFAULT_W2V_FILE, binary=True)
    kb = make_augmented_knowledge_base(
        os.path.join(in_dataset_folder, 'dialog-babi-task6-dstc2-kb.txt'),
        os.path.join(in_dataset_folder,
                     'dialog-babi-task6-dstc2-candidates.txt'))

    vocab = [PAD, START, UNK, EOS] + list(kb.keys())
    vocab += [
        word for word, vocab_entry in sorted(
            w2v_model.vocab.items(), key=lambda x: x[1].count, reverse=True)
        [:in_max_size - len(vocab)]
    ]
    with open(in_result_file, 'w') as vocab_out:
        for word in vocab:
            print(word, file=vocab_out)
Beispiel #4
0
def main(in_dataset_folder, in_noisy_dataset_folder, in_custom_vocab_file,
         in_model_folder, in_config):
    with open(in_config, encoding='utf-8') as config_in:
        config = json.load(config_in)
    train_json = load_hcn_json(os.path.join(in_dataset_folder, 'train.json'))
    dev_json = load_hcn_json(os.path.join(in_dataset_folder, 'dev.json'))
    # test_json = load_hcn_json(os.path.join(in_dataset_folder, 'test.json'))
    test_ood_json = load_hcn_json(
        os.path.join(in_noisy_dataset_folder, 'test_ood.json'))

    kb = make_augmented_knowledge_base(
        os.path.join(BABI_FOLDER, 'dialog-babi-task6-dstc2-kb.txt'),
        os.path.join(BABI_FOLDER, 'dialog-babi-task6-dstc2-candidates.txt'))
    action_templates = train_json['actions']
    max_noisy_dialog_length = max(
        [len(dialog['turns']) for dialog in test_ood_json['dialogs']])
    config['max_input_length'] = max_noisy_dialog_length

    et = EntityTracker(kb)

    post_ood_turns_clean, post_ood_turns_noisy = mark_post_ood_turns(
        test_ood_json)

    if in_custom_vocab_file is not None:
        with open(in_custom_vocab_file) as vocab_in:
            rev_vocab = [line.rstrip() for line in vocab_in]
            vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    else:
        utterances_tokenized = []
        for dialog in train_json['dialogs']:
            for utterance in dialog['turns']:
                utterances_tokenized.append(utterance['input'].split())

        vocab, rev_vocab = make_vocabulary(
            utterances_tokenized,
            config['max_vocabulary_size'],
            special_tokens=[PAD, START, UNK, EOS] + list(kb.keys()))
    ctx_features = []
    for dialog in train_json['dialogs']:
        for utterance in dialog['turns']:
            if 'context_features' in utterance:
                ctx_features.append(utterance['context_features'])
    ctx_features_vocab, ctx_features_rev_vocab = make_vocabulary(
        ctx_features, config['max_vocabulary_size'], special_tokens=[])
    config['vocabulary_size'] = len(vocab)

    print('Training with config: {}'.format(json.dumps(config)))
    data_preparation_function = getattr(utils.preprocessing,
                                        config['data_preparation_function'])

    data_train = data_preparation_function(train_json, vocab,
                                           ctx_features_vocab, et, **config)
    data_dev = data_preparation_function(dev_json, vocab, ctx_features_vocab,
                                         et, **config)
    # data_test = data_preparation_function(test_json, vocab, ctx_features_vocab, et, **config)
    data_test_ood = data_preparation_function(test_ood_json, vocab,
                                              ctx_features_vocab, et, **config)

    dropout_turn_generation_function = getattr(
        utils.preprocessing, config['dropout_turn_generation_function'])
    random_input = dropout_turn_generation_function(
        10000, 3, config['max_sequence_length'], train_json, vocab,
        config['turn_word_dropout_prob'])

    save_model(rev_vocab, config, kb, action_templates, in_model_folder)
    net = getattr(modules, config['model_name'])(vocab, config,
                                                 len(ctx_features_vocab),
                                                 len(action_templates))
    trainer = Trainer(data_train, data_dev, data_test_ood, action_templates,
                      random_input, post_ood_turns_noisy, config, net,
                      in_model_folder)
    trainer.train()