def main(in_babi, in_babi_plus, in_result_folder):
    """Pair up bAbI and bAbI+ user turns and dump them as ';'-separated files.

    The 'task1-API-calls' files found under both roots are processed pairwise.
    Within each file pair, dialogues and their turns are aligned position by
    position; every turn whose bAbI agent is 'user' contributes one output
    line. Each result file is written to ``in_result_folder`` under the base
    name of the bAbI file, with a 'babi_plus;babi' header line.

    Args:
        in_babi: location handed to ``get_files_list`` for the bAbI data.
        in_babi_plus: location handed to ``get_files_list`` for the bAbI+ data.
        in_result_folder: output directory; created if it does not exist.
    """
    babi_files = get_files_list(in_babi, 'task1-API-calls')
    babi_plus_files = get_files_list(in_babi_plus, 'task1-API-calls')

    if not os.path.exists(in_result_folder):
        os.makedirs(in_result_folder)

    # NOTE: zip() stops at the shorter sequence, so unmatched files, dialogues
    # or turns on either side are silently left out.
    for babi_file, babi_plus_file in zip(babi_files, babi_plus_files):
        babi = read_task(babi_file)
        babi_plus = read_task(babi_plus_file)

        # Element [1] of a dialogue entry is its sequence of turns.
        rows = [
            (babi_plus_turn['text'], babi_turn['text'])
            for babi_dialogue, babi_plus_dialogue in zip(babi, babi_plus)
            for babi_turn, babi_plus_turn in zip(babi_dialogue[1],
                                                 babi_plus_dialogue[1])
            if babi_turn['agent'] == 'user'
        ]

        result_path = os.path.join(in_result_folder,
                                   os.path.basename(babi_file))
        with open(result_path, 'w') as result_out:
            # write() instead of the Python 2-only `print >> file` statement,
            # so the output is the same under Python 2 and Python 3.
            result_out.write(';'.join(['babi_plus', 'babi']) + '\n')
            for row in rows:
                result_out.write(';'.join(row) + '\n')
def main(in_babi, in_babi_plus, in_result_folder, in_output_format):
    """Export aligned bAbI / bAbI+ user turns as CSV or seq2seq data.

    The 'task1-API-calls' files of both roots are walked in parallel. For
    each file pair, the turns whose bAbI agent is 'user' and whose bAbI text
    is not '<silence>' (compared case-insensitively) are collected as
    ``{'babi': ..., 'babi_plus': ...}`` dicts and handed to the writer that
    matches ``in_output_format``.

    Args:
        in_babi: location handed to ``get_files_list`` for the bAbI data.
        in_babi_plus: location handed to ``get_files_list`` for the bAbI+ data.
        in_result_folder: output directory; created if it does not exist.
        in_output_format: 'csv' writes one file per task via ``save_csv``;
            'seq2seq' writes into one folder per task via ``save_seq2seq``.
            Any other value produces no output.
    """
    task_tag = 'task1-API-calls'
    source_files = get_files_list(in_babi, task_tag)
    plus_files = get_files_list(in_babi_plus, task_tag)

    if not os.path.exists(in_result_folder):
        os.makedirs(in_result_folder)

    for source_file, plus_file in zip(source_files, plus_files):
        source_dialogues = read_task(source_file)
        plus_dialogues = read_task(plus_file)

        # Element [1] of a dialogue entry is its sequence of turns.
        turn_pairs = [
            {'babi': source_turn['text'], 'babi_plus': plus_turn['text']}
            for source_dialogue, plus_dialogue in zip(source_dialogues,
                                                      plus_dialogues)
            for source_turn, plus_turn in zip(source_dialogue[1],
                                              plus_dialogue[1])
            if source_turn['agent'] == 'user'
            and source_turn['text'].lower() != '<silence>'
        ]

        # Unknown formats are skipped without writing anything.
        if in_output_format not in ('csv', 'seq2seq'):
            continue

        target = os.path.join(in_result_folder, os.path.basename(source_file))
        if in_output_format == 'csv':
            save_csv(turn_pairs, target)
            continue

        # seq2seq output goes into a folder named after the task file.
        if not os.path.exists(target):
            os.makedirs(target)
        save_seq2seq(turn_pairs, target)
Beispiel #3
0
def collect_babi_slot_values(in_babi_root):
    """Return a flat list of the slot values found in the bAbI task files.

    All 'task1-API-calls' files under ``in_babi_root`` are read, their
    dialogues are merged into one list and passed to ``extract_slot_values``;
    the value collections of the resulting mapping are then concatenated.

    Args:
        in_babi_root: location handed to ``get_files_list``.

    Returns:
        A list with the values of every slot, one slot after another.
    """
    dataset_files = get_files_list(in_babi_root, 'task1-API-calls')
    # Flatten with comprehensions: linear time, unlike folding with `+`, and
    # no reliance on the Python 2-only builtin reduce().
    full_babi = [dialogue
                 for filename in dataset_files
                 for dialogue in read_task(filename)]
    slots_map = extract_slot_values(full_babi)
    return [value
            for slot_values in slots_map.values()
            for value in slot_values]
def main(in_input_file, in_result_file, in_config, in_result_size):
    """Augment the dialogues of one task file and save the result.

    Args:
        in_input_file: task file read with ``read_task``.
        in_result_file: destination handed to ``save_babi``.
        in_config: configuration forwarded to ``augment_dataset``.
        in_result_size: requested number of result dialogues; a falsy value
            means "as many as the input has".
    """
    ood_data = load_ood()
    source_dialogues = read_task(in_input_file)

    if in_result_size:
        target_size = in_result_size
    else:
        target_size = len(source_dialogues)

    augmented = augment_dataset(source_dialogues, ood_data, in_config,
                                target_size)
    save_babi(augmented, in_result_file)
Beispiel #5
0
def main(in_root, in_agent, filter_outliers=False):
    """Print one agent's turns, leaving out the most frequent utterances.

    Every turn of ``in_agent`` found in the dialogues returned by
    ``read_task(in_root)`` is collected. Turns whose text occurs at least as
    often as the 95th percentile of the per-text frequencies are dropped; the
    remaining turns are printed one per line in their original order
    (repeated texts are printed repeatedly).

    Args:
        in_root: location handed to ``read_task``.
        in_agent: value a turn's 'agent' field must have to be considered.
        filter_outliers: accepted for interface compatibility.
            NOTE(review): this flag is never consulted; the frequency filter
            is always applied. Left unchanged here so that existing output
            does not silently change; confirm the intended meaning.
    """
    turns = []
    turns_freq_dict = defaultdict(int)
    for dialogue_name, dialogue in read_task(in_root):
        for turn in dialogue:
            if turn['agent'] == in_agent:
                turns.append(turn['text'])
                turns_freq_dict[turn['text']] += 1

    # Nothing to print; also avoids calling np.percentile on empty data.
    if not turns:
        return

    # list(...) so that a Python 3 dict view is converted properly by numpy.
    frequency_threshold = np.percentile(list(turns_freq_dict.values()), 95)
    for turn in turns:
        if turns_freq_dict[turn] < frequency_threshold:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(turn)
def plus_dataset(in_src_root, in_result_size):
    """Generate augmented dialogues for every task file under a root.

    All 'task1-API-calls' files under ``in_src_root`` are read; slot values
    are extracted from the union of their dialogues and every generated
    dialogue is produced by ``augment_dialogue``. When more dialogues are
    requested than a task contains, the task's dialogues are reused
    cyclically.

    Args:
        in_src_root: location handed to ``get_files_list``.
        in_result_size: number of dialogues to generate per task file. When
            falsy, each task yields as many dialogues as it contains.

    Returns:
        A defaultdict mapping each task file name to the list of its
        augmented dialogues. Tasks without dialogues get no entry.
    """
    dataset_files = get_files_list(in_src_root, 'task1-API-calls')
    babi_files = [(filename, read_task(filename))
                  for filename in dataset_files]
    # Linear flattening instead of folding with `+` (quadratic, and reduce()
    # is not a builtin on Python 3).
    full_babi = [dialogue for _, task in babi_files for dialogue in task]
    slots_map = extract_slot_values(full_babi)
    babi_plus = defaultdict(list)
    for task_name, task in babi_files:
        # The default size is the task's own dialogue count. The previous
        # default, len(babi_files), was the number of task *files*.
        result_size = in_result_size if in_result_size else len(task)
        for _, dialogue in zip(range(result_size), cycle(task)):
            babi_plus[task_name].append(
                augment_dialogue(dialogue, slots_map.values()))
    return babi_plus
def main(in_config, in_babi_file, in_result_file):
    """Generate bAbI+ dialogues for one task and dump user turns as JSON.

    After ``init(in_config)``, the task is read, turned into bAbI+ dialogues
    with ``plus_single_task`` and every turn whose agent is 'user'
    contributes one row: the whitespace-split text, its 'tags' and its 'pos'.
    The rows are saved through ``pandas.DataFrame.to_json`` and
    ``print_stats()`` is called at the end.

    Args:
        in_config: configuration handed to ``init``.
        in_babi_file: task file read with ``read_task``.
        in_result_file: destination of the JSON output.
    """
    init(in_config)
    source_task = read_task(in_babi_file)
    generated_dialogues = plus_single_task(source_task,
                                           extract_slot_values(source_task))

    columns = {'utterance': [], 'tags': [], 'pos': []}
    for generated_dialogue in generated_dialogues:
        for turn in generated_dialogue:
            if turn['agent'] != 'user':
                continue
            columns['utterance'].append(turn['text'].split())
            columns['tags'].append(turn['tags'])
            columns['pos'].append(turn['pos'])

    pd.DataFrame(columns).to_json(in_result_file)
    print_stats()
def configure_argument_parser():
    """Build the command-line parser of the bAbI+ generation script.

    Positional arguments:
        babi_file: file with bAbI dialogues.
        babi_plus_root: output folder.

    Options:
        --output_format: format of the output dialogues (default 'babi').
        --result_size: integer size of the generated dataset (default None).
        --config: disfluency config file (default ``DEFAULT_CONFIG_FILE``).

    Returns:
        The configured ``ArgumentParser``.
    """
    parser = ArgumentParser(description='generate bAbI+ data')
    parser.add_argument('babi_file', help='file with bAbI Dialogs')
    parser.add_argument('babi_plus_root', help='output folder')
    parser.add_argument('--output_format',
                        default='babi',
                        help='format of output dialogues [babi/babble]')
    parser.add_argument(
        '--result_size',
        type=int,
        default=None,
        help='size of generated dataset [default=input dataset size]')
    # Help text typo fixed: 'dicustom' -> 'custom'.
    parser.add_argument('--config',
                        default=DEFAULT_CONFIG_FILE,
                        help='custom disfluency config (json file)')

    return parser


if __name__ == '__main__':
    # Script entry point: read one bAbI task file, generate its bAbI+
    # dialogues and save them in the requested output format.
    parser = configure_argument_parser()
    args = parser.parse_args()

    # Resolve the module-level save_<format> function before doing any work,
    # so an unsupported format is reported with a usage message right away
    # instead of a bare KeyError after the dialogues have been generated.
    save_function = globals().get('save_' + args.output_format)
    if not callable(save_function):
        parser.error('unsupported output format: ' + args.output_format)

    init(args.config)
    task = read_task(args.babi_file)
    slot_values = extract_slot_values(task)
    task_name = path.basename(args.babi_file)
    babi_plus_dialogues = plus_single_task(task, slot_values)
    # The save function receives a {task name: dialogues} mapping and the
    # output folder.
    save_function({task_name: babi_plus_dialogues}, args.babi_plus_root)
    print_stats()
Beispiel #9
0
def collect_babi_slot_value_pps(in_babi_root, in_slot_values):
    """Run ``extract_slot_value_pps`` over all bAbI task dialogues.

    All 'task1-API-calls' files under ``in_babi_root`` are read and their
    dialogues merged into a single list before extraction.

    Args:
        in_babi_root: location handed to ``get_files_list``.
        in_slot_values: slot values forwarded to ``extract_slot_value_pps``.

    Returns:
        Whatever ``extract_slot_value_pps`` returns for the merged dialogues.
    """
    dataset_files = get_files_list(in_babi_root, 'task1-API-calls')
    # Linear flattening instead of folding with `+` (quadratic, and reduce()
    # is not a builtin on Python 3).
    full_babi = [dialogue
                 for filename in dataset_files
                 for dialogue in read_task(filename)]
    return extract_slot_value_pps(full_babi, in_slot_values)