def main(in_babi, in_babi_plus, in_result_folder):
    """Align bAbI and bAbI+ user turns and dump them as ';'-separated pairs.

    For each matching pair of task1-API-calls files, pairs up the user
    utterances of the original bAbI dialogues with their bAbI+ counterparts
    and writes one header-prefixed file per input file into in_result_folder.
    """
    babi_files = get_files_list(in_babi, 'task1-API-calls')
    babi_plus_files = get_files_list(in_babi_plus, 'task1-API-calls')
    if not os.path.exists(in_result_folder):
        os.makedirs(in_result_folder)
    for babi_file, babi_plus_file in zip(babi_files, babi_plus_files):
        babi = read_task(babi_file)
        babi_plus = read_task(babi_plus_file)
        result = []
        for babi_dialogue, babi_plus_dialogue in zip(babi, babi_plus):
            # dialogue[1] holds the turn list — TODO confirm against read_task
            for babi_turn, babi_plus_turn in zip(babi_dialogue[1],
                                                 babi_plus_dialogue[1]):
                if babi_turn['agent'] == 'user':
                    result.append({
                        'babi': babi_turn['text'],
                        'babi_plus': babi_plus_turn['text']
                    })
        result_file = os.path.join(in_result_folder,
                                   os.path.basename(babi_file))
        # file.write instead of the Python 2-only "print >> file" chevron
        # syntax, so the function runs under both Python 2 and 3
        with open(result_file, 'w') as result_out:
            result_out.write(';'.join(['babi_plus', 'babi']) + '\n')
            for turn_pair in result:
                result_out.write(
                    ';'.join([turn_pair['babi_plus'], turn_pair['babi']])
                    + '\n')
def main(in_babi, in_babi_plus, in_result_folder, in_output_format):
    """Pair up bAbI / bAbI+ user turns and save them in the requested format.

    Walks matching task1-API-calls files from both corpora, collects
    (babi, babi_plus) utterance pairs for non-silent user turns, and saves
    them either as one CSV file or as a seq2seq folder per input file.
    """
    babi_files = get_files_list(in_babi, 'task1-API-calls')
    babi_plus_files = get_files_list(in_babi_plus, 'task1-API-calls')
    if not os.path.exists(in_result_folder):
        os.makedirs(in_result_folder)
    for babi_file, babi_plus_file in zip(babi_files, babi_plus_files):
        dialogue_pairs = zip(read_task(babi_file), read_task(babi_plus_file))
        result = []
        for dialogue, dialogue_plus in dialogue_pairs:
            for turn, turn_plus in zip(dialogue[1], dialogue_plus[1]):
                is_real_user_turn = (turn['agent'] == 'user'
                                     and turn['text'].lower() != '<silence>')
                if is_real_user_turn:
                    result.append({'babi': turn['text'],
                                   'babi_plus': turn_plus['text']})
        target = os.path.join(in_result_folder, os.path.basename(babi_file))
        if in_output_format == 'csv':
            save_csv(result, target)
        elif in_output_format == 'seq2seq':
            if not os.path.exists(target):
                os.makedirs(target)
            save_seq2seq(result, target)
def collect_babi_slot_values(in_babi_root):
    """Collect every slot value appearing across the task1-API-calls files.

    Returns a flat list with the values of all slots extracted from the
    concatenated dialogues of all matching dataset files.
    """
    dataset_files = get_files_list(in_babi_root, 'task1-API-calls')
    babi_files = [(filename, read_task(filename))
                  for filename in dataset_files]
    # flatten with extend() instead of reduce(+): reduce is not a builtin
    # under Python 3 and repeated list concatenation is quadratic
    full_babi = []
    for _, task in babi_files:
        full_babi.extend(task)
    slots_map = extract_slot_values(full_babi)
    result = []
    for values in slots_map.values():
        result.extend(values)
    return result
def main(in_input_file, in_result_file, in_config, in_result_size):
    """Augment a bAbI task with out-of-domain utterances and save the result.

    When in_result_size is falsy, the augmented dataset keeps the size of
    the input dataset.
    """
    ood = load_ood()
    dialogues = read_task(in_input_file)
    result_size = in_result_size if in_result_size else len(dialogues)
    augmented_dialogues = augment_dataset(dialogues, ood, in_config,
                                          result_size)
    save_babi(augmented_dialogues, in_result_file)
def main(in_root, in_agent, filter_outliers=False):
    """Print the given agent's turns, dropping overly frequent utterances.

    Counts how often each utterance occurs, then prints only utterances
    whose frequency is below the 95th percentile of all frequencies.
    NOTE(review): the filter_outliers flag is currently unused — the
    percentile filter is always applied; confirm intended behavior.
    """
    turns = []
    turns_freq_dict = defaultdict(lambda: 0)
    for dialogue_name, dialogue in read_task(in_root):
        for turn in dialogue:
            if turn['agent'] == in_agent:
                turns.append(turn['text'])
                turns_freq_dict[turn['text']] += 1
    # guard: np.percentile raises on an empty sequence
    if not turns_freq_dict:
        return
    # materialize the values: under Python 3 dict.values() is a view that
    # numpy cannot convert to a numeric array directly
    frequency_threshold = np.percentile(list(turns_freq_dict.values()), 95)
    result = [turn
              for turn in turns
              if turns_freq_dict[turn] < frequency_threshold]
    for turn in result:
        # call form works under both Python 2 and 3 for a single argument
        print(turn)
def plus_dataset(in_src_root, in_result_size):
    """Build the bAbI+ version of every task1-API-calls file under a root.

    Each task is cycled until result_size augmented dialogues are produced
    for it; when in_result_size is falsy the number of dataset FILES is used
    as the default (NOTE(review): len(babi_files), not len(task) — confirm
    this default is intended).

    Returns a dict mapping task file name to its augmented dialogues.
    """
    from itertools import cycle, islice

    dataset_files = get_files_list(in_src_root, 'task1-API-calls')
    babi_files = [(filename, read_task(filename))
                  for filename in dataset_files]
    # flatten with extend() instead of reduce(+): reduce is not a builtin
    # under Python 3 and repeated list concatenation is quadratic
    full_babi = []
    for _, task in babi_files:
        full_babi.extend(task)
    slots_map = extract_slot_values(full_babi)
    babi_plus = defaultdict(list)
    result_size = in_result_size if in_result_size else len(babi_files)
    for task_name, task in babi_files:
        # islice(cycle(...), n) replaces zip(xrange(n), cycle(...)):
        # xrange does not exist under Python 3
        for dialogue in islice(cycle(task), result_size):
            babi_plus[task_name].append(
                augment_dialogue(dialogue, slots_map.values()))
    return babi_plus
def main(in_config, in_babi_file, in_result_file):
    """Generate bAbI+ user utterances with tags/POS and dump them as JSON.

    Reads a bAbI task, augments it into bAbI+, collects every user turn's
    tokenized utterance together with its tags and POS sequences, and
    writes the resulting table to in_result_file via pandas.
    """
    init(in_config)
    task = read_task(in_babi_file)
    slot_values = extract_slot_values(task)
    babi_plus_dialogues = plus_single_task(task, slot_values)
    user_turns = [turn
                  for dialogue in babi_plus_dialogues
                  for turn in dialogue
                  if turn['agent'] == 'user']
    result = pd.DataFrame({
        'utterance': [turn['text'].split() for turn in user_turns],
        'tags': [turn['tags'] for turn in user_turns],
        'pos': [turn['pos'] for turn in user_turns]
    })
    result.to_json(in_result_file)
    print_stats()
def configure_argument_parser():
    """Build the CLI argument parser for the bAbI+ generation script."""
    parser = ArgumentParser(description='generate bAbI+ data')
    parser.add_argument('babi_file', help='file with bAbI Dialogs')
    parser.add_argument('babi_plus_root', help='output folder')
    parser.add_argument('--output_format',
                        default='babi',
                        help='format of output dialogues [babi/babble]')
    parser.add_argument(
        '--result_size',
        type=int,
        default=None,
        help='size of generated dataset [default=input dataset size]')
    parser.add_argument('--config',
                        default=DEFAULT_CONFIG_FILE,
                        help='custom disfluency config (json file)')
    return parser


if __name__ == '__main__':
    parser = configure_argument_parser()
    args = parser.parse_args()

    init(args.config)
    task = read_task(args.babi_file)
    slot_values = extract_slot_values(task)
    task_name = path.basename(args.babi_file)
    babi_plus_dialogues = plus_single_task(task, slot_values)
    # look the saver up in the module namespace explicitly and fail with a
    # clear message instead of a raw KeyError on an unknown format
    save_function = globals().get('save_' + args.output_format)
    if save_function is None:
        raise ValueError(
            'unknown output format: {}'.format(args.output_format))
    save_function({task_name: babi_plus_dialogues}, args.babi_plus_root)
    print_stats()
def collect_babi_slot_value_pps(in_babi_root, in_slot_values):
    """Extract slot-value PPs from all task1-API-calls dataset files.

    Concatenates the dialogues of every matching dataset file and delegates
    to extract_slot_value_pps with the given slot values. (PPs presumably
    means prepositional phrases — confirm against extract_slot_value_pps.)
    """
    dataset_files = get_files_list(in_babi_root, 'task1-API-calls')
    babi_files = [(filename, read_task(filename))
                  for filename in dataset_files]
    # flatten with extend() instead of reduce(+): reduce is not a builtin
    # under Python 3 and repeated list concatenation is quadratic
    full_babi = []
    for _, task in babi_files:
        full_babi.extend(task)
    return extract_slot_value_pps(full_babi, in_slot_values)