from collections import Counter
from math import ceil
from multiprocessing import Pool
from os import path, remove
from random import shuffle

from tqdm import tqdm

# Project-specific helpers and constants (Vocabulary, PreprocessingConfig, DATA_FOLDER,
# DESCRIPTION_FILE, create_folder, count_lines_in_file, _get_id2value_from_csv,
# _parse_token, _read_file_by_batch, _convert_raw_buffer, _vocab_from_counters)
# are defined elsewhere in this package.


def preprocess_csv(data_folder: str, dataset_name: str, holdout_name: str, is_shuffled: bool):
    """Preprocess the tokens, node_types, paths, and path_contexts CSV files of a holdout
    into a single human-readable `.c2s` file."""
    dataset_path = path.join(data_folder, dataset_name)
    id_to_token_data_path = path.join(dataset_path, f"tokens.{holdout_name}.csv")
    id_to_type_data_path = path.join(dataset_path, f"node_types.{holdout_name}.csv")
    id_to_paths_data_path = path.join(dataset_path, f"paths.{holdout_name}.csv")
    path_contexts_path = path.join(dataset_path, f"path_contexts.{holdout_name}.csv")
    output_c2s_path = path.join(dataset_path, f"{dataset_name}.{holdout_name}.c2s")

    # Path id -> list of node-type ids along the AST path.
    id_to_paths = _get_id2value_from_csv(id_to_paths_data_path)
    id_to_paths = {index: nodes.split() for index, nodes in id_to_paths.items()}

    # Node-type id -> node-type name (drop the last space-separated field of each entry).
    id_to_node_types = _get_id2value_from_csv(id_to_type_data_path)
    id_to_node_types = {
        index: node_type.rsplit(" ", maxsplit=1)[0]
        for index, node_type in id_to_node_types.items()
    }

    # Token id -> token string.
    id_to_tokens = _get_id2value_from_csv(id_to_token_data_path)

    if path.exists(output_c2s_path):
        remove(output_c2s_path)
    with open(path_contexts_path, "r") as path_contexts_file, \
            open(output_c2s_path, "a+") as c2s_output:
        output_lines = []
        for line in tqdm(path_contexts_file, total=count_lines_in_file(path_contexts_path)):
            label, *path_contexts = line.split()
            parsed_line = [label]
            for path_context in path_contexts:
                # Each context is "from_token_id,path_id,to_token_id"; resolve every id to its string form.
                from_token_id, path_types_id, to_token_id = path_context.split(",")
                from_token, to_token = id_to_tokens[from_token_id], id_to_tokens[to_token_id]
                nodes = [id_to_node_types[p_] for p_ in id_to_paths[path_types_id]]
                parsed_line.append(",".join([from_token, "|".join(nodes), to_token]))
            output_lines.append(" ".join(parsed_line + ["\n"]))
        if is_shuffled:
            shuffle(output_lines)
        c2s_output.write("".join(output_lines))
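
# Usage sketch for `preprocess_csv` (the folder layout and dataset name below are
# hypothetical examples, not taken from this module):
#
#     preprocess_csv("data", "java-small", "train", is_shuffled=True)
#     # reads  data/java-small/{tokens,node_types,paths,path_contexts}.train.csv
#     # writes data/java-small/java-small.train.c2s, one sample per line:
#     #   "<label> <from_token>,<type_1|...|type_n>,<to_token> ..."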
def convert_holdout(holdout_name: str, vocab: Vocabulary, config: PreprocessingConfig, n_jobs: int):
    """Split a `.c2s` holdout into buffers of `config.buffer_size` lines and convert them
    in parallel with `n_jobs` worker processes."""
    holdout_data_path = path.join(DATA_FOLDER, config.dataset_name, f"{config.dataset_name}.{holdout_name}.c2s")
    holdout_output_folder = path.join(DATA_FOLDER, config.dataset_name, holdout_name)
    create_folder(holdout_output_folder)

    # Initialize the holdout description file with its CSV header.
    with open(path.join(holdout_output_folder, DESCRIPTION_FILE), "w") as desc_file:
        desc_file.write("id,filename,n_samples,n_paths\n")

    with Pool(n_jobs) as pool:
        results = pool.imap(
            _convert_raw_buffer,
            (
                (lines, config, vocab, holdout_output_folder, pos)
                for pos, lines in enumerate(_read_file_by_batch(holdout_data_path, config.buffer_size))
            ),
        )
        n_buffers = ceil(count_lines_in_file(holdout_data_path) / config.buffer_size)
        # `imap` is lazy: drain the iterator to force all conversions; tqdm only reports progress.
        _ = [_ for _ in tqdm(results, total=n_buffers)]
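
# Usage sketch for `convert_holdout` (hypothetical arguments; `vocab` and `config`
# come from `collect_vocabulary` below and the package's `PreprocessingConfig`):
#
#     convert_holdout("train", vocab, config, n_jobs=4)
#     # reads  <DATA_FOLDER>/<dataset_name>/<dataset_name>.train.c2s
#     # writes results under <DATA_FOLDER>/<dataset_name>/train/, processing
#     # ceil(n_lines / config.buffer_size) buffers with `_convert_raw_buffer`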
def collect_vocabulary(config: PreprocessingConfig) -> Vocabulary:
    """Count target, token, and node-type occurrences over the train holdout and build the vocabulary."""
    target_counter = Counter()
    token_counter = Counter()
    type_counter = Counter()
    train_data_path = path.join(DATA_FOLDER, config.dataset_name, f"{config.dataset_name}.train.c2s")
    with open(train_data_path, "r") as train_file:
        for line in tqdm(train_file, total=count_lines_in_file(train_data_path)):
            label, *path_contexts = line.split()
            # The label may be split into subtokens depending on the config.
            target_counter.update(_parse_token(label, config.split_target))
            cur_tokens = []
            cur_types = []
            for path_context in path_contexts:
                from_token, path_types, to_token = path_context.split(",")
                cur_tokens += _parse_token(from_token, config.split_names) + _parse_token(to_token, config.split_names)
                cur_types += path_types.split("|")
            token_counter.update(cur_tokens)
            type_counter.update(cur_types)
    return _vocab_from_counters(config, token_counter, target_counter, type_counter)
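
# End-to-end sketch of the preprocessing pipeline built from the functions above
# (dataset name, holdout names, and the `PreprocessingConfig` construction are hypothetical):
#
#     for holdout in ("train", "val", "test"):
#         preprocess_csv(DATA_FOLDER, "java-small", holdout, is_shuffled=True)
#     config = PreprocessingConfig(dataset_name="java-small", ...)  # real constructor defined elsewhere
#     vocab = collect_vocabulary(config)
#     for holdout in ("train", "val", "test"):
#         convert_holdout(holdout, vocab, config, n_jobs=4)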