from collections import Counter
from math import ceil
from multiprocessing import Pool
from os import path, remove
from random import shuffle

from tqdm import tqdm

# Project-local names used below (_get_id2value_from_csv, count_lines_in_file,
# create_folder, _convert_raw_buffer, _read_file_by_batch, _parse_token,
# _vocab_from_counters, Vocabulary, PreprocessingConfig, DATA_FOLDER,
# DESCRIPTION_FILE) are assumed to be defined or imported elsewhere in this module.


def preprocess_csv(data_folder: str, dataset_name: str, holdout_name: str,
                   is_shuffled: bool):
    """
    Merge the holdout's tokens, node_types, paths and path_contexts CSV files
    into a single <dataset_name>.<holdout_name>.c2s file.
    """
    dataset_path = path.join(data_folder, dataset_name)
    id_to_token_data_path = path.join(dataset_path,
                                      f"tokens.{holdout_name}.csv")
    id_to_type_data_path = path.join(dataset_path,
                                     f"node_types.{holdout_name}.csv")
    id_to_paths_data_path = path.join(dataset_path,
                                      f"paths.{holdout_name}.csv")
    path_contexts_path = path.join(dataset_path,
                                   f"path_contexts.{holdout_name}.csv")
    output_c2s_path = path.join(dataset_path,
                                f"{dataset_name}.{holdout_name}.c2s")

    # path id -> list of node-type ids along the path
    id_to_paths = _get_id2value_from_csv(id_to_paths_data_path)
    id_to_paths = {index: nodes.split() for index, nodes in id_to_paths.items()}

    # node-type id -> node type; drop the trailing space-separated field
    # (e.g. a direction marker) stored next to the type name
    id_to_node_types = _get_id2value_from_csv(id_to_type_data_path)
    id_to_node_types = {
        index: node_type.rsplit(" ", maxsplit=1)[0]
        for index, node_type in id_to_node_types.items()
    }

    id_to_tokens = _get_id2value_from_csv(id_to_token_data_path)

    # Rebuild the output .c2s file from scratch on every run
    if path.exists(output_c2s_path):
        remove(output_c2s_path)
    with open(path_contexts_path, "r") as path_contexts_file, \
            open(output_c2s_path, "a+") as c2s_output:
        output_lines = []
        for line in tqdm(path_contexts_file,
                         total=count_lines_in_file(path_contexts_path)):
            label, *path_contexts = line.split()
            parsed_line = [label]
            for path_context in path_contexts:
                # Each context is "<from_token_id>,<path_id>,<to_token_id>"
                from_token_id, path_types_id, to_token_id = path_context.split(",")
                from_token = id_to_tokens[from_token_id]
                to_token = id_to_tokens[to_token_id]
                # Resolve the path id into its sequence of node types
                nodes = [id_to_node_types[p] for p in id_to_paths[path_types_id]]
                parsed_line.append(",".join([from_token, "|".join(nodes), to_token]))
            output_lines.append(" ".join(parsed_line) + "\n")
        if is_shuffled:
            shuffle(output_lines)
        c2s_output.write("".join(output_lines))
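

# Example of the transformation performed by preprocess_csv (values are
# illustrative, not taken from a real dataset): given the path_contexts row
#     foo 1,2,3
# with tokens.csv mapping 1 -> "x" and 3 -> "y", paths.csv mapping 2 -> "10 11",
# and node_types.csv entries that resolve to "nameExpr" and "methodCall" for
# ids 10 and 11, the resulting .c2s line is
#     foo x,nameExpr|methodCall,y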


def convert_holdout(holdout_name: str, vocab: Vocabulary, config: PreprocessingConfig, n_jobs: int):
    """
    Split a holdout's .c2s file into buffers and convert them in parallel worker processes.
    """
    holdout_data_path = path.join(DATA_FOLDER, config.dataset_name, f"{config.dataset_name}.{holdout_name}.c2s")
    holdout_output_folder = path.join(DATA_FOLDER, config.dataset_name, holdout_name)
    create_folder(holdout_output_folder)
    # Write the header of the description file; per-buffer rows are appended
    # later (presumably by _convert_raw_buffer in the worker processes)
    with open(path.join(holdout_output_folder, DESCRIPTION_FILE), "w") as desc_file:
        desc_file.write("id,filename,n_samples,n_paths\n")
    with Pool(n_jobs) as pool:
        results = pool.imap(
            _convert_raw_buffer,
            (
                (lines, config, vocab, holdout_output_folder, pos)
                for pos, lines in enumerate(_read_file_by_batch(holdout_data_path, config.buffer_size))
            ),
        )
        n_buffers = ceil(count_lines_in_file(holdout_data_path) / config.buffer_size)
        # Drain the iterator only to drive the workers and display progress
        for _ in tqdm(results, total=n_buffers):
            pass
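

# Illustrative call (the holdout name and n_jobs value are assumptions, not
# project defaults); `vocab` and `config` are expected to come from
# collect_vocabulary below and the project's configuration, respectively:
#
#   convert_holdout("val", vocab, config, n_jobs=4)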


def collect_vocabulary(config: PreprocessingConfig) -> Vocabulary:
    """
    Count target, token and node-type occurrences in the training .c2s file and build the vocabulary.
    """
    target_counter = Counter()  # sub-tokens of the target labels
    token_counter = Counter()   # sub-tokens of path-context terminal tokens
    type_counter = Counter()    # node types occurring along paths
    train_data_path = path.join(DATA_FOLDER, config.dataset_name, f"{config.dataset_name}.train.c2s")
    with open(train_data_path, "r") as train_file:
        for line in tqdm(train_file, total=count_lines_in_file(train_data_path)):
            label, *path_contexts = line.split()
            target_counter.update(_parse_token(label, config.split_target))
            cur_tokens = []
            cur_types = []
            for path_context in path_contexts:
                # Each context is "<from_token>,<type_1|...|type_k>,<to_token>"
                from_token, path_types, to_token = path_context.split(",")
                cur_tokens += _parse_token(from_token, config.split_names) + _parse_token(to_token, config.split_names)
                cur_types += path_types.split("|")
            token_counter.update(cur_tokens)
            type_counter.update(cur_types)
    return _vocab_from_counters(config, token_counter, target_counter, type_counter)
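

# End-to-end sketch of the preprocessing pipeline defined in this module
# (holdout names, the shuffling policy and n_jobs are illustrative assumptions):
#
#   for holdout in ("train", "val", "test"):
#       preprocess_csv(DATA_FOLDER, config.dataset_name, holdout,
#                      is_shuffled=(holdout == "train"))
#   vocab = collect_vocabulary(config)
#   for holdout in ("train", "val", "test"):
#       convert_holdout(holdout, vocab, config, n_jobs=4)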