def test_read_all_file_body(s3, land_path):
    """
    Check that ``read_all_file_body`` returns the same text as the local
    ``table1.csv`` test file once the land folder has been set up via
    ``set_up_s3``.

    ``s3`` and ``land_path`` are presumably pytest fixtures defined
    elsewhere (not visible in this file section).
    """

    from data_linter.utils import read_all_file_body

    data_dir = "tests/data/end_to_end1/"
    land_dir = "tests/data/end_to_end1/land/"

    with open(os.path.join(data_dir, "config.yaml")) as config_file:
        config = yaml.safe_load(config_file)
    with open(os.path.join(land_dir, "table1.csv")) as csv_file:
        local_body = csv_file.read()

    # Point the config at the land location supplied to the test.
    config["land-base-path"] = land_path
    set_up_s3(s3, land_dir, config)

    # Read the base path back from the config only after set_up_s3 has run.
    # The base path is concatenated directly, so it is expected to end
    # with a path separator.
    remote_path = f"{config['land-base-path']}table1.csv"
    remote_body = read_all_file_body(remote_path)

    # Normalise Windows line endings (\r\n) to \n on both sides so the
    # comparison does not depend on how the file was written or read.
    assert remote_body.replace("\r\n", "\n") == local_body.replace("\r\n", "\n")
Esempio n. 2
0
def para_collect_all_logs(config: Union[str, dict] = "config.yaml"):
    """
    Merges the temporary log files into the main log and removes the
    temporary folder.

    The files found in the "init", "val" and "status" folders under the
    temporary log base path are read in that order, concatenated, and
    passed to ``upload_log`` with the main log path as destination. The
    "data_linter_temporary_fs" folder under the log base path is then
    emptied (S3) or removed (local).

    Args:
        config: A path to a config file or an already loaded config dict;
            it is passed through ``load_and_validate_config``.
    """

    config = load_and_validate_config(config)

    log_base_path = config["log-base-path"]
    main_log_path = get_main_log_path_from_config(config)
    on_s3 = log_base_path.startswith("s3://")

    temp_base = get_temp_log_basepath(config)
    stage_folders = [
        os.path.join(temp_base, stage) for stage in ("init", "val", "status")
    ]

    # Pick the listing function once; only the one needed is looked up.
    list_folder = (
        get_filepaths_from_s3_folder if on_s3 else get_filepaths_from_local_folder
    )

    # List every stage folder before any file is read.
    stage_files = [list_folder(folder) for folder in stage_folders]

    bodies = [
        read_all_file_body(file_path)
        for file_paths in stage_files
        for file_path in file_paths
    ]

    combined = io.StringIO()
    for body in bodies:
        combined.write(body)
    # `log` is not defined in this function; it comes from the enclosing
    # module scope.
    upload_log(log, combined, main_log_path)

    temp_folder = os.path.join(log_base_path, "data_linter_temporary_fs")

    if on_s3:
        delete_s3_folder_contents(temp_folder)
    else:
        shutil.rmtree(temp_folder, ignore_errors=True)
Esempio n. 3
0
def load_and_validate_config(config: Union[str, dict] = "config.yaml") -> dict:
    """
    Loads (if needed) and validates the config.

    Args:
        config: Either a path to a YAML config file, which is read with
            ``read_all_file_body`` and parsed with ``yaml.safe_load``, or
            an already loaded config dict.

    Returns:
        The result of ``_validate_and_clean_config`` applied to the config.

    Raises:
        TypeError: If ``config`` is neither a str nor a dict.
    """

    # Reject unsupported input types before doing any I/O.
    if not isinstance(config, (str, dict)):
        raise TypeError("Input 'config' must be a str or dict.")

    if isinstance(config, str):
        raw_text = read_all_file_body(config)
        config = yaml.safe_load(raw_text)

    return _validate_and_clean_config(config)
def test_bin_pack_configs(s3, max_bin_count):
    """
    Check that ``validation.bin_pack_configs`` writes the expected
    bin-packed configs.

    Each file listed under ``data_linter_temporary_storage/configs`` is
    compared with the pre-computed ``bin_pack/config_{max_bin_count}_{i}.yml``
    file in the test data folder. A missing key ("NoSuchKey") is only
    accepted when the expected config is empty.

    ``s3`` and ``max_bin_count`` are presumably pytest fixtures/parameters
    defined elsewhere (not visible in this file section).
    """

    from data_linter import validation
    from data_linter.utils import read_all_file_body
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder
    from botocore.exceptions import ClientError

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config_matched_files.yml")

    with open(config_path) as yml:
        config = yaml.safe_load(yml)

    set_up_s3(s3, land_folder, config)

    validation.bin_pack_configs(config, max_bin_count)

    land_base_path = config["land-base-path"]

    all_bin_packed_configs = get_filepaths_from_s3_folder(
        f"{land_base_path}/data_linter_temporary_storage/configs")

    for i, file_path in enumerate(all_bin_packed_configs):
        bin_pack_path = os.path.join(
            test_folder, f"bin_pack/config_{max_bin_count}_{i}.yml")
        with open(bin_pack_path) as yml:
            pre_bin_packed = yaml.safe_load(yml)

        try:
            actual_bin_pack = yaml.safe_load(read_all_file_body(file_path))
        except ClientError as e:
            # Only a missing key is an expected outcome. Any other client
            # error (permissions, bad bucket, ...) must fail the test
            # rather than be silently ignored.
            if e.response["Error"]["Code"] != "NoSuchKey":
                raise
            assert pre_bin_packed is None
        else:
            assert actual_bin_pack == pre_bin_packed