def test_validation_multiple_workers(s3, monkeypatch):
    """
    Simple example of how to run DL with multiple workers.
    [init] -> [worker]x4 -> [closedown]
    """
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)
    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    test_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, test_folder, config)

    validation.para_run_init(4, config)
    # Although run sequentially here, this loop can be run in parallel
    for i in range(4):
        validation.para_run_validation(i, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files

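# The worker loop in the test above runs sequentially for determinism. As a
# hedged sketch (assuming `config` is picklable and the workers share no
# in-process state), the same fan-out could run concurrently. This helper is
# illustrative only and not part of data_linter's API:
def _run_workers_in_parallel(config, n_workers=4):
    """Illustrative only: run DL workers concurrently with a process pool."""
    from concurrent.futures import ProcessPoolExecutor
    from data_linter import validation

    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = [
            executor.submit(validation.para_run_validation, i, config)
            for i in range(n_workers)
        ]
        for future in futures:
            future.result()  # re-raise any worker exception
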
def test_compression(s3):
    from data_linter.utils import compress_data

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config.yaml")
    with open(config_path) as f:
        config = yaml.safe_load(f)
    set_up_s3(s3, land_folder, config)

    test_file_uncompressed = "table2.jsonl"
    test_file_compressed = "table2.jsonl.gz"
    uncompressed_location = os.path.join(
        config["land-base-path"], test_file_uncompressed
    )
    compressed_location = os.path.join(
        config["pass-base-path"], test_file_compressed
    )
    compress_data(uncompressed_location, compressed_location)

    with tempfile.TemporaryDirectory() as d:
        with open(os.path.join(d, test_file_compressed), "wb") as file1:
            s3.meta.client.download_fileobj("pass", test_file_compressed, file1)
        with gzip.GzipFile(
            os.path.join(d, test_file_compressed), "r"
        ) as compressed_json:
            json_bytes = compressed_json.read()
        compressed_json_str = json_bytes.decode("utf-8")
    with open(os.path.join(land_folder, test_file_uncompressed)) as uncompressed_json:
        assert (
            compressed_json_str == uncompressed_json.read()
        ), "uncompressed json doesn't contain the same data as compressed json"

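# For reference, the assertion in test_compression relies on gzip
# round-tripping data unchanged. A minimal standard-library sketch of that
# property (illustrative only):
def _gzip_round_trip(payload: bytes) -> bytes:
    """Illustrative only: compressing then decompressing returns the original bytes."""
    return gzip.decompress(gzip.compress(payload))
    # e.g. _gzip_round_trip(b'{"a": 1}\n') == b'{"a": 1}\n'
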
def test_validation_single_worker(s3, monkeypatch):
    """
    Simple example of how to run DL with a single worker.
    [init] -> [worker]x1 -> [closedown]
    """
    # Need to mock S3 read for pyarrow (only for testing)
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)
    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    validation.para_run_init(1, config)
    validation.para_run_validation(0, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files

def test_read_all_file_body(s3, land_path):
    from data_linter.utils import read_all_file_body

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config.yaml")
    table_1_path = os.path.join(land_folder, "table1.csv")
    with open(config_path) as yml:
        config = yaml.safe_load(yml)
    with open(table_1_path) as f_in:
        table_1_body_actual = f_in.read()

    config["land-base-path"] = land_path
    set_up_s3(s3, land_folder, config)
    land_base_path = config["land-base-path"]
    table_1_body = read_all_file_body(f"{land_base_path}table1.csv")

    # Normalise line endings: Windows uses \r\n, Unix and macOS use \n
    table_1_body = table_1_body.replace("\r\n", "\n")
    table_1_body_actual = table_1_body_actual.replace("\r\n", "\n")
    assert table_1_body == table_1_body_actual

def test_end_to_end_single_file_config(s3, monkeypatch):
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)
    from data_linter import validation

    test_folder = "tests/data/end_to_end1/land/"
    config = {
        "land-base-path": "s3://land/",
        "fail-base-path": "s3://fail/",
        "pass-base-path": "s3://pass/",
        "log-base-path": "s3://log/",
        "compress-data": True,
        "remove-tables-on-pass": True,
        "all-must-pass": True,
        "tables": {
            "table1": {
                "required": True,
                "metadata": "tests/data/end_to_end1/meta_data/table1.json",
                "expect-header": True,
                "matched-files": ["s3://land/table1.csv"],
            }
        },
    }
    set_up_s3(s3, test_folder, config)
    validation.para_run_init(1, config)
    validation.para_run_validation(0, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

def test_end_to_end_full_path_spectrum(
    s3, tmpdir_factory, monkeypatch, land_path, fail_path, pass_path, log_path
):
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)
    from data_linter.validation import run_validation

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config.yaml")
    with open(config_path) as yml:
        config = yaml.safe_load(yml)

    if not fail_path.startswith("s3://"):
        fail_path = tmpdir_factory.mktemp(fail_path).strpath
    if not pass_path.startswith("s3://"):
        pass_path = tmpdir_factory.mktemp(pass_path).strpath
    if not log_path.startswith("s3://"):
        log_path = tmpdir_factory.mktemp(log_path).strpath

    config["land-base-path"] = land_path
    config["fail-base-path"] = fail_path
    config["pass-base-path"] = pass_path
    config["log-base-path"] = log_path

    set_up_s3(s3, land_folder, config)
    run_validation(config)

def test_parquet_linting(s3):
    from data_linter.validation import run_validation

    config = {
        "land-base-path": "s3://land/",
        "fail-base-path": "s3://fail/",
        "pass-base-path": "s3://pass/",
        "log-base-path": "s3://log/",
        "compress-data": True,
        "remove-tables-on-pass": False,
        "all-must-pass": True,
        "tables": {
            "table1": {
                "required": True,
                "metadata": "tests/data/end_to_end2/metadata/table1.json",
                "expect-header": True,
            }
        },
    }
    land_folder = "tests/data/end_to_end2/land/"
    set_up_s3(s3, land_folder, config)
    config["land-base-path"] = land_folder
    run_validation(config)

def test_mitigations(s3, config, expected_pass):
    from data_linter.validation import validate_data

    land_folder = "tests/data/mitigations/data/"
    set_up_s3(s3, land_folder, config)
    response = validate_data(config)
    assert response.result["valid"] == expected_pass

def test_end_to_end_all_validators(s3, monkeypatch, validator):
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)
    from data_linter.validation import run_validation

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config.yaml")
    with open(config_path) as f:
        config = yaml.safe_load(f)
    config["validator-engine"] = validator
    set_up_s3(s3, land_folder, config)
    run_validation(config)

def test_pandas_kwargs(s3, config_path, expected_pass):
    from data_linter.validation import validate_data

    land_folder = "tests/data/pandas_validator/"
    with open(config_path) as f:
        config = yaml.safe_load(f)
    config["tables"]["table1_na_test"]["matched_files"] = [
        "s3://land/table1_na_test.csv"
    ]
    set_up_s3(s3, land_folder, config)
    response = validate_data(config)
    assert response.result["valid"] == expected_pass

def test_bin_count(s3, monkeypatch, max_bin_count):
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)
    from data_linter import validation

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config.yaml")
    with open(config_path) as yml:
        config = yaml.safe_load(yml)
    set_up_s3(s3, land_folder, config)

    validation.para_run_init(max_bin_count, config_path)
    for i in range(max_bin_count):
        validation.para_run_validation(i, config_path)
    validation.para_collect_all_status(config_path)
    validation.para_collect_all_logs(config_path)

def test_end_to_end(s3, monkeypatch):
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)
    from data_linter.validation import run_validation

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config.yaml")
    with open(config_path) as f:
        config = yaml.safe_load(f)
    # if config.get("validator-engine", "pandas") == "pandas":
    #     config_path = os.path.join(test_folder, "config2.yaml")
    set_up_s3(s3, land_folder, config)
    run_validation(config_path)
    # Smoke-test the CLI entry point; note the exit status is not asserted
    os.system(f"python data_linter/command_line.py --config-path {config_path}")

def test_end_to_end_full_path_spectrum_parallel(
    s3,
    monkeypatch,
    tmpdir_factory,
    land_path,
    fail_path,
    pass_path,
    log_path,
):
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)
    from data_linter import validation

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config.yaml")
    max_bin_count = 3
    with open(config_path) as yml:
        config = yaml.safe_load(yml)

    if not fail_path.startswith("s3://"):
        fail_path = tmpdir_factory.mktemp(fail_path).strpath
    if not pass_path.startswith("s3://"):
        pass_path = tmpdir_factory.mktemp(pass_path).strpath
    if not log_path.startswith("s3://"):
        log_path = tmpdir_factory.mktemp(log_path).strpath

    # Use the config keys DL expects (as in test_end_to_end_full_path_spectrum)
    config["land-base-path"] = land_path
    config["fail-base-path"] = fail_path
    config["pass-base-path"] = pass_path
    config["log-base-path"] = log_path

    set_up_s3(s3, land_folder, config)
    validation.para_run_init(max_bin_count, config_path)
    for i in range(max_bin_count):
        validation.para_run_validation(i, config_path)
    validation.para_collect_all_status(config_path)
    validation.para_collect_all_logs(config_path)

def test_bin_pack_configs(s3, max_bin_count):
    from data_linter import validation
    from data_linter.utils import read_all_file_body
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder
    from botocore.exceptions import ClientError

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config_matched_files.yml")
    with open(config_path) as yml:
        config = yaml.safe_load(yml)
    set_up_s3(s3, land_folder, config)

    validation.bin_pack_configs(config, max_bin_count)

    land_base_path = config["land-base-path"]
    all_bin_packed_configs = get_filepaths_from_s3_folder(
        f"{land_base_path}/data_linter_temporary_storage/configs"
    )
    for i, file_path in enumerate(all_bin_packed_configs):
        bin_pack_path = os.path.join(
            test_folder, f"bin_pack/config_{max_bin_count}_{i}.yml"
        )
        with open(bin_pack_path) as yml:
            pre_bin_packed = yaml.safe_load(yml)
        try:
            actual_bin_pack = yaml.safe_load(read_all_file_body(file_path))
        except ClientError as e:
            # A missing key means this bin was deliberately left empty
            if e.response["Error"]["Code"] == "NoSuchKey":
                assert pre_bin_packed is None
        else:
            assert actual_bin_pack == pre_bin_packed

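# Conceptually, bin packing distributes matched files across a fixed number
# of worker configs. A minimal round-robin sketch of the idea, illustrative
# only (the real implementation is data_linter's bin_pack_configs, which may
# balance differently):
def _round_robin_bins(files, n_bins):
    """Illustrative only: distribute `files` across `n_bins` lists in round-robin order."""
    bins = [[] for _ in range(n_bins)]
    for i, f in enumerate(files):
        bins[i % n_bins].append(f)
    return bins
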
def test_validation_multiple_workers_no_init(s3, monkeypatch):
    """
    Simple example of how to run DL with multiple workers but without the
    init step. You would want to do this to specify which worker works on
    which specific dataset. In the example below we run one worker per table.

    [worker]x2 -> [closedown]
    """
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)
    import boto3
    from data_linter import validation
    from data_linter.logging_functions import get_temp_log_basepath
    from dataengineeringutils3.s3 import (
        s3_path_to_bucket_key,
        get_filepaths_from_s3_folder,
    )

    s3_client = boto3.client("s3")
    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    worker_config_path = os.path.join(get_temp_log_basepath(config), "configs")
    log_bucket, worker_base_key = s3_path_to_bucket_key(worker_config_path)

    config = validation.load_and_validate_config(config)
    config = validation.match_files_in_land_to_config(config)

    # Create a config for worker 0 that only processes table1
    # (i.e. drop the other tables from the config)
    # and write worker 0's config to S3
    worker0_conf = deepcopy(config)
    del worker0_conf["tables"]["table2"]
    s3_client.put_object(
        Body=yaml.dump(worker0_conf).encode("utf-8"),
        Bucket=log_bucket,
        Key=f"{worker_base_key}/0/config.yml",
    )

    # Create a config for worker 1 that only processes table2
    # and write worker 1's config to S3
    worker1_conf = deepcopy(config)
    del worker1_conf["tables"]["table1"]
    s3_client.put_object(
        Body=yaml.dump(worker1_conf).encode("utf-8"),
        Bucket=log_bucket,
        Key=f"{worker_base_key}/1/config.yml",
    )

    validation.para_run_validation(0, config)
    validation.para_run_validation(1, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files

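# The two hand-built worker configs above generalise to one worker per table.
# A hedged sketch (assumes each table in config["tables"] maps to its own
# worker, mirroring the put_object calls in the test; this helper is
# illustrative and not part of data_linter's API):
def _write_one_worker_config_per_table(s3_client, config, log_bucket, worker_base_key):
    """Illustrative only: upload a single-table config for each worker."""
    for worker_id, table_name in enumerate(config["tables"]):
        worker_conf = deepcopy(config)
        worker_conf["tables"] = {table_name: config["tables"][table_name]}
        s3_client.put_object(
            Body=yaml.dump(worker_conf).encode("utf-8"),
            Bucket=log_bucket,
            Key=f"{worker_base_key}/{worker_id}/config.yml",
        )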