def test_validation_multiple_workers(s3, monkeypatch):
    """
    Simple example of how to run DL for multiple workers.

    [init] -> [worker]x4 -> [closedown]
    """

    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    test_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, test_folder, config)

    validation.para_run_init(4, config)
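    # para_run_init is expected to split the matched files across the 4 workers
    # and write a per-worker config to S3 (the no-init test at the end of this
    # file builds those worker configs by hand)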

    # Although run sequentially here, these worker calls could be run in parallel
    for i in range(4):
        validation.para_run_validation(i, config)

    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files


def test_validation_single_worker(s3, monkeypatch):
    """
    Simple example of how to run DL for a single worker.

    [init] -> [worker]x1 -> [closedown]
    """

    # Need to mock S3 read for pyarrow (only for testing)
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    validation.para_run_init(1, config)
    validation.para_run_validation(0, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files


def test_end_to_end_single_file_config(s3, monkeypatch):
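    """
    Run DL end to end with a config built inline, matching a single table to a
    single file. There are no explicit assertions: the test passes if the full
    init -> validate -> closedown cycle completes without raising.
    """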

    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation

    test_folder = "tests/data/end_to_end1/land/"

    config = {
        "land-base-path": "s3://land/",
        "fail-base-path": "s3://fail/",
        "pass-base-path": "s3://pass/",
        "log-base-path": "s3://log/",
        "compress-data": True,
        "remove-tables-on-pass": True,
        "all-must-pass": True,
        "tables": {
            "table1": {
                "required": True,
                "metadata": "tests/data/end_to_end1/meta_data/table1.json",
                "expect-header": True,
                "matched-files": ["s3://land/table1.csv"],
            }
        },
    }
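    # matched-files is filled in by hand here rather than discovered from the
    # land folder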

    set_up_s3(s3, test_folder, config)

    validation.para_run_init(1, config)
    validation.para_run_validation(0, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)


def test_bin_count(s3, monkeypatch, max_bin_count):
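    """
    Run the standard end to end config with a variable number of worker bins
    (max_bin_count is assumed to be a parametrized fixture). The parallel steps
    are driven from the config file path; the dict loaded below is only needed
    to stage the land data with set_up_s3.
    """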

    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config.yaml")

    with open(config_path) as yml:
        config = yaml.safe_load(yml)

    set_up_s3(s3, land_folder, config)

    validation.para_run_init(max_bin_count, config_path)
    for i in range(max_bin_count):
        validation.para_run_validation(i, config_path)
    validation.para_collect_all_status(config_path)
    validation.para_collect_all_logs(config_path)


def test_end_to_end_full_path_spectrum_parallel(
    s3,
    monkeypatch,
    tmpdir_factory,
    land_path,
    fail_path,
    pass_path,
    log_path,
):
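    """
    Run the parallel workflow where the fail, pass and log locations may be
    either S3 URIs or local directories (the *_path fixtures are assumed to be
    parametrized over both), overriding the paths from the YAML config.
    """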
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config.yaml")
    max_bin_count = 3

    with open(config_path) as yml:
        config = yaml.safe_load(yml)

    if not fail_path.startswith("s3://"):
        fail_path = str(tmpdir_factory.mktemp(fail_path))
    if not pass_path.startswith("s3://"):
        pass_path = str(tmpdir_factory.mktemp(pass_path))
    if not log_path.startswith("s3://"):
        log_path = str(tmpdir_factory.mktemp(log_path))

    # Point the loaded config at the chosen locations, using the same
    # *-base-path keys as the rest of this file
    config["land-base-path"] = land_path
    config["fail-base-path"] = fail_path
    config["pass-base-path"] = pass_path
    config["log-base-path"] = log_path

    set_up_s3(s3, land_folder, config)

    # Run against the modified config dict (not the original file on disk) so
    # the overridden paths are actually used
    validation.para_run_init(max_bin_count, config)
    for i in range(max_bin_count):
        validation.para_run_validation(i, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)


def test_validation_multiple_workers_no_init(s3, monkeypatch):
    """
    Simple example of how to run DL for multiple workers but without using the
    init step. You would want to do this if you need to specify which worker
    processes which specific dataset. In the example below we run one worker
    per table validation.

    [worker]x2 -> [closedown]
    """

    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    import boto3
    from data_linter import validation
    from data_linter.logging_functions import get_temp_log_basepath

    from dataengineeringutils3.s3 import (
        s3_path_to_bucket_key,
        get_filepaths_from_s3_folder,
    )

    s3_client = boto3.client("s3")

    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    worker_config_path = os.path.join(get_temp_log_basepath(config), "configs")
    log_bucket, worker_base_key = s3_path_to_bucket_key(worker_config_path)

    config = validation.load_and_validate_config(config)
    config = validation.match_files_in_land_to_config(config)
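    # These two calls are assumed to cover the setup that para_run_init would
    # otherwise do: validate the config and fill in each table's matched-files
    # entry from the land area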

    # Create a config for worker 0 to only process table1
    # (i.e. drop the other tables from the config)
    # and write worker 0's config to S3
    worker0_conf = deepcopy(config)
    del worker0_conf["tables"]["table2"]
    s3_client.put_object(
        Body=yaml.dump(worker0_conf).encode("utf-8"),
        Bucket=log_bucket,
        Key=f"{worker_base_key}/0/config.yml",
    )

    # Create a config for worker 1 to only process table2
    # and write worker 1's config to S3
    worker1_conf = deepcopy(config)
    del worker1_conf["tables"]["table1"]
    s3_client.put_object(
        Body=yaml.dump(worker1_conf).encode("utf-8"),
        Bucket=log_bucket,
        Key=f"{worker_base_key}/1/config.yml",
    )
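    # With both worker configs written, each worker only sees its own slice of
    # the work: worker 0 validates table1 and worker 1 validates table2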

    validation.para_run_validation(0, config)
    validation.para_run_validation(1, config)

    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files