Example #1
def set_up_s3(mocked_s3, test_folder, config, ext_filter=None):
    """
    Used to set up mocked S3 before a run that expects data in S3
    """
    if ext_filter is None:
        ext_filter = (".csv", ".jsonl", ".parquet")
    from dataengineeringutils3.s3 import s3_path_to_bucket_key

    land_base_path = config.get("land-base-path", "s3://land/")
    fail_base_path = config.get("fail-base-path", "s3://fail/")
    pass_base_path = config.get("pass-base-path", "s3://pass/")
    log_base_path = config.get("log-base-path", "s3://log/")

    land_base_path_is_s3 = land_base_path.startswith("s3://")
    fail_base_path_is_s3 = fail_base_path.startswith("s3://")
    pass_base_path_is_s3 = pass_base_path.startswith("s3://")
    log_base_path_is_s3 = log_base_path.startswith("s3://")

    buckets = []

    if land_base_path_is_s3:
        land_bucket, _ = s3_path_to_bucket_key(land_base_path)
        buckets.append(land_bucket)
    if fail_base_path_is_s3:
        fail_bucket, _ = s3_path_to_bucket_key(fail_base_path)
        buckets.append(fail_bucket)
    if pass_base_path_is_s3:
        pass_bucket, _ = s3_path_to_bucket_key(pass_base_path)
        buckets.append(pass_bucket)
    if log_base_path_is_s3:
        log_bucket, _ = s3_path_to_bucket_key(log_base_path)
        buckets.append(log_bucket)

    for b in buckets:
        mocked_s3.meta.client.create_bucket(
            Bucket=b,
            CreateBucketConfiguration={"LocationConstraint": "eu-west-1"},
        )

    files = os.listdir(test_folder)

    if ext_filter:
        files = [f for f in files if f.endswith(ext_filter)]

    if land_base_path_is_s3:
        for filename in files:
            full_path = os.path.join(test_folder, filename)
            mocked_s3.meta.client.upload_file(full_path, land_bucket, filename)
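For illustration, a minimal sketch of how set_up_s3 might be called from a moto-backed pytest fixture (the fixture, test name and folder below are assumptions, not taken from the package):

import boto3
import pytest
from moto import mock_s3  # renamed to mock_aws in moto >= 5


@pytest.fixture
def s3():
    # Yield a mocked boto3 S3 resource; set_up_s3 reaches the client via .meta.client
    with mock_s3():
        yield boto3.resource("s3", region_name="eu-west-1")


def test_run_against_mocked_land(s3):
    config = {
        "land-base-path": "s3://land/",
        "fail-base-path": "s3://fail/",
        "pass-base-path": "s3://pass/",
        "log-base-path": "s3://log/",
    }
    # Creates the four buckets and uploads every .csv/.jsonl/.parquet file
    # from the test folder into the land bucket
    set_up_s3(s3, "tests/data/end_to_end1/land/", config)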
Example #2
def validate_from_chunked_configs(config: dict, config_num: int) -> bool:

    land_base_path = config["land-base-path"]
    land_base_path_is_s3 = land_base_path.startswith("s3://")

    if land_base_path_is_s3:
        tmp_log_bp = get_temp_log_basepath(config)
        s3_temp_path = os.path.join(tmp_log_bp, "configs", str(config_num))

        config_file_paths = get_filepaths_from_s3_folder(s3_temp_path)
        if not config_file_paths:
            return False

        s3_client = boto3.client("s3")

        all_configs = []
        for config_file_path in config_file_paths:
            bucket, key = s3_path_to_bucket_key(config_file_path)
            config_file_obj = s3_client.get_object(Bucket=bucket, Key=key)
            all_configs.append(yaml.safe_load(config_file_obj["Body"].read()))

        for chunked_config in all_configs:
            validate_data(chunked_config)

        return True

    else:
        raise ValueError("Local land path not supported for parallel running")
def compress_data(download_path: str, upload_path: str):

    download_path_is_s3 = download_path.startswith("s3://")
    upload_path_is_s3 = upload_path.startswith("s3://")

    if download_path_is_s3:
        s3_client = boto3.client("s3")

    if not upload_path_is_s3:
        upload_path_dir = os.path.dirname(upload_path)
        if not os.path.exists(upload_path_dir):
            os.makedirs(upload_path_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:

        if download_path_is_s3:
            bucket, key = s3_path_to_bucket_key(download_path)
            temp_file = os.path.join(temp_dir, key.split("/")[-1])
            with open(temp_file, "wb") as opened_temp_file:
                s3_client.download_fileobj(bucket, key, opened_temp_file)
        else:
            temp_file = os.path.join(temp_dir, os.path.basename(download_path))
            # shutil.copy creates the destination file itself, so no pre-open is needed
            shutil.copy(download_path, temp_file)

        with open(temp_file, "rb") as f_in, gzip.open(temp_file + ".gz", "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

        if upload_path_is_s3:
            write_local_file_to_s3(temp_file + ".gz", upload_path, overwrite=True)
        else:
            shutil.copy(temp_file + ".gz", upload_path)
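Both paths can independently be local or S3; a couple of hypothetical calls:

compress_data("s3://land/table1.csv", "s3://pass/table1.csv.gz")      # S3 -> S3
compress_data("tests/data/table1.csv", "/tmp/out/table1.csv.gz")      # local -> local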
def download_data(s3_path: str, local_path: str):
    s3_client = boto3.client("s3")
    dirname = os.path.dirname(local_path)
    Path(dirname).mkdir(parents=True, exist_ok=True)
    with open(local_path, "wb") as f:
        b, o = s3_path_to_bucket_key(s3_path)
        s3_client.download_fileobj(b, o, f)
def read_all_file_body(file_path: str) -> str:
    """
    Returns the text content of a file (decoding bytes to UTF-8 if the file is read
    as bytes).

    Args:
        file_path: A string specifying the location of the file to load text from;
            can be an S3 or local path.
    """
    file_path_is_s3 = file_path.startswith("s3://")

    if file_path_is_s3:
        s3_client = boto3.client("s3")
        if not check_for_s3_file(file_path):
            raise FileNotFoundError(f"Path to config: {file_path}. Not found.")
        bucket, key = s3_path_to_bucket_key(file_path)
        file_obj = s3_client.get_object(Bucket=bucket, Key=key)
        file_obj_body = file_obj["Body"].read()
    else:
        with open(file_path) as f_in:
            file_obj_body = f_in.read()

    if isinstance(file_obj_body, bytes):
        return file_obj_body.decode("utf-8")
    else:
        return file_obj_body
Example #6
def open_input_stream(s3_file_path_in: str) -> io.BytesIO:
    s3_resource = boto3.resource("s3")
    bucket, key = s3_path_to_bucket_key(s3_file_path_in)
    obj_bytes = s3_resource.Object(bucket, key).get()["Body"].read()
    obj_io_bytes = io.BytesIO(obj_bytes)
    try:
        yield obj_io_bytes
    finally:
        obj_io_bytes.close()
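Given the yield/finally body, open_input_stream is presumably decorated with contextlib.contextmanager in the original module; under that assumption, usage would look like this (the path is hypothetical):

# Assumption: the original module applies @contextlib.contextmanager to
# open_input_stream, so it can be used as a context manager
with open_input_stream("s3://somebucket/data/table1.csv") as stream:
    header_bytes = stream.read(64)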
    def write_to_s3(self):
        s3_resource = boto3.resource("s3")
        b, k = s3_path_to_bucket_key(self.get_s3_filepath())
        data = self.mem_file.getvalue()
        if self.compress_on_upload:
            data = self._compress_data(data)

        s3_resource.Object(b, k).put(Body=data)

        self.reset_file_buffer()
def local_file_to_s3(local_path: str, s3_path: str):
    s3_client = boto3.client("s3")

    if (not local_path.endswith(".gz")) and (s3_path.endswith(".gz")):
        new_path = local_path + ".gz"
        with open(local_path, "rb") as f_in, gzip.open(new_path, "wb") as f_out:
            f_out.writelines(f_in)
        local_path = new_path

    b, o = s3_path_to_bucket_key(s3_path)
    with open(local_path, "rb") as f:
        s3_client.upload_fileobj(f, b, o)
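A hypothetical call; because the target key ends in .gz, the local CSV is gzipped before upload:

local_file_to_s3("tests/data/table1.csv", "s3://pass/table1/table1.csv.gz")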
Example #9
def upload_log(log: logging.Logger, log_stringio: io.StringIO, log_path: str):
    if log_path:
        log_path_is_s3 = log_path.startswith("s3://")
        if log_path_is_s3:
            s3_client = boto3.client("s3")
            b, k = s3_path_to_bucket_key(log_path)
            s3_client.put_object(Body=log_stringio.getvalue(), Bucket=b, Key=k)
        else:
            dir_out = os.path.dirname(log_path)
            if not os.path.exists(dir_out):
                os.makedirs(dir_out, exist_ok=True)
            with open(log_path, "w") as log_out:
                log_out.write(log_stringio.getvalue())
    else:
        log.error(
            "An error occurred but no log path registered, "
            "likely due to issue with config, so logs not saved."
        )
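A minimal sketch of how the logger and its StringIO buffer might be wired up before calling upload_log (the handler setup and paths here are assumptions, not taken from the package):

import io
import logging

log = logging.getLogger("data_linter")  # hypothetical logger name
log_stringio = io.StringIO()

# Capture everything the logger emits into the in-memory buffer so that
# upload_log can later write the whole run's output in one go
handler = logging.StreamHandler(log_stringio)
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
log.addHandler(handler)
log.setLevel(logging.INFO)

log.info("validation started")
upload_log(log, log_stringio, "s3://log/run-logs/latest.log")  # hypothetical path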
Example #10
def test_download_fileobj(s3_client):
    # s3_client is a fixture defined above that yields a boto3 S3 client.
    from dataengineeringutils3.s3 import s3_path_to_bucket_key

    s3_client.create_bucket(
        Bucket="somebucket",
        CreateBucketConfiguration={"LocationConstraint": "eu-west-1"},
    )
    s3_download_path = "somebucket/"
    bucket, key = s3_path_to_bucket_key(s3_download_path)

    table1 = "table1.csv"
    test_path = "tests/data/end_to_end1/land/"
    full_path = os.path.join(test_path, table1)
    s3_client.upload_file(full_path, bucket, table1)

    with open(table1, "wb") as downloaded_file:
        s3_client.download_fileobj(bucket, table1, downloaded_file)

    with open(table1, "rb") as f_downloaded, open(full_path, "rb") as f_original:
        assert f_downloaded.read() == f_original.read()
def test_validation_multiple_workers_no_init(s3, monkeypatch):
    """
    Simple example on how to run DL for multiple workers.
    But without using the init. You would want to do this
    if you want to specify which worker works on what specific dataset.
    In the example below we run 1 worker per table validation

    [worker]x2 -> [closedown]
    """

    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    import boto3
    from data_linter import validation
    from data_linter.logging_functions import get_temp_log_basepath

    from dataengineeringutils3.s3 import (
        s3_path_to_bucket_key,
        get_filepaths_from_s3_folder,
    )

    s3_client = boto3.client("s3")

    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    worker_config_path = os.path.join(get_temp_log_basepath(config), "configs")
    log_bucket, worker_base_key = s3_path_to_bucket_key(worker_config_path)

    config = validation.load_and_validate_config(config)
    config = validation.match_files_in_land_to_config(config)

    # Create a config for worker 0 to only process table1
    # (i.e. drop the other tables from the config)
    # and write the worker 0 config to S3
    worker0_conf = deepcopy(config)
    del worker0_conf["tables"]["table2"]
    s3_client.put_object(
        Body=yaml.dump(worker0_conf).encode("utf-8"),
        Bucket=log_bucket,
        Key=f"{worker_base_key}/0/config.yml",
    )

    # Create a config for worker 1 to only process table2
    # and write the worker 1 config to S3
    worker1_conf = deepcopy(config)
    del worker1_conf["tables"]["table1"]
    s3_client.put_object(
        Body=yaml.dump(worker1_conf).encode("utf-8"),
        Bucket=log_bucket,
        Key=f"{worker_base_key}/1/config.yml",
    )

    validation.para_run_validation(0, config)
    validation.para_run_validation(1, config)

    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files
Example #12
def collect_all_status(config: dict):
    """
    collects the status files saved and determines whether the linting was a succes or
    not and copies/removes/compresses the files to and from the correct places

    Args:
    config: the config as given at the beggining with the paths of where to collect and
    save data from as well as compression, remove-on-pass etc.
    """

    utc_ts = int(datetime.utcnow().timestamp())
    land_base_path = config["land-base-path"]
    all_must_pass = config.get("all-must-pass", False)
    pass_base_path = config["pass-base-path"]
    log_base_path = config["log-base-path"]
    fail_base_path = config.get("fail-base-path")
    remove_on_pass = config.get("remove-tables-on-pass")
    compress = config.get("compress-data")
    timestamp_partition_name = config.get("timestamp-partition-name")

    land_base_path_is_s3 = land_base_path.startswith("s3://")
    log_base_path_is_s3 = log_base_path.startswith("s3://")
    temp_status_basepath = os.path.join(get_temp_log_basepath(config),
                                        "status")
    if log_base_path_is_s3:
        status_file_paths = get_filepaths_from_s3_folder(temp_status_basepath)

        s3_client = boto3.client("s3")

        all_table_response = []
        for status_file_path in status_file_paths:
            bucket, key = s3_path_to_bucket_key(status_file_path)
            status_file_obj = s3_client.get_object(Bucket=bucket, Key=key)
            all_table_response.append(
                json.loads(status_file_obj["Body"].read()))

    else:
        status_file_paths = get_filepaths_from_local_folder(
            temp_status_basepath)

        all_table_response = []
        for status_file_path in status_file_paths:
            with open(status_file_path) as json_in:
                all_table_response.append(json.load(json_in))

    all_tables_passed = all(t["valid"] for t in all_table_response)

    there_was_a_fail = False
    all_tables_to_fail = False
    all_tables_to_respective = False

    if all_tables_passed:
        all_tables_to_respective = True
    else:
        if all_must_pass:
            all_tables_to_fail = True
        else:
            all_tables_to_respective = True

    for i, table_response in enumerate(all_table_response):
        table_name = table_response.get("table-name")
        matched_file = table_response.get("original-path")
        file_basename = os.path.basename(matched_file)

        if all_tables_to_fail:
            there_was_a_fail = True
            final_outpath = get_out_path(
                fail_base_path,
                table_name,
                utc_ts,
                file_basename,
                compress=compress,
                filenum=i,
                timestamp_partition_name=timestamp_partition_name,
            )
            if compress:
                log.info(
                    f"Compressing file from {matched_file} to {final_outpath}")
                compress_data(matched_file, final_outpath)
            else:
                log.info(
                    f"Copying file from {matched_file} to {final_outpath}")
                copy_data(matched_file, final_outpath)
        elif all_tables_to_respective:
            if table_response["valid"]:
                final_outpath = get_out_path(
                    pass_base_path,
                    table_name,
                    utc_ts,
                    file_basename,
                    compress=compress,
                    filenum=i,
                    timestamp_partition_name=timestamp_partition_name,
                )
                if compress:
                    log.info(
                        f"Compressing file from {matched_file} to {final_outpath}"
                    )
                    compress_data(matched_file, final_outpath)
                else:
                    log.info(
                        f"Copying file from {matched_file} to {final_outpath}")
                    copy_data(matched_file, final_outpath)
                if remove_on_pass:
                    log.info(f"Removing data in land: {matched_file}")
                    if land_base_path_is_s3:
                        delete_s3_object(matched_file)
                    else:
                        os.remove(matched_file)

            else:
                there_was_a_fail = True
                final_outpath = get_out_path(
                    fail_base_path,
                    table_name,
                    utc_ts,
                    file_basename,
                    compress=compress,
                    filenum=i,
                    timestamp_partition_name=timestamp_partition_name,
                )
                if compress:
                    log.info(
                        f"Compressing file from {matched_file} to {final_outpath}"
                    )
                    compress_data(matched_file, final_outpath)
                else:
                    log.info(
                        f"Copying file from {matched_file} to {final_outpath}")
                    copy_data(matched_file, final_outpath)
        table_response["archived-path"] = final_outpath

        # write (table specific) log
        log_outpath = get_table_log_path(log_base_path,
                                         table_name,
                                         utc_ts,
                                         filenum=i)
        if log_base_path_is_s3:
            write_json_to_s3(table_response, log_outpath)
        else:
            path_name = os.path.dirname(log_outpath)
            os.makedirs(path_name, exist_ok=True)
            with open(log_outpath, "w") as json_out:
                json.dump(table_response, json_out)
        log.info(f"log for {matched_file} uploaded to {log_outpath}")

    if there_was_a_fail and all_must_pass:
        log.info("The following tables have failed: ")
        for failed_table in [i for i in all_table_response if not i["valid"]]:
            log.info(f"{failed_table['table-name']} failed")
            log.info(f"...original path: {failed_table['original-path']}")
            log.info(f"...out path: {failed_table['archived-path']}")
        _del_path(get_temp_log_basepath(config))
        raise ValueError("Tables did not pass linter")

    if not all_must_pass and there_was_a_fail:
        log.info(
            "Some tables failed but all-must-pass is set to false. "
            "Check logs for details."
        )

    _del_path(get_temp_log_basepath(config))
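The field names consumed above imply a status file shaped roughly like this (values are illustrative):

example_status = {
    "valid": True,                            # did the table pass linting
    "table-name": "table1",
    "original-path": "s3://land/table1.csv",  # where the matched file was found
}
# collect_all_status adds "archived-path" after the file is copied or compressed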
Example #13
def bin_pack_configs(config: dict, max_bin_count: int):
    """
    Creates up to max_bin_count config files by splitting the files from the config
    by size and grouping them into bins at or below the average size of all the files.

    Args:
        config: a config specifying the files to be linted
        max_bin_count: the maximum number of bins to split the files into - the
        optimal number is equal to the number of workers available
    """

    log_base_path = config.get("log-base-path")
    log_base_path_is_s3 = log_base_path.startswith("s3://")

    if log_base_path_is_s3:
        tmp_log_bp = get_temp_log_basepath(config)
        s3_temp_path = os.path.join(tmp_log_bp, "configs")
        file_list = []

        # create a list of dictionaries, one per file, each carrying the table attributes
        for table_name, table in config["tables"].items():
            table_sans_files = deepcopy(table)
            mfiles = table_sans_files.pop("matched_files")

            for file_name in mfiles:
                table_sans_files["file-name"] = file_name
                table_sans_files["table-name"] = table_name
                file_list.append(deepcopy(table_sans_files))

        # get the size of every file and accumulate the total
        s3_client = boto3.client("s3")
        acum_file_size = 0
        for i, file_dict in enumerate(file_list):
            file_name = file_dict["file-name"]
            bucket, key = s3_path_to_bucket_key(file_name)
            obj = s3_client.get_object(Bucket=bucket, Key=key)
            file_size = obj.get("ContentLength")
            file_list[i]["file-size"] = file_size
            acum_file_size += file_size

        target_bin_size = acum_file_size / max_bin_count

        # sort the files by size, largest first
        file_list.sort(key=lambda x: -x["file-size"])

        bins = [None] * max_bin_count

        offset = 0
        for i in range(max_bin_count):
            curr_bin = []
            curr_bin_size = 0
            has_been_binned = False
            for j in range(offset, len(file_list)):
                if len(curr_bin) == 0:
                    curr_bin.append(file_list[j])
                    curr_bin_size += file_list[j]["file-size"]
                    offset += 1
                else:
                    if curr_bin_size <= target_bin_size:
                        curr_bin.append(file_list[j])
                        curr_bin_size += file_list[j]["file-size"]
                        offset += 1
                    else:
                        bins[i] = curr_bin
                        has_been_binned = True
                        break
            if not has_been_binned:
                bins[i] = curr_bin

        # drop any bins that ended up empty (possible when there are fewer files
        # than max_bin_count)
        bins = [b for b in bins if b != []]

        # create the configs for the given bins
        for i, packed_bin in enumerate(bins):
            config_n = deepcopy(config)
            config_n.pop("tables")
            config_n["tables"] = {}

            for table in packed_bin:
                curr_table_name = table.pop("table-name")

                if config_n["tables"].get(curr_table_name):
                    # it exists, so just add to matched files
                    config_n["tables"][curr_table_name]["matched_files"].append(
                        table["file-name"]
                    )
                else:
                    # it doesn't exist, do a full copy of all attributes
                    mfile = table.pop("file-name")
                    table.pop("file-size")
                    config_n["tables"][curr_table_name] = deepcopy(table)
                    config_n["tables"][curr_table_name]["matched_files"] = []
                    config_n["tables"][curr_table_name]["matched_files"].append(mfile)

            # upload the config for this bin to temp storage, under configs/<bin number>
            with tempfile.NamedTemporaryFile(
                suffix=".yml", prefix="config_"
            ) as tmp_file:

                with open(tmp_file.name, "w") as yaml_out:
                    yaml.dump(config_n, yaml_out, default_flow_style=False)

                tmp_file_name = tmp_file.name.split("/")[-1]
                s3_out_path = os.path.join(s3_temp_path, str(i), tmp_file_name)
                local_file_to_s3(tmp_file.name, s3_out_path)

    else:
        raise ValueError("Local land path not supported for parallel running")
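The grouping loop above is effectively a greedy pass over the files sorted by descending size, filling each bin up to the average (target) size; a toy sketch of the same idea on plain numbers (sizes are made up):

# Illustrative only: greedy binning of hypothetical file sizes, largest first
sizes = [90, 60, 40, 30, 20, 10]
max_bin_count = 3
target = sum(sizes) / max_bin_count  # 83.33...

bins, offset = [], 0
for _ in range(max_bin_count):
    current, current_size = [], 0
    while offset < len(sizes) and (not current or current_size <= target):
        current.append(sizes[offset])
        current_size += sizes[offset]
        offset += 1
    bins.append(current)

print(bins)  # [[90], [60, 40], [30, 20, 10]]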
def s3_to_local(s3_path: str, local_path: str):
    s3_client = boto3.client("s3")
    bucket, key = s3_path_to_bucket_key(s3_path)

    with open(local_path, "wb") as opened_file:
        s3_client.download_fileobj(bucket, key, opened_file)
def test_s3_path_to_bucket_key(s3_path, exp_bucket, exp_key):
    bucket, key = s3_path_to_bucket_key(s3_path)
    assert bucket == exp_bucket
    assert key == exp_key
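For reference, a minimal sketch of what s3_path_to_bucket_key presumably does (the real implementation lives in dataengineeringutils3.s3 and may differ in detail):

def s3_path_to_bucket_key_sketch(s3_path: str):
    # "s3://somebucket/some/prefix/file.csv" -> ("somebucket", "some/prefix/file.csv")
    path = s3_path.replace("s3://", "", 1)
    bucket, _, key = path.partition("/")
    return bucket, key


assert s3_path_to_bucket_key_sketch("s3://somebucket/a/b.csv") == ("somebucket", "a/b.csv")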