def validate_from_chunked_configs(config: dict, config_num: int) -> bool:
    """
    Load every chunked worker config stored in the temporary log area for
    worker ``config_num`` and run validation against each one.

    Args:
        config: A data linter config (used to locate the temporary log area).
        config_num: The worker/bin number whose chunked configs to validate.

    Returns:
        False if no chunked config files were found for this worker,
        True once every found config has been validated.

    Raises:
        ValueError: If the land base path is not an ``s3://`` URI — local
            land paths are not supported for parallel running.
    """
    land_base_path = config["land-base-path"]
    # Guard clause instead of wrapping the whole body in an if/else.
    # Note: error message spelling fixed ("parrallel" -> "parallel").
    if not land_base_path.startswith("s3://"):
        raise ValueError("Local land path not supported for parallel running")

    tmp_log_bp = get_temp_log_basepath(config)
    s3_temp_path = os.path.join(tmp_log_bp, "configs", str(config_num))
    config_file_paths = get_filepaths_from_s3_folder(s3_temp_path)
    if not config_file_paths:
        # Nothing was bin-packed for this worker number.
        return False

    s3_client = boto3.client("s3")
    all_configs = []
    for config_file_path in config_file_paths:
        bucket, key = s3_path_to_bucket_key(config_file_path)
        config_file_obj = s3_client.get_object(Bucket=bucket, Key=key)
        all_configs.append(yaml.safe_load(config_file_obj["Body"].read()))

    # Fix: the original looped with ``for config in all_configs``, which
    # clobbered the ``config`` parameter; use a distinct name.
    for chunk_config in all_configs:
        validate_data(chunk_config)
    return True
def para_run_init(max_bin_count: int, config: Union[str, dict] = "config.yaml"):
    """
    First stage of a parallel run: load and validate the config, clear any
    previous temporary log area, match files in land to the config, then
    bin-pack the matched files into up to ``max_bin_count`` worker configs.

    Args:
        max_bin_count: Maximum number of worker configs (bins) to create.
        config: Either a path to a config yaml or an in-memory config dict.
            Defaults to "config.yaml".

    Raises:
        Exception: Re-raises any error hit during init after uploading the
            log to the main log path (if one could be determined by then).
    """
    log.info("Loading config for parallelisation")
    log_path = None
    try:
        config = load_and_validate_config(config)
        _del_path(get_temp_log_basepath(config))
        log_path = get_main_log_path_from_config(config)
        config = match_files_in_land_to_config(config)
        bin_pack_configs(config, max_bin_count)
        log.info("Running validation")
    except Exception as e:
        log.error(f"Unexpected error. Uploading log to {log_path} before raising error.")
        log.error(str(e))
        upload_log(log, log_stringio, log_path)
        # Bare ``raise`` already preserves the original traceback;
        # ``raise e.with_traceback(e.__traceback__)`` was a no-op wrapper.
        raise
    else:
        # On success the init log goes to the temporary log area so that
        # para_collect_all_logs can merge it into the main log later.
        upload_log(log, log_stringio, get_temp_log_path_from_config(config))
def _write_table_response_json(out_path: str, table_response: dict):
    """Serialize a single table response to ``out_path`` as JSON; log the
    payload before re-raising if serialization fails (aids debugging
    non-serializable responses)."""
    try:
        with open(out_path, "w") as json_out:
            json.dump(table_response, json_out)
    except Exception:
        log.info(table_response)
        raise


def save_completion_status(config: dict, all_table_responses: List[dict]):
    """
    Saves the status of the table linting to a file to be collected later.

    Each table response is written as a uniquely named JSON file under the
    temporary log area's ``status`` folder — on S3 when the log base path is
    an ``s3://`` URI, otherwise on the local filesystem.

    Args:
        config: A data linter config
        all_table_responses: a list of dictionaries detailing whether it
            passed or failed linting, the validator response, the file
            linted, and the table name
    """
    log_base_path_is_s3 = config["log-base-path"].startswith("s3://")
    temp_status_basepath = os.path.join(get_temp_log_basepath(config), "status")
    for table_response in all_table_responses:
        if log_base_path_is_s3:
            # Prefix the temp file with the original file's stem so status
            # files are attributable, while NamedTemporaryFile guarantees
            # uniqueness across workers.
            og_file_name = os.path.basename(
                table_response["original-path"]).split(".")[0]
            with tempfile.NamedTemporaryFile(
                suffix=".json", prefix=og_file_name
            ) as tmp_file:
                _write_table_response_json(tmp_file.name, table_response)
                tmp_file_name = os.path.basename(tmp_file.name)
                s3_temp_path = os.path.join(temp_status_basepath, tmp_file_name)
                local_file_to_s3(tmp_file.name, s3_temp_path)
        else:
            # exist_ok=True already tolerates the directory existing; the
            # previous os.path.exists() pre-check was redundant.
            os.makedirs(temp_status_basepath, exist_ok=True)
            fd, tmp_filename = tempfile.mkstemp(
                suffix=".json", dir=temp_status_basepath
            )
            # Fix: mkstemp returns an OPEN file descriptor which the
            # original code never closed (fd leak); close it and reopen by
            # name for the json dump.
            os.close(fd)
            _write_table_response_json(tmp_filename, table_response)
def para_collect_all_logs(config: Union[str, dict] = "config.yaml"):
    """
    Final stage of a parallel run: merge every temporary ``init``, ``val``
    and ``status`` log into the single main log, then delete the temporary
    log area.

    Args:
        config: Either a path to a config yaml or an in-memory config dict.
            Defaults to "config.yaml".
    """
    config = load_and_validate_config(config)
    log_base_path = config["log-base-path"]
    final_log_path = get_main_log_path_from_config(config)
    on_s3 = log_base_path.startswith("s3://")
    temp_base = get_temp_log_basepath(config)

    # Pick the lister matching the storage backend, then gather the file
    # paths for each temporary log subfolder in a fixed order.
    lister = get_filepaths_from_s3_folder if on_s3 else get_filepaths_from_local_folder
    collected_paths = []
    for subfolder in ("init", "val", "status"):
        collected_paths.extend(lister(os.path.join(temp_base, subfolder)))

    # Concatenate every log body into one in-memory stream and upload it.
    merged = io.StringIO()
    for file_path in collected_paths:
        merged.write(read_all_file_body(file_path))
    upload_log(log, merged, final_log_path)

    # Remove the temporary filesystem used during the parallel run.
    cleanup_path = os.path.join(log_base_path, "data_linter_temporary_fs")
    if on_s3:
        delete_s3_folder_contents(cleanup_path)
    else:
        shutil.rmtree(cleanup_path, ignore_errors=True)
def run_validation(config: Union[str, dict] = "config.yaml"):
    """
    Runs end to end validation based on config.

    Args:
        config (Union[str, dict], optional): Either a string specifying the
            path to a config yaml. Or a dict of a config in memory.
            Defaults to "config.yaml".

    Raises:
        Exception: States where the log is written if an error is hit during
            validation, then re-raises with the original traceback.
    """
    log.info("Loading config")
    log_path = None
    try:
        config = load_and_validate_config(config)
        log_path = get_main_log_path_from_config(config)
        # Clear any leftover temporary log area from a previous run.
        _del_path(get_temp_log_basepath(config))
        log.info("Running validation")
        config = match_files_in_land_to_config(config)
        validate_data(config)
        collect_all_status(config)
    except Exception as e:
        log.error(f"Unexpected error. Uploading log to {log_path} before raising error.")
        log.error(str(e))
        upload_log(log, log_stringio, log_path)
        # Bare ``raise`` already preserves the original traceback;
        # ``raise e.with_traceback(e.__traceback__)`` was a no-op wrapper.
        raise
    else:
        upload_log(log, log_stringio, log_path)
def test_validation_multiple_workers_no_init(s3, monkeypatch):
    """
    Example of driving DL across multiple workers without the init stage —
    useful when you want to pin specific datasets to specific workers. Here
    each worker validates exactly one table: [worker]x2 -> [closedown]
    """
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)
    import boto3
    from data_linter import validation
    from data_linter.logging_functions import get_temp_log_basepath
    from dataengineeringutils3.s3 import (
        s3_path_to_bucket_key,
        get_filepaths_from_s3_folder,
    )

    client = boto3.client("s3")
    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    worker_config_path = os.path.join(get_temp_log_basepath(config), "configs")
    log_bucket, worker_base_key = s3_path_to_bucket_key(worker_config_path)

    config = validation.load_and_validate_config(config)
    config = validation.match_files_in_land_to_config(config)

    # Worker 0 only processes table1: drop the other table from a copy of
    # the config and upload it to worker 0's slot on s3.
    table1_only_conf = deepcopy(config)
    del table1_only_conf["tables"]["table2"]
    client.put_object(
        Body=yaml.dump(table1_only_conf).encode("utf-8"),
        Bucket=log_bucket,
        Key=f"{worker_base_key}/0/config.yml",
    )

    # Worker 1 only processes table2, uploaded to worker 1's slot.
    table2_only_conf = deepcopy(config)
    del table2_only_conf["tables"]["table1"]
    client.put_object(
        Body=yaml.dump(table2_only_conf).encode("utf-8"),
        Bucket=log_bucket,
        Key=f"{worker_base_key}/1/config.yml",
    )

    validation.para_run_validation(0, config)
    validation.para_run_validation(1, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Everything should have moved land -> pass, with nothing failing.
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files
def collect_all_status(config: dict):
    """
    collects the status files saved and determines whether the linting was a
    success or not and copies/removes/compresses the files to and from the
    correct places

    Args:
        config: the config as given at the beginning with the paths of where
            to collect and save data from as well as compression,
            remove-on-pass etc.

    Raises:
        ValueError: if any table failed and "all-must-pass" is set.
    """
    # Single timestamp shared by every output path produced in this run.
    utc_ts = int(datetime.utcnow().timestamp())
    land_base_path = config["land-base-path"]
    all_must_pass = config.get("all-must-pass", False)
    pass_base_path = config["pass-base-path"]
    log_base_path = config["log-base-path"]
    fail_base_path = config.get("fail-base-path")
    remove_on_pass = config.get("remove-tables-on-pass")
    compress = config.get("compress-data")
    timestamp_partition_name = config.get("timestamp-partition-name")
    land_base_path_is_s3 = land_base_path.startswith("s3://")
    log_base_path_is_s3 = log_base_path.startswith("s3://")
    temp_status_basepath = os.path.join(get_temp_log_basepath(config), "status")
    # Gather every per-table status JSON written by save_completion_status,
    # from S3 or the local filesystem depending on the log base path.
    if log_base_path_is_s3:
        status_file_paths = get_filepaths_from_s3_folder(temp_status_basepath)
        s3_client = boto3.client("s3")
        all_table_response = []
        for status_file_path in status_file_paths:
            bucket, key = s3_path_to_bucket_key(status_file_path)
            status_file_obj = s3_client.get_object(Bucket=bucket, Key=key)
            all_table_response.append(
                json.loads(status_file_obj["Body"].read()))
    else:
        status_file_paths = get_filepaths_from_local_folder(
            temp_status_basepath)
        all_table_response = []
        for status_file_path in status_file_paths:
            with open(status_file_path) as json_in:
                all_table_response.append(json.load(json_in))
    # A run passed overall only if every response is valid.
    all_tables_passed = True
    pass_count = sum([i["valid"] for i in all_table_response])
    if pass_count != len(all_table_response):
        all_tables_passed = False
    # Routing decision: either everything goes to the fail path
    # (all-must-pass with at least one failure) or each file goes to its
    # respective pass/fail path. Exactly one of these flags is set True.
    there_was_a_fail = False
    all_tables_to_fail = False
    all_tables_to_respective = False
    if all_tables_passed:
        all_tables_to_respective = True
    else:
        if all_must_pass:
            all_tables_to_fail = True
        else:
            all_tables_to_respective = True
    for i, table_response in enumerate(all_table_response):
        table_name = table_response.get("table-name")
        matched_file = table_response.get("original-path")
        file_basename = os.path.basename(matched_file)
        if all_tables_to_fail:
            # all-must-pass and something failed: every file (even valid
            # ones) is archived to the fail path.
            there_was_a_fail = True
            final_outpath = get_out_path(
                fail_base_path,
                table_name,
                utc_ts,
                file_basename,
                compress=compress,
                filenum=i,
                timestamp_partition_name=timestamp_partition_name,
            )
            if compress:
                log.info(
                    f"Compressing file from {matched_file} to {final_outpath}")
                compress_data(matched_file, final_outpath)
            else:
                log.info(
                    f"Copying file from {matched_file} to {final_outpath}")
                copy_data(matched_file, final_outpath)
        elif all_tables_to_respective:
            if table_response["valid"]:
                # Valid file: archive to the pass path, optionally removing
                # the original from land afterwards.
                final_outpath = get_out_path(
                    pass_base_path,
                    table_name,
                    utc_ts,
                    file_basename,
                    compress=compress,
                    filenum=i,
                    timestamp_partition_name=timestamp_partition_name,
                )
                if compress:
                    log.info(
                        f"Compressing file from {matched_file} to {final_outpath}"
                    )
                    compress_data(matched_file, final_outpath)
                else:
                    log.info(
                        f"Copying file from {matched_file} to {final_outpath}")
                    copy_data(matched_file, final_outpath)
                if remove_on_pass:
                    log.info(f"Removing data in land: {matched_file}")
                    if land_base_path_is_s3:
                        delete_s3_object(matched_file)
                    else:
                        os.remove(matched_file)
            else:
                # Invalid file: archive to the fail path.
                there_was_a_fail = True
                final_outpath = get_out_path(
                    fail_base_path,
                    table_name,
                    utc_ts,
                    file_basename,
                    compress=compress,
                    filenum=i,
                    timestamp_partition_name=timestamp_partition_name,
                )
                if compress:
                    log.info(
                        f"Compressing file from {matched_file} to {final_outpath}"
                    )
                    compress_data(matched_file, final_outpath)
                else:
                    log.info(
                        f"Copying file from {matched_file} to {final_outpath}")
                    copy_data(matched_file, final_outpath)
        # final_outpath is always bound here: exactly one of the two routing
        # flags above is True for every iteration.
        table_response["archived-path"] = final_outpath
        # write (table specific) log
        log_outpath = get_table_log_path(log_base_path, table_name, utc_ts,
                                         filenum=i)
        if log_base_path_is_s3:
            write_json_to_s3(table_response, log_outpath)
        else:
            path_name = os.path.dirname(log_outpath)
            os.makedirs(path_name, exist_ok=True)
            with open(log_outpath, "w") as json_out:
                json.dump(table_response, json_out)
        log.info(f"log for {matched_file} uploaded to {log_outpath}")
    if there_was_a_fail and all_must_pass:
        # Strict mode: summarize every failed table, clean up, then abort.
        log.info("The following tables have failed: ")
        for failed_table in [i for i in all_table_response if not i["valid"]]:
            log.info(f"{failed_table['table-name']} failed")
            log.info(f"...original path: {failed_table['original-path']}")
            log.info(f"...out path: {failed_table['archived-path']}")
        _del_path(get_temp_log_basepath(config))
        raise ValueError("Tables did not pass linter")
    if not all_must_pass and there_was_a_fail:
        msg6 = "Some tables failed but all_must_pass set to false."
        msg6 += " Check logs for details"
        log.info(msg6)
    # Remove the temporary status/log area now that everything is archived.
    _del_path(get_temp_log_basepath(config))
def bin_pack_configs(config: dict, max_bin_count: int):
    """
    creates up to max_bin_count of config files by splitting the files from
    the config by size and grouping them into or below the average size of
    all the files

    Uses a first-fit-decreasing style pack: files are sorted by descending
    size and each bin is filled until its running size passes the average
    (total size / max_bin_count). Each resulting config is uploaded to the
    temporary log area under ``configs/<bin-number>/``.

    Args:
        config: a config file specifying the files to be linted
        max_bin_count: the maximum of bins to split the files up into -
            optimal number is equal to the amount of workers available

    Raises:
        ValueError: if the log base path is not an ``s3://`` URI — local
            paths are not supported for parallel running.
    """
    log_base_path = config.get("log-base-path")
    # Guard clause; error message spelling fixed ("parrallel" -> "parallel").
    if not log_base_path.startswith("s3://"):
        raise ValueError("Local land path not supported for parallel running")

    tmp_log_bp = get_temp_log_basepath(config)
    s3_temp_path = os.path.join(tmp_log_bp, "configs")

    # Flatten the config into one record per matched file, each carrying the
    # table's attributes plus its own table-name/file-name.
    file_list = []
    for table_name, table in config["tables"].items():
        table_sans_files = deepcopy(table)
        mfiles = table_sans_files.pop("matched_files")
        for file_name in mfiles:
            table_sans_files["file-name"] = file_name
            table_sans_files["table-name"] = table_name
            file_list.append(deepcopy(table_sans_files))

    # Attach each file's size and compute the target (average) bin size.
    acum_file_size = 0
    for index, file_size in get_file_lengths(file_list):
        file_list[index]["file-size"] = file_size
        acum_file_size += file_size
    target_bin_size = acum_file_size / max_bin_count

    # Sort descending by size so the largest files are placed first.
    file_list.sort(key=lambda x: -x["file-size"])

    bins = []
    offset = 0
    for _ in range(max_bin_count):
        curr_bin = []
        curr_bin_size = 0
        for j in range(offset, len(file_list)):
            # Every bin takes at least one file; after that it keeps filling
            # until its running size exceeds the target.
            if curr_bin and curr_bin_size > target_bin_size:
                break
            curr_bin.append(file_list[j])
            curr_bin_size += file_list[j]["file-size"]
            offset += 1
        # Fix: the original assigned bins[i] = curr_bin up to three times
        # (on break, under `if not has_been_binned`, and unconditionally);
        # a single assignment after the inner loop is equivalent.
        bins.append(curr_bin)
    bins = [b for b in bins if b != []]

    # Create and upload a worker config for each non-empty bin.
    for i, packed_bin in enumerate(bins):
        config_n = deepcopy(config)
        config_n.pop("tables")
        config_n["tables"] = {}
        for table in packed_bin:
            curr_table_name = table.pop("table-name")
            if config_n["tables"].get(curr_table_name):
                # Table already present in this bin's config: just record
                # the extra matched file.
                config_n["tables"][curr_table_name][
                    "matched_files"].append(table["file-name"])
            else:
                # First file for this table: copy all attributes across,
                # dropping the bookkeeping keys.
                mfile = table.pop("file-name")
                table.pop("file-size")
                config_n["tables"][curr_table_name] = deepcopy(table)
                config_n["tables"][curr_table_name]["matched_files"] = [mfile]
        # Serialize the worker config and upload it to its numbered slot in
        # the temporary config area.
        with tempfile.NamedTemporaryFile(suffix=".yml",
                                         prefix="config_") as tmp_file:
            with open(tmp_file.name, "w") as yaml_out:
                yaml.dump(config_n, yaml_out, default_flow_style=False)
            tmp_file_name = os.path.basename(tmp_file.name)
            s3_out_path = os.path.join(s3_temp_path, str(i), tmp_file_name)
            local_file_to_s3(tmp_file.name, s3_out_path)