def test_header_case_with_meta(
    uppercase_meta: bool, uppercase_data: bool, headers_ignore_case: bool
):
    """
    Tests that the correct result is given when headers_ignore_case is used and
    either the data or the metadata has capitalised column names. The expected
    result is: (uppercase_data == uppercase_meta) OR headers_ignore_case,
    i.e. both are the same case or headers_ignore_case is True.
    """
    test_folder = "tests/data/headers/"

    # get the meta and set the correct case for the col names
    metadata = Metadata.from_json(os.path.join(test_folder, "meta_data/table1.json"))
    if uppercase_meta:
        for c in metadata.columns:
            c["name"] = c["name"].upper()

    # get the data and set the correct case for the columns
    if uppercase_data:
        full_file_path = os.path.join(test_folder, "table1_uppercase.csv")
    else:
        full_file_path = os.path.join(test_folder, "table1.csv")

    # get the expected result
    expected_result = (uppercase_data == uppercase_meta) or headers_ignore_case

    # get the validator and validate
    table_params = {"headers-ignore-case": headers_ignore_case}
    pv = PandasValidator(full_file_path, table_params, metadata)
    pv.read_data_and_validate()

    # assert the result is as expected
    assert expected_result == pv.response.result["valid"]
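# A minimal sketch of how the test above might be parametrised (hypothetical;
# the project's real pytest decorator may differ). itertools.product covers all
# eight combinations of the three boolean flags, and the expected result is
# derived inside the test itself.
import itertools

import pytest

header_case_params = pytest.mark.parametrize(
    "uppercase_meta, uppercase_data, headers_ignore_case",
    list(itertools.product([True, False], repeat=3)),
)
# applied as @header_case_params above the test definition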
def generate_to_meta(
    self, arrow_schema: pa.Schema, meta_init_dict: dict = None
) -> Metadata:
    """Generates our Metadata instance from an arrow schema.

    Args:
        arrow_schema (pa.Schema): pa.Schema from an arrow table
        meta_init_dict (dict, optional): dictionary used to initialise the
            Metadata object; any "columns" key is overwritten with the columns
            derived from the arrow schema.

    Returns:
        Metadata: An agnostic metadata instance
    """
    if not meta_init_dict:
        meta_init_dict = {}

    if "columns" in meta_init_dict:
        warnings.warn("columns key found in meta_init_dict will be overwritten")

    meta_init_dict["columns"] = []
    meta_init_dict["_converted_from"] = "arrow_schema"

    for field in arrow_schema:
        meta_init_dict["columns"].append(
            {"name": field.name, "type": self.reverse_convert_col_type(field.type)}
        )

    return Metadata.from_dict(meta_init_dict)
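# A minimal usage sketch of the converter method above (assumes the owning
# class is ArrowConverter and that the agnostic type names in the comment are
# what reverse_convert_col_type returns for these arrow types):
import pyarrow as pa


def example_arrow_schema_to_meta() -> Metadata:
    arrow_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
    ac = ArrowConverter()
    meta = ac.generate_to_meta(arrow_schema, meta_init_dict={"name": "table1"})
    # meta.columns is now e.g. [{"name": "id", "type": "int64"}, ...]
    return meta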
def validate_data(config: dict) -> ValidatorResult:
    validator_engine = config.get("validator-engine", "pandas")
    validator_params = config.get("validator-engine-params", {})

    all_table_responses = []
    for table_name, table_params in config["tables"].items():
        table_params["lint-response"] = []
        if table_params["matched_files"]:
            log.info(f"Linting {table_name}")

            meta_file_path = table_params.get(
                "metadata", f"meta_data/{table_name}.json"
            )
            meta_obj = Metadata.from_json(meta_file_path)
            meta_obj.set_col_type_category_from_types()
            metadata = meta_obj.to_dict()

            for i, matched_file in enumerate(table_params["matched_files"]):
                log.info(
                    f"{matched_file} ...file {i+1} "
                    f"of {len(table_params['matched_files'])}"
                )
                validator = get_validator[validator_engine](
                    matched_file, table_params, metadata, **validator_params
                )
                validator.read_data_and_validate()
                validator.write_validation_errors_to_log()
                table_response = {
                    "valid": validator.valid,
                    "response": validator.get_response_dict(),
                    "original-path": matched_file,
                    "table-name": table_name,
                }
                if table_response["valid"]:
                    log.info("...file passed.")
                else:
                    log.info("...file failed.")
                all_table_responses.append(table_response)
        else:
            log.info(f"SKIPPING {table_name}. No files found.")

    if all_table_responses:
        save_completion_status(config, all_table_responses)

    # note: this returns the response of the last validator that was run
    return validator.response
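# A hedged sketch of the config shape validate_data reads (keys inferred from
# the function body above; paths and table names are placeholders, and the
# real data-linter config supports more options than shown here):
example_config = {
    "validator-engine": "pandas",
    "validator-engine-params": {},
    "tables": {
        "table1": {
            "metadata": "meta_data/table1.json",
            "matched_files": ["land/table1/part-0000.csv"],
        },
    },
}
# response = validate_data(example_config)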
def _get_arrow_schema(schema: Union[pa.Schema, Metadata, dict]) -> pa.Schema:
    ac = ArrowConverter()
    if isinstance(schema, Metadata):
        # convert our Metadata object to an arrow schema
        schema = ac.generate_from_meta(schema)
    elif isinstance(schema, dict):
        schema = Metadata.from_dict(schema)
        schema = ac.generate_from_meta(schema)
    elif isinstance(schema, pa.Schema):
        pass
    else:
        raise TypeError(f"schema type not allowed: {type(schema)}")
    return schema
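# A minimal usage sketch: each of the three accepted inputs resolves to a
# pa.Schema (the dict shape is assumed from the Metadata column format used
# elsewhere in this codebase):
import pyarrow as pa


def example_get_arrow_schema():
    meta_dict = {"name": "table1", "columns": [{"name": "id", "type": "int64"}]}
    from_dict = _get_arrow_schema(meta_dict)
    from_meta = _get_arrow_schema(Metadata.from_dict(meta_dict))
    from_schema = _get_arrow_schema(pa.schema([("id", pa.int64())]))
    return from_dict, from_meta, from_schema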
def generate_to_meta(self, database: str, table: str) -> Metadata:
    # get the table information
    glue_client = boto3.client("glue")
    resp = glue_client.get_table(DatabaseName=database, Name=table)

    # pull out just the columns
    columns = resp["Table"]["StorageDescriptor"]["Columns"]

    # convert the columns
    mojap_meta_cols = self.convert_columns(columns)

    # check for partitions
    partitions = resp["Table"].get("PartitionKeys")
    if partitions:
        # convert and extend the mojap_meta_cols with the partition cols
        part_cols_full = self.convert_columns(partitions)
        mojap_meta_cols.extend(part_cols_full)
        part_cols = [p["name"] for p in part_cols_full]

        # make a metadata object
        meta = Metadata(name=table, columns=mojap_meta_cols, partitions=part_cols)
    else:
        meta = Metadata(name=table, columns=mojap_meta_cols)

    # get the file format if possible
    try:
        ff = resp["Table"]["StorageDescriptor"]["Parameters"].get("classification")
    except KeyError:
        warnings.warn("unable to parse file format, please manually set")
        ff = None

    if ff:
        meta.file_format = ff.lower()

    return meta
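# A hedged usage sketch (database and table names are placeholders; calling
# this needs AWS credentials with glue:GetTable permission, and GlueTable is
# the assumed name of the class owning generate_to_meta):
def example_glue_table_to_meta() -> Metadata:
    converter = GlueTable()
    meta = converter.generate_to_meta(database="my_database", table="my_table")
    # meta now carries the glue columns, any partition keys and, where the
    # classification parameter was present, the file format
    return meta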
def generate_to_meta(
    self,
    table_meta: TableMeta,
    data_format_mapper: Callable = None,
    col_type_mapper: Callable = None,
) -> Metadata:
    """Takes a TableMeta object and converts it to our Metadata object.

    Args:
        table_meta (TableMeta): TableMeta object from etl-manager.
        data_format_mapper (Callable, optional): If not set, the function just
            sets the file_format parameter to the str in the original
            data_format of the TableMeta. If you want to use your own mapper,
            set a function object to this param,
            e.g. data_format_mapper = my_lookup_dict.get
        col_type_mapper (Callable, optional): If not set, the col type
            conversion from TableMeta -> Metadata is done using the converter's
            reverse_convert_col_type method. If you need a custom conversion,
            set a function to this parameter to use it instead of
            reverse_convert_col_type. This callable should expect the TableMeta
            col type str and return the Metadata col type str name.

    Returns:
        Metadata: An agnostic metadata instance converted from the TableMeta.
    """
    table_meta_dict = deepcopy(table_meta.to_dict())

    renamed_params = {"data_format": "file_format"}
    for old_name, new_name in renamed_params.items():
        table_meta_dict[new_name] = table_meta_dict.pop(old_name)

    if data_format_mapper:
        table_meta_dict["file_format"] = data_format_mapper(
            table_meta_dict["file_format"]
        )

    # remove etl_manager schema
    del table_meta_dict["$schema"]

    # convert columns
    etl_cols = table_meta_dict.pop("columns")
    for c in etl_cols:
        if col_type_mapper is None:
            c["type"] = self.reverse_convert_col_type(c["type"])
        else:
            c["type"] = col_type_mapper(c["type"])

    table_meta_dict["columns"] = etl_cols
    table_meta_dict["_converted_from"] = "etl_manager"

    return Metadata.from_dict(table_meta_dict)
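# A hedged usage sketch: table_meta is an etl-manager TableMeta (for example
# one read from an existing etl_manager schema file); the mapper dict is only
# illustrative and EtlManagerConverter is the assumed name of the owning class.
def example_table_meta_to_meta(table_meta: TableMeta) -> Metadata:
    emc = EtlManagerConverter()
    return emc.generate_to_meta(
        table_meta,
        data_format_mapper={"csv": "csv", "parquet": "parquet"}.get,
    )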
def generate_from_meta(
    self,
    metadata: Union[Metadata, str, dict],
    database_name: str = None,
    table_location: str = None,
    run_msck_repair=False,
):
    """
    Creates a glue table from metadata.

    arguments:
        - metadata: Metadata object, string path, or dictionary metadata.
        - database_name (optional): name of the glue database the table is to
          be created in. Can also be a property of the metadata.
        - table_location (optional): the s3 location of the table. Can also be
          a property of the metadata.
        - run_msck_repair (optional): run msck repair table on the created
          table; should be set to True for tables with partitions.

    Warns:
        - if run_msck_repair is False, the metadata has partitions, and
          options.ignore_warnings is set to False
    """
    # resolve the metadata first so that database_name and table_location can
    # be read from it when they are not passed explicitly
    metadata = Metadata.from_infer(metadata)

    # set database_name to metadata.database_name if none
    database_name = database_name if database_name else metadata.database_name
    # do the same with table_location
    table_location = table_location if table_location else metadata.table_location

    glue_client = boto3.client(
        "glue",
        region_name=os.getenv(
            "AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "eu-west-1")
        ),
    )

    boto_dict = self.gc.generate_from_meta(
        metadata, database_name=database_name, table_location=table_location
    )

    delete_table_if_exists(database=database_name, table=metadata.name)
    glue_client.create_table(**boto_dict)

    if (
        not run_msck_repair
        and metadata.partitions
        and not self.options.ignore_warnings
    ):
        w = (
            "metadata has partitions and run_msck_repair is set to False. "
            "To suppress these warnings set this converter's "
            "options.ignore_warnings = True"
        )
        warnings.warn(w)
    elif run_msck_repair:
        pydb.read_sql_query(f"msck repair table {database_name}.{metadata.name}")
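# A hedged usage sketch (bucket, database and metadata path are placeholders;
# running this needs AWS credentials that can create glue tables, and GlueTable
# is the assumed name of the class owning generate_from_meta):
def example_create_glue_table():
    gt = GlueTable()
    gt.generate_from_meta(
        "meta_data/table1.json",
        database_name="my_database",
        table_location="s3://my-bucket/table1/",
        run_msck_repair=True,
    )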
def test_headers(file_name, expected_result):
    """
    Tests files against the _read_data_and_validate function, running each file
    and its corresponding meta (table1 or table2) against the following
    additional table config params, in this order:
    - expect-header is False
    - expect-header is True and headers-ignore-case is False
    - expect-header is True and headers-ignore-case is True

    Args:
        file_name (str): The filename in the dir tests/data/headers/
        expected_result (Tuple[bool]): expected results for the 3 different
            config params listed above
    """
    test_folder = "tests/data/headers/"
    full_file_path = os.path.join(test_folder, file_name)
    table_name = file_name.split(".")[0].split("_")[0]

    metadata = Metadata.from_json(
        os.path.join(test_folder, f"meta_data/{table_name}.json")
    )

    table_params = [
        {"expect-header": False},
        {"expect-header": True, "headers-ignore-case": False},
        {"expect-header": True, "headers-ignore-case": True},
    ]

    all_tests = []
    for table_param in table_params:
        validator = PandasValidator(full_file_path, table_param, metadata)
        validator.read_data_and_validate()
        table_response = validator.response
        all_tests.append(table_response.result["valid"])

    assert expected_result == all_tests