def test_read(data_format, use_meta):
    test_data_path = f"tests/data/all_types.{data_format}"
    if use_meta:
        meta = {
            "columns": [
                {
                    "name": "my_float",
                    "type": "float64",
                    "type_category": "float"
                },
                {
                    "name": "my_bool",
                    "type": "bool_",
                    "type_category": "boolean"
                },
                {
                    "name": "my_nullable_bool",
                    "type": "bool_",
                    "type_category": "boolean",
                },
                {
                    "name": "my_date",
                    "type": "date32",
                    "type_category": "timestamp"
                },
                {
                    "name": "my_datetime",
                    "type": "timestamp(s)",
                    "type_category": "timestamp",
                },
                {
                    "name": "my_int",
                    "type": "int64",
                    "type_category": "integer"
                },
                {
                    "name": "my_string",
                    "type": "string",
                    "type_category": "string"
                },
            ]
        }
    else:
        meta = None

    df1 = reader.read(test_data_path, meta)

    if data_format == "csv":
        df2 = reader.csv.read(test_data_path, meta)
    elif data_format == "jsonl":
        df2 = reader.json.read(test_data_path, meta)
    else:
        raise ValueError(f"Test wasn't expecting: {data_format}")

    assert_frame_equal(df1, df2)
def test_inferred_cols_pandas_types(data_format):
    df = reader.read(f"tests/data/all_types.{data_format}")
    test = df.dtypes.to_dict()
    assert isinstance(test["my_int"], pd.core.arrays.integer.Int64Dtype)
    assert isinstance(test["my_float"], type(np.dtype("float64")))
    assert isinstance(test["my_bool"], pd.core.arrays.boolean.BooleanDtype)
    if data_format == "jsonl":
        pytest.skip("Pandas cannot infer bool with nulls from JSON datasets")
    else:
        assert isinstance(test["my_nullable_bool"],
                          pd.core.arrays.boolean.BooleanDtype)
    assert isinstance(test["my_string"], pd.core.arrays.string_.StringDtype)
def get_all_errors_for_file(config_path: str, file_path: str):
    # get the config
    config = load_and_validate_config(config_path)

    # get the path of the logs required to read
    pull_logs_from = os.path.join(config["log-base-path"], "tables")

    # read the logs
    logs_df = reader.read(pull_logs_from, file_format="jsonl")

    # get the errors for the file in question from all the logs
    file_logs = logs_df[logs_df["original-path"] == file_path].copy()

    # if the file logs have more than one entry, they probably contain logs from
    # more than one lint run, so tell the user
    if len(file_logs) > 1:
        print(
            "More than one log for file, output may contain duplicate entries\n\n"
            "Entries show most recent first")

    # extract the timestamps from the log files
    file_logs["ts"] = file_logs["archived-path"].apply(
        lambda x: os.path.splitext(os.path.basename(x))[0].rsplit("-", 1)[1])

    # sort in descending order
    file_logs = file_logs.sort_values(by="ts", ascending=False)

    # use this to collect the markdown tables
    list_of_markdown_tables = []

    # for each log entry, generate a markdown table in descending order of timestamp
    for i in range(len(file_logs)):
        # get the response dict for this entry
        current_response_dict = file_logs["response"].iloc[i]

        # make the markdown header template
        file_markdown = (
            f"**file:** {file_logs['original-path'].iloc[i]}\n"
            f"**timestamp of run:** {file_logs['ts'].iloc[i]}\n\n"
            "column | test name | test result | percentage error | traceback/error\n"
            "--- | --- | --- | --- | ---\n")

        # add each column and test to this file's markdown table
        for col, tests in current_response_dict.items():
            if col == "valid":
                continue

            # for each test in this column, make the markdown for it
            for test_name, test_result in tests.items():
                if test_name == "valid":
                    continue
                test_valid = "✅" if test_result["valid"] else "❌"
                percentage_error = test_result.get(
                    "percentage_of_column_is_error", "n/a")
                tb = test_result.get("traceback", "n/a")
                file_markdown += (
                    f"{col} | {test_name} | {test_valid} | {percentage_error} | {tb}\n"
                )

        list_of_markdown_tables.append(file_markdown + "\n\n")

    return Markdown("\n\n".join(list_of_markdown_tables))
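# Hedged usage sketch: one way get_all_errors_for_file might be called from a
# notebook. The config path and file path below are hypothetical placeholders,
# and IPython's `display` is assumed to be available to render the returned
# Markdown object.
def example_show_file_errors():
    from IPython.display import display

    # "config.yaml" and the data path are illustrative only
    errors_md = get_all_errors_for_file(
        config_path="config.yaml",
        file_path="land/my_table/data.csv",
    )
    display(errors_md)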
def summary_of_all_tables(config_path: str):
    """
    Summary measures:
    - overall validity
    - total number of files that have failed, as a percentage and a count
    - count of failures per table
    """
    # get the config
    config = load_and_validate_config(config_path)

    # make the logs path
    pull_logs_from = os.path.join(config["log-base-path"], "tables")

    # pull logs as df
    logs_df = reader.read(pull_logs_from, file_format="jsonl")

    # get overall validity
    overall_valid = "✅" if logs_df["valid"].all() else "❌"

    total = len(logs_df["valid"])
    count_successes = logs_df["valid"].sum()

    # get number of failures
    count_fails = total - count_successes

    # get percentage of files that failed
    percentage_fails = (count_fails / total) * 100

    # make the summary markdown
    summary_markdown = (
        "overall valid | fail percentage | fail count\n"
        "--- | --- | ---\n"
        f"{overall_valid} | {percentage_fails}% | {count_fails}")

    # get list of tables
    table_list = list(logs_df["table-name"].unique())

    # get the failure count per table
    table_fails_markdown = (
        "table | percentage of files failed | number of failed files\n"
        "--- | --- | ---\n")

    for table_name in table_list:
        # get just this table's logs
        table_log_df = logs_df[logs_df["table-name"] == table_name]

        # get percentage of fails
        table_percentage_fails = (table_log_df["valid"].value_counts(
            normalize=True).mul(100).to_dict().get(False, 0.0))

        # get count of fails
        table_count_fails = table_log_df["valid"].value_counts().to_dict().get(
            False, 0)

        # add results to markdown
        table_fails_markdown += (
            f"{table_name} | {table_percentage_fails} | {table_count_fails}\n")

    return Markdown(f"### overall summary \n{summary_markdown}\n"
                    f"### per table summary \n{table_fails_markdown}\n")
def get_failed_files(config_path: str, table_name: str = None) -> Markdown:
    # set the table name
    table_name = "" if not table_name else table_name

    # get the config
    config = load_and_validate_config(config_path)

    # get the path of the logs required to read
    pull_logs_from = os.path.join(config["log-base-path"], "tables", table_name)

    # read the logs
    logs_df = reader.read(pull_logs_from, file_format="jsonl")

    # get all the failed paths
    trimmed = logs_df[~logs_df["valid"]][["table-name", "original-path"]]

    # return it as markdown
    return Markdown(trimmed.to_markdown())
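# Hedged usage sketch: the two summary helpers might be driven together like
# this from a notebook or report script. The config path and table name are
# hypothetical placeholders; IPython's `display` is assumed for rendering the
# Markdown objects they return.
def example_show_validation_summary():
    from IPython.display import display

    # overall and per-table failure summary ("config.yaml" is illustrative)
    display(summary_of_all_tables("config.yaml"))

    # narrow the failed-file listing to a single (illustrative) table
    display(get_failed_files("config.yaml", table_name="my_table"))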
def test_write_local_path_not_exist(data_format):
    # tests that if the path does not exist, the writer will not error
    with tempfile.TemporaryDirectory() as tmp_dir:
        df = reader.read("tests/data/all_types.csv")
        out_file = os.path.join(tmp_dir, f"does/not/exist/data.{data_format}")
        writer.write(df, out_file)
def test_write(data_format, use_meta):
    if use_meta:
        meta = {
            "columns": [
                {
                    "name": "my_float",
                    "type": "float64",
                    "type_category": "float"
                },
                {
                    "name": "my_bool",
                    "type": "bool_",
                    "type_category": "boolean"
                },
                {
                    "name": "my_nullable_bool",
                    "type": "bool_",
                    "type_category": "boolean",
                },
                {
                    "name": "my_date",
                    "type": "date32",
                    "type_category": "timestamp"
                },
                {
                    "name": "my_datetime",
                    "type": "timestamp(s)",
                    "type_category": "timestamp",
                },
                {
                    "name": "my_int",
                    "type": "int64",
                    "type_category": "integer"
                },
                {
                    "name": "my_string",
                    "type": "string",
                    "type_category": "string"
                },
            ]
        }
    else:
        meta = None

    in_data_path = "tests/data/all_types.csv"
    df = reader.read(in_data_path, meta)

    # Create temp files
    with tempfile.NamedTemporaryFile(suffix=f".{data_format}") as f:
        tmp_out1 = f.name
    with tempfile.NamedTemporaryFile(suffix=f".{data_format}") as f:
        tmp_out2 = f.name

    writer.write(df, tmp_out1, meta)

    if data_format == "csv":
        writer.csv.write(df, tmp_out2, meta)
    elif data_format == "jsonl":
        writer.json.write(df, tmp_out2, meta)
    elif data_format in ["snappy.parquet", "parquet"]:
        writer.parquet.write(df, tmp_out2, meta)
    else:
        raise ValueError(f"Test wasn't expecting: {data_format}")

    with open(tmp_out1, "rb") as f:
        b1 = f.read()
    with open(tmp_out2, "rb") as f:
        b2 = f.read()

    assert b1 == b2
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: Metadata):
    """
    Reads in the data from the given filepath and returns a dataframe
    """
    # get the required sets of column names
    meta_col_names = [
        c["name"] for c in metadata.columns
        if c["name"] not in metadata.partitions
    ]

    pandas_kwargs = table_params.get("pandas-kwargs", {})

    # read data (and do headers stuff if csv)
    if filepath.lower().endswith("csv"):
        expect_header = table_params.get("expect-header", True)
        header = 0 if expect_header else None
        df = reader.read(filepath,
                         header=header,
                         low_memory=False,
                         **pandas_kwargs)
        if not expect_header:
            df.columns = meta_col_names
    else:
        df = reader.read(filepath, **pandas_kwargs)

    # eliminate case sensitivity, if requested
    if table_params.get("headers-ignore-case"):
        for c in metadata.columns:
            c["name"] = c["name"].lower()
        df.columns = [c.lower() for c in df.columns]
        meta_col_names = [c.lower() for c in meta_col_names]

    allow_missing_cols = table_params.get("allow-missing-cols", False)
    allow_unexpected_data = table_params.get("allow-unexpected-data", False)

    cols_in_meta_but_not_data = [
        c for c in meta_col_names if c not in df.columns
    ]
    cols_in_data_but_not_meta = [
        c for c in df.columns if c not in meta_col_names
    ]
    cols_in_data_and_meta = [c for c in df.columns if c in meta_col_names]

    # error if there are no common columns
    if not cols_in_data_and_meta:
        raise ColumnError(
            "There is no commonality between the data and metadata")

    # this is so that both mitigations can be checked and both errors are made visible
    raise_column_error = False
    err_msg = ""

    # remove columns from meta that aren't in the data, if allowed
    msg_1 = f"columns present in metadata but not in data: {cols_in_meta_but_not_data}"
    if (not allow_missing_cols) and cols_in_meta_but_not_data:
        err_msg += msg_1
        raise_column_error = True
    elif allow_missing_cols and cols_in_meta_but_not_data:
        for col in cols_in_meta_but_not_data:
            metadata.remove_column(col)
        log.info("not testing " + msg_1)

    # error if there is unexpected data, unless allowed
    msg_2 = f"columns present in data but not in metadata: {cols_in_data_but_not_meta}"
    if (not allow_unexpected_data) and cols_in_data_but_not_meta:
        err_msg += f"\n{msg_2}"
        raise_column_error = True
    elif allow_unexpected_data and cols_in_data_but_not_meta:
        log.info("not testing " + msg_2)
        df = df[cols_in_data_and_meta]

    # raise the error with all details, if required
    if raise_column_error:
        raise ColumnError(err_msg)

    # sample the data, if required
    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if metadata.file_format not in ["parquet", "snappy.parquet"]:
        df = cast_pandas_table_to_schema(df, metadata)

    return df, metadata
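# Hedged sketch of a table_params dict covering the options _parse_data_to_pandas
# reads above. The keys come from the function body; the values shown are
# illustrative assumptions, not defaults enforced anywhere else.
example_table_params = {
    "expect-header": True,           # CSV only: read with header=0 (else use metadata names)
    "headers-ignore-case": True,     # lower-case both data and metadata column names
    "allow-missing-cols": False,     # error if metadata columns are absent from the data
    "allow-unexpected-data": False,  # error if data columns are absent from the metadata
    "row-limit": 10000,              # sample this many rows from the dataframe
    "pandas-kwargs": {"encoding": "utf-8"},  # passed straight through to reader.read
}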
def test_round_trip(trip1_file_format, trip2_file_format):
    meta = {
        "columns": [
            {
                "name": "my_float",
                "type": "float64",
                "type_category": "float"
            },
            {
                "name": "my_bool",
                "type": "bool_",
                "type_category": "boolean"
            },
            {
                "name": "my_nullable_bool",
                "type": "bool_",
                "type_category": "boolean"
            },
            {
                "name": "my_date",
                "type": "date32",
                "type_category": "timestamp"
            },
            {
                "name": "my_datetime",
                "type": "timestamp(s)",
                "type_category": "timestamp",
            },
            {
                "name": "my_int",
                "type": "int64",
                "type_category": "integer"
            },
            {
                "name": "my_string",
                "type": "string",
                "type_category": "string"
            },
        ]
    }

    original = reader.csv.read("tests/data/all_types.csv", meta)
    orig_copy = original.copy()

    # Trip 1
    with tempfile.NamedTemporaryFile() as f:
        tmp_out_file1 = f.name
    writer.write(orig_copy,
                 tmp_out_file1,
                 file_format=trip1_file_format,
                 metadata=meta)
    df_mid = reader.read(tmp_out_file1,
                         file_format=trip1_file_format,
                         metadata=meta)

    # Trip 2
    with tempfile.NamedTemporaryFile() as f:
        tmp_out_file2 = f.name
    writer.write(df_mid,
                 tmp_out_file2,
                 file_format=trip2_file_format,
                 metadata=meta)
    final = reader.read(tmp_out_file2,
                        file_format=trip2_file_format,
                        metadata=meta)

    assert_frame_equal(original, final)
def test_round_trip():
    meta = {
        "columns": [
            {
                "name": "my_float",
                "type": "float64",
                "type_category": "float"
            },
            {
                "name": "my_bool",
                "type": "bool_",
                "type_category": "boolean"
            },
            {
                "name": "my_nullable_bool",
                "type": "bool_",
                "type_category": "boolean"
            },
            {
                "name": "my_date",
                "type": "date32",
                "type_category": "timestamp"
            },
            {
                "name": "my_datetime",
                "type": "timestamp(s)",
                "type_category": "timestamp",
            },
            {
                "name": "my_int",
                "type": "int64",
                "type_category": "integer"
            },
            {
                "name": "my_string",
                "type": "string",
                "type_category": "string"
            },
        ]
    }

    # Create parquet temp file
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        tmp_out_file = f.name
    original = reader.csv.read("tests/data/all_types.csv", meta)
    writer.parquet.write(original, tmp_out_file)

    data_paths = {
        "csv": "tests/data/all_types.csv",
        "json": "tests/data/all_types.jsonl",
        "parquet": tmp_out_file,
    }

    for type1 in ["csv", "json", "parquet"]:
        for type2 in ["csv", "json", "parquet"]:
            df1 = reader.read(
                input_path=data_paths[type1],
                metadata=meta,
            )
            df2 = reader.read(
                input_path=data_paths[type2],
                metadata=meta,
            )
            assert_frame_equal(df1, df2)