def test_round_column( test_config, round_columns, source_query, transformed_query, merge_columns, module_name, ): print(f"module_name: {module_name}") config = test_config add_to_tested_list( module_name=module_name, tested_fields=[y for x in round_columns.values() for y in x], ) source_sample_df = get_data_from_query(query=source_query, config=config, sort_col=merge_columns["source"], sample=True) assert source_sample_df.shape[0] > 0 sample_caserefs = get_merge_col_data_as_list( df=source_sample_df, column_name=merge_columns["source"]) transformed_df = get_data_from_query( query=transformed_query, config=config, sort_col=merge_columns["transformed"], sample=False, ) assert transformed_df.shape[0] > 0 transformed_sample_df = transformed_df[transformed_df[ merge_columns["transformed"]].isin(sample_caserefs)] result_df = merge_source_and_transformed_df( source_df=source_sample_df, transformed_df=transformed_sample_df, merge_columns=merge_columns, ) log.debug( f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name}" ) assert result_df.shape[0] > 0 for k, v in round_columns.items(): for i in v: result_df["round2dp"] = result_df[i].apply(lambda x: round(x, 2)) assert (result_df["round2dp"].astype(bool) == True).all()
def test_complex_joins( test_config, module_name, source_query, transformed_query, merge_columns, match_columns, ): log.debug(f"module_name: {module_name}") config = test_config source_sample_df = get_data_from_query(query=source_query, config=config, sort_col=merge_columns["source"], sample=True) assert source_sample_df.shape[0] > 0 sample_caserefs = get_merge_col_data_as_list( df=source_sample_df, column_name=merge_columns["source"]) transformed_df = get_data_from_query( query=transformed_query, config=config, sort_col=merge_columns["transformed"], sample=False, ) assert transformed_df.shape[0] > 0 transformed_sample_df = transformed_df[transformed_df[ merge_columns["transformed"]].isin(sample_caserefs)] result_df = merge_source_and_transformed_df( source_df=source_sample_df, transformed_df=transformed_sample_df, merge_columns=merge_columns, ) log.debug( f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name}" ) assert result_df.shape[0] > 0 for k, v in match_columns.items(): match = result_df[k].equals(result_df[v]) log.log( config.VERBOSE, f"checking {k} == {v}...." " {'OK' if match is True else 'oh no'} ", ) assert match is True
def test_one_to_one_joins( test_config, join_columns, merge_columns, fk_child_query, fk_parent_query, module_name, ): print(f"module_name: {module_name}") add_to_tested_list(module_name=module_name, tested_fields=[x for x in join_columns.keys()]) config = test_config fk_child_df = get_data_from_query( query=fk_child_query, config=config, sample=True, ) sample_caserefs = get_merge_col_data_as_list( df=fk_child_df, column_name=merge_columns["fk_child"]) fk_parent_df = get_data_from_query( query=fk_parent_query, config=config, sort_col=merge_columns["fk_parent"], sample=False, ) print(fk_parent_query) fk_parent_sample_df = fk_parent_df[fk_parent_df[ merge_columns["fk_parent"]].isin(sample_caserefs)] fk_child_id_list = fk_child_df[[k for k in join_columns][0]].tolist() fk_child_id_list = [int(x) for x in fk_child_id_list] fk_parent_id_list = fk_parent_sample_df[[ y for x in join_columns.values() for y in x.values() ][0]].tolist() fk_parent_id_list = [int(x) for x in fk_parent_id_list] print( f"Checking {fk_parent_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name} " ) success = set(fk_child_id_list) == set(fk_parent_id_list) print( config.VERBOSE, f"checking {[k for k in join_columns][0]} == " f"{[y for x in join_columns.values() for y in x.values()][0]}.... " f"{'OK' if success is True else 'oh no'} ", ) assert success is True
def test_default_values(test_config, defaults, source_query, module_name): log.debug(f"module_name: {module_name}") config = test_config add_to_tested_list(module_name=module_name, tested_fields=[x for x in defaults.keys()]) source_sample_df = get_data_from_query(query=source_query, config=config, sample=True) assert source_sample_df.shape[0] > 0 log.debug( f"Checking {source_sample_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name} " ) assert source_sample_df.shape[0] > 0 for k, v in defaults.items(): source_sample_df["compare_col"] = v matches = source_sample_df[k] == source_sample_df["compare_col"] total_matches = matches.sum() success = total_matches == source_sample_df.shape[0] print( config.VERBOSE, f"checking {k} == {v}...." f" {'OK' if success else 'oh no'} ", ) assert success
def test_deputy_address_joins(test_config, ): config = test_config source_query = f""" SELECT distinct surname from transform.persons where casrec_mapping_file_name = 'deputy_persons_mapping'; """ source_sample_df = get_data_from_query(query=source_query, config=config, sample=True) assert source_sample_df.shape[0] > 0 sample_caserefs = get_merge_col_data_as_list(df=source_sample_df, column_name="surname") result = {"total": 0, "passed": 0, "failed": []} for single_case in sample_caserefs: result["total"] += 1 transformed_query = f""" select persons.firstname, persons.surname, addresses.postcode from transform.persons left outer join transform.addresses on persons.id = addresses.person_id where persons.casrec_mapping_file_name = 'deputy_persons_mapping' and persons.surname = '{single_case}' order by firstname, surname, postcode; """ transformed_df = pd.read_sql_query( transformed_query, config.get_db_connection_string(db="migration")) casrec_query = f""" select distinct "Dep Forename" as firstname, "Dep Surname" as surname, "Dep Postcode" as postcode from casrec_csv.deputy left outer join casrec_csv.deputyship on deputy."Deputy No" = deputyship."Deputy No" left outer join casrec_csv.deputy_address on deputyship."Dep Addr No" = deputy_address."Dep Addr No" where deputy."Dep Surname" = '{single_case}' order by "Dep Forename", "Dep Surname", "Dep Postcode"; """ casrec_df = pd.read_sql_query( casrec_query, config.get_db_connection_string(db="migration")) try: pd.testing.assert_frame_equal(transformed_df, casrec_df) result["passed"] += 1 except AssertionError: result["failed"].append(single_case) print(f"result: {result['passed']}/{result['total']} passed") if len(result["failed"]) > 0: print(f"Failed caserecnumbers: {', '.join(result['failed'])}") assert len(result["failed"]) == 0
def test_order_deputy_joins(test_config, ): config = test_config source_query = f""" SELECT distinct caserecnumber from transform.cases; """ source_sample_df = get_data_from_query(query=source_query, config=config, sample=True) assert source_sample_df.shape[0] > 0 sample_caserefs = get_merge_col_data_as_list(df=source_sample_df, column_name="caserecnumber") result = {"total": 0, "passed": 0, "failed": []} for single_case in sample_caserefs: result["total"] += 1 transformed_query = f""" select cases.caserecnumber, persons.firstname, persons.surname from transform.cases left outer join transform.order_deputy on cases.id = order_deputy.order_id left outer join transform.persons on persons.id = order_deputy.deputy_id where cases.caserecnumber = '{single_case}' order by cases.caserecnumber, firstname, surname; """ transformed_df = pd.read_sql_query( transformed_query, config.get_db_connection_string(db="migration")) casrec_query = f""" select "order"."Case" as caserecnumber, deputy."Dep Forename" as firstname, deputy."Dep Surname" as surname from casrec_csv.order left outer join casrec_csv.deputyship on "order"."CoP Case" = deputyship."CoP Case" left outer join casrec_csv.deputy on deputyship."Deputy No" = deputy."Deputy No" where "order"."Case" = '{single_case}' order by "order"."Case", "Dep Forename", "Dep Surname"; """ casrec_df = pd.read_sql_query( casrec_query, config.get_db_connection_string(db="migration")) try: pd.testing.assert_frame_equal(transformed_df, casrec_df) result["passed"] += 1 except AssertionError: result["failed"].append(single_case) print(f"result: {round(result['passed']/result['total']*100, 2)}% passed") if len(result["failed"]) > 0: print(f"Failed caserecnumbers: {', '.join(result['failed'])}") assert len(result["failed"]) == 0
def test_calculated_values(test_config, calculated_fields, source_query, module_name): log.debug(f"module_name: {module_name}") print(f"module_name: {module_name}") # log.debug(f"source_query: {source_query}") config = test_config add_to_tested_list(module_name=module_name, tested_fields=[x for x in calculated_fields.keys()]) source_sample_df = get_data_from_query(query=source_query, config=config, sample=True) # print(source_sample_df.to_markdown()) source_sample_df.info() assert source_sample_df.shape[0] > 0 log.debug(f"Checking {source_sample_df.shape[0]} rows of data (" f"{config.SAMPLE_PERCENTAGE}%) from table: {module_name}") assert source_sample_df.shape[0] > 0 for k, v in calculated_fields.items(): print(f"k: {k}") print(f"v: {v}") source_sample_df["compare_col"] = v matches = source_sample_df[k] == source_sample_df["compare_col"] print(f"source_sample_df[k]: {source_sample_df[k]}") print( f'source_sample_df["compare_col"]: {source_sample_df["compare_col"]}' ) total_matches = matches.sum() success = total_matches == source_sample_df.shape[0] log.log( config.VERBOSE, f"checking {k} == {v}.... " f"{'OK' if success else 'oh no'} ", ) assert success
def test_simple_transformations( test_config, simple_matches, merge_columns, source_query, transformed_query, module_name, ): log.debug(f"module_name: {module_name}") add_to_tested_list( module_name=module_name, tested_fields=[y for x in simple_matches.values() for y in x] + [merge_columns["transformed"]], ) config = test_config source_sample_df = get_data_from_query( query=source_query, config=config, sort_col=merge_columns["source"], sample=True ) assert source_sample_df.shape[0] > 0 sample_caserefs = get_merge_col_data_as_list( df=source_sample_df, column_name=merge_columns["source"] ) transformed_df = get_data_from_query( query=transformed_query, config=config, sort_col=merge_columns["transformed"], sample=False, ) assert transformed_df.shape[0] > 0 transformed_sample_df = transformed_df[ transformed_df[merge_columns["transformed"]].isin(sample_caserefs) ] result_df = merge_source_and_transformed_df( source_df=source_sample_df, transformed_df=transformed_sample_df, merge_columns=merge_columns, ) log.debug( f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name} " ) assert result_df.shape[0] > 0 for k, v in simple_matches.items(): for i in v: try: result_df[k] = pd.to_datetime(result_df[k], format="%Y-%m-%d") result_df[i] = pd.to_datetime(result_df[i], format="%Y-%m-%d") except Exception: result_df[k] = result_df[k].apply(lambda x: f"{x}") result_df[i] = result_df[i].apply(lambda x: f"{x}") match = result_df[k].equals(result_df[i]) print( f"checking {k} == {i}.... " f"{'OK' if match is True else 'oh no'} ", ) assert match is True
def test_map_conditional_lookup_tables( test_config, lookup_fields, merge_columns, source_query, transformed_query, module_name, ): print(f"module_name: {module_name}") add_to_tested_list( module_name=module_name, tested_fields=[x for x in lookup_fields.keys()] + [merge_columns["transformed"]], ) config = test_config source_sample_df = get_data_from_query(query=source_query, config=config, sort_col=merge_columns["source"], sample=True) assert source_sample_df.shape[0] > 0 sample_caserefs = get_merge_col_data_as_list( df=source_sample_df, column_name=merge_columns["source"]) transformed_df = get_data_from_query( query=transformed_query, config=config, sort_col=merge_columns["transformed"], sample=False, ) assert transformed_df.shape[0] > 0 transformed_sample_df = transformed_df[transformed_df[ merge_columns["transformed"]].isin(sample_caserefs)] result_df = merge_source_and_transformed_df( source_df=source_sample_df, transformed_df=transformed_sample_df, merge_columns=merge_columns, ) log.debug( f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name} " ) assert result_df.shape[0] > 0 for k, v in lookup_fields.items(): lookup_filename = lookup_fields[k]["lookup_def"] for i, j in v.items(): result_col = lookup_fields[k]["cols"]["result"] reference_col = lookup_fields[k]["cols"]["reference"] lookup_dict = get_lookup_dict(file_name=lookup_filename.lower()) result_df["mapped"] = result_df[reference_col].map(lookup_dict) result_df = result_df.loc[result_df["mapped"].notnull()] result_df[result_col] = result_df[result_col].apply( lambda x: f"{x}") result_df[k] = result_df[k].apply(lambda x: f"{x}") assert result_df[result_col].fillna("").equals( result_df[k].fillna("")) match = result_df[result_col].fillna("").equals( result_df[k].fillna("")) print( f"checking {k} == {result_col}...." f"" f" {'OK' if match is True else 'oh no'} ", ) assert match is True
def test_squash_columns( test_config, squash_columns_fields, source_query, transformed_query, merge_columns, module_name, ): log.debug(f"module_name: {module_name}") config = test_config add_to_tested_list(module_name=module_name, tested_fields=[x for x in squash_columns_fields.keys()]) source_sample_df = get_data_from_query(query=source_query, config=config, sort_col=merge_columns["source"], sample=True) assert source_sample_df.shape[0] > 0 sample_caserefs = get_merge_col_data_as_list( df=source_sample_df, column_name=merge_columns["source"]) transformed_df = get_data_from_query( query=transformed_query, config=config, sort_col=merge_columns["transformed"], sample=False, ) assert transformed_df.shape[0] > 0 transformed_sample_df = transformed_df[transformed_df[ merge_columns["transformed"]].isin(sample_caserefs)] result_df = merge_source_and_transformed_df( source_df=source_sample_df, transformed_df=transformed_sample_df, merge_columns=merge_columns, ) log.debug( f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name} " ) assert result_df.shape[0] > 0 for k, v in squash_columns_fields.items(): for i, j in enumerate(v): unsquashed = result_df[k].apply(pd.Series) unsquashed = unsquashed.rename(columns=lambda x: "unsq_" + str(x)) match = result_df[j].equals(unsquashed[f"unsq_{i}"]) log.log( config.VERBOSE, f"checking {j} == {k}:{i}...." f" {'OK' if match is True else 'oh no'} ", ) assert match is True
def test_convert_to_bool( test_config, convert_to_bool_fields, source_query, transformed_query, merge_columns, module_name, ): log.debug(f"module_name: {module_name}") config = test_config add_to_tested_list( module_name=module_name, tested_fields=[y for x in convert_to_bool_fields.values() for y in x], ) source_sample_df = get_data_from_query( query=source_query, config=config, sort_col=merge_columns["source"], sample=True ) assert source_sample_df.shape[0] > 0 sample_caserefs = get_merge_col_data_as_list( df=source_sample_df, column_name=merge_columns["source"] ) transformed_df = get_data_from_query( query=transformed_query, config=config, sort_col=merge_columns["transformed"], sample=False, ) assert transformed_df.shape[0] > 0 transformed_sample_df = transformed_df[ transformed_df[merge_columns["transformed"]].isin(sample_caserefs) ] result_df = merge_source_and_transformed_df( source_df=source_sample_df, transformed_df=transformed_sample_df, merge_columns=merge_columns, ) log.debug( f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name}" ) assert result_df.shape[0] > 0 for k, v in convert_to_bool_fields.items(): for i in v: true_values = ["true", "True", "t", "T", "1", "1.0"] true_rows = result_df[result_df[k].isin(true_values)] total_true_rows = true_rows.shape[0] true_matches = true_rows[i] total_true_matches = true_matches.shape[0] t_match = total_true_rows == total_true_matches log.log( config.VERBOSE, f"checking True: {total_true_rows} {k} ==" f" {total_true_matches} " f"{i}.... {'OK' if t_match is True else 'oh no'} ", ) assert t_match false_rows = result_df[~result_df[k].isin(true_values)] total_false_rows = false_rows.shape[0] false_matches = false_rows[i] total_false_matches = false_matches.shape[0] f_match = total_false_rows == total_false_matches log.log( config.VERBOSE, f"checking False: {total_false_rows} {k} ==" f" {total_false_matches} " f"{i}.... {'OK' if f_match is True else 'oh no'} ", ) assert f_match