def test_recoding_reject(recoding_data):
    """Verify that one of two recoded columns is flagged by ``describe``.

    Turns on the recoded check and the recoded correlation in ``config``,
    profiles ``recoding_data`` and asserts that one of ``x``/``y`` is typed
    as recoded (referring to the other through ``correlation_var``) while the
    other one is categorical, and that the table counters match.
    """
    config["check_recoded"] = True
    config["correlations"]["recoded"] = True

    results = describe(recoding_data)

    y_summary = results["variables"]["y"]
    x_summary = results["variables"]["x"]
    y_type = y_summary["type"]
    x_type = x_summary["type"]

    # Either column may be the one that gets rejected, so accept both layouts.
    y_is_recoded = y_type == Variable.S_TYPE_RECODED and x_type == Variable.TYPE_CAT
    x_is_recoded = x_type == Variable.S_TYPE_RECODED and y_type == Variable.TYPE_CAT
    assert y_is_recoded or x_is_recoded, "Type is wrong"

    y_points_to_x = (
        "correlation_var" in y_summary and y_summary["correlation_var"] == "x"
    )
    x_points_to_y = (
        "correlation_var" in x_summary and x_summary["correlation_var"] == "y"
    )
    assert y_points_to_x or x_points_to_y, "Values should be equal"

    expected_table = {
        "n_cells_missing": 0.0,
        Variable.S_TYPE_UNIQUE.value: 0,
        Variable.S_TYPE_CONST.value: 0,
        "nvar": 2,
        Variable.S_TYPE_REJECTED.value: 1,
        "n": 8,
        Variable.S_TYPE_RECODED.value: 1,
        Variable.S_TYPE_CORR.value: 0,
        Variable.TYPE_DATE.value: 0,
        Variable.TYPE_NUM.value: 0,
        Variable.TYPE_CAT.value: 1,
        "n_duplicates": 5,
    }
    table = results["table"]
    for key, expected in expected_table.items():
        assert table[key] == expected, "recoding error {}".format(key)
def test_describe_df(describe_data, expected_results):
    """Profile the ``describe_data`` fixture and verify the resulting summary.

    Checks the top-level result keys, the counters in ``results["table"]``
    and, for every column, each statistic listed in ``expected_results``.

    Args:
        describe_data: Data handed to ``pd.DataFrame``; its keys are the
            column names that get checked.
        expected_results: Per-column mapping of statistic name to expected
            value. A value equal to ``check_is_NaN`` means the statistic must
            be absent from the column summary.
    """
    config["low_categorical_threshold"].set(0)

    describe_data_frame = pd.DataFrame(describe_data)
    describe_data_frame["somedate"] = pd.to_datetime(
        describe_data_frame["somedate"]
    )

    results = describe(describe_data_frame)

    assert {
        "table",
        "variables",
        "correlations",
        "missing",
        "messages",
        "package",
    } == set(results.keys()), "Not in results"

    expected_table = {
        "CAT": 1,
        "CONST": 2,
        "DATE": 1,
        "NUM": 2,
        "UNIQUE": 2,
        "BOOL": 4,
        "REJECTED": 2,
        "RECODED": 0,
        "CORR": 0,
        "UNSUPPORTED": 3,
        "n": 9,
        "nvar": 15,
        "n_cells_missing": 6,
        "n_duplicates": 0,
    }
    # Only a subset of the table is pinned; extra entries are allowed.
    assert set(expected_table.items()).issubset(
        set(results["table"].items())
    ), "Variable analysis failed"

    # Loop over variables
    for col in describe_data.keys():
        summary = results["variables"][col]
        for k, v in expected_results[col].items():
            if v == check_is_NaN:
                # The NaN marker means the statistic must not be reported.
                assert (
                    k not in summary
                ), "Value {} for key {} in column {} is not NaN".format(
                    summary[k], k, col
                )
            elif isinstance(v, float):
                # Floats are compared with a tolerance.
                assert (
                    pytest.approx(v) == summary[k]
                ), "Value {} for key {} in column {} is not approximately {}".format(
                    summary[k], k, col, v
                )
            else:
                assert (
                    v == summary[k]
                ), "Value {} for key {} in column {} does not equal {}".format(
                    summary[k], k, col, v
                )

        if summary["type"].value in ["NUM", "DATE"]:
            assert (
                "histogramdata" in summary
            ), "Mini-histogram missing for column {} ".format(col)
def test_describe_df(describe_data, expected_results):
    """Profile the ``describe_data`` fixture and verify the resulting summary.

    Checks the top-level result keys, the per-type counts in
    ``results["table"]["types"]`` and, for every column, each statistic
    listed in ``expected_results``.

    Args:
        describe_data: Data handed to ``pd.DataFrame``; its keys are the
            column names that get checked.
        expected_results: Per-column mapping of statistic name to expected
            value. A value equal to ``check_is_NaN`` means the statistic must
            be absent from the column summary.
    """
    config["vars"]["num"]["low_categorical_threshold"].set(0)

    describe_data_frame = pd.DataFrame(describe_data)
    describe_data_frame["somedate"] = pd.to_datetime(
        describe_data_frame["somedate"]
    )

    results = describe("title", describe_data_frame)

    assert {
        "analysis",
        "table",
        "variables",
        "scatter",
        "correlations",
        "missing",
        "messages",
        "package",
        "sample",
        "duplicates",
    } == set(results.keys()), "Not in results"

    assert {
        "BOOL": 5,
        "CAT": 3,
        "UNSUPPORTED": 4,
        "NUM": 2,
        "DATE": 1,
    } == results["table"]["types"], "Variable analysis failed"

    # Loop over variables
    for col in describe_data.keys():
        summary = results["variables"][col]
        for k, v in expected_results[col].items():
            if v == check_is_NaN:
                # The NaN marker means the statistic must not be reported.
                assert (
                    k not in summary
                ), "Value `{}` for key `{}` in column `{}` is not NaN".format(
                    summary[k], k, col
                )
            elif isinstance(v, float):
                # Floats are compared with a tolerance.
                assert (
                    pytest.approx(v) == summary[k]
                ), "Value `{}` for key `{}` in column `{}` is not approximately `{}`".format(
                    summary[k], k, col, v
                )
            else:
                assert (
                    v == summary[k]
                ), "Value `{}` for key `{}` in column `{}` does not equal `{}`".format(
                    summary[k], k, col, v
                )

        if summary["type"].value in ["NUM", "DATE"]:
            assert (
                "histogram" in summary
            ), "Histogram missing for column {} ".format(col)
def test_describe_df(column, describe_data, expected_results, summarizer):
    """Profile a single column of ``describe_data`` and verify its summary.

    Builds a one-column frame for ``column``, runs ``describe`` on it and
    checks the top-level result keys plus every statistic listed in
    ``expected_results[column]``.

    Args:
        column: Name of the column under test.
        describe_data: Mapping from which the column's values are taken.
        expected_results: Per-column mapping of statistic name to expected
            value. A value equal to ``check_is_NaN`` means the statistic must
            be absent from the column summary.
        summarizer: Summarizer passed through to ``describe``.
    """
    config = Settings()
    config.vars.num.low_categorical_threshold = 0
    typeset = ProfilingTypeSet(config)

    describe_data_frame = pd.DataFrame({column: describe_data[column]})
    if column == "somedate":
        describe_data_frame["somedate"] = pd.to_datetime(
            describe_data_frame["somedate"]
        )

    results = describe(config, describe_data_frame, summarizer, typeset)

    assert {
        "analysis",
        "table",
        "variables",
        "scatter",
        "correlations",
        "missing",
        "messages",
        "package",
        "sample",
        "duplicates",
    } == set(results.keys()), "Not in results"

    summary = results["variables"][column]

    # Loop over variables
    for k, v in expected_results[column].items():
        if v == check_is_NaN:
            # The NaN marker means the statistic must not be reported.
            assert (
                k not in summary
            ), f"Value `{summary[k]}` for key `{k}` in column `{column}` is not NaN"
        elif isinstance(v, float):
            # Floats are compared with a tolerance.
            assert (
                pytest.approx(v) == summary[k]
            ), f"Value `{summary[k]}` for key `{k}` in column `{column}` is not approximately `{v}`"
        else:
            assert (
                v == summary[k]
            ), f"Value `{summary[k]}` for key `{k}` in column `{column}` does not equal `{v}`"

    if summary["type"] in ["Numeric", "DateTime"]:
        assert "histogram" in summary, f"Histogram missing for column {column}"
def test_cramers_reject(recoding_data):
    """Verify that a Cramér's-correlated column is flagged by ``describe``.

    Appends one extra row to ``recoding_data``, enables the Cramér's check
    with a 0.1 threshold in ``config``, profiles the data and asserts that
    one of ``x``/``y`` is typed as correlated (referring to the other through
    ``correlation_var``) while the other one is categorical, and that the
    table counters match.
    """
    next_row = len(recoding_data)
    recoding_data.loc[next_row] = {"x": "chat", "y": "dog"}

    config["check_correlation_cramers"] = True
    config["correlation_threshold_cramers"] = 0.1
    config["correlations"]["cramers"] = True

    results = describe(recoding_data)

    y_summary = results["variables"]["y"]
    x_summary = results["variables"]["x"]
    y_type = y_summary["type"]
    x_type = x_summary["type"]

    # Dict order is not preserved in Python 3.5 and not guaranteed in
    # Python 3.6, so either column may be the one flagged as correlated.
    y_is_corr = y_type == Variable.S_TYPE_CORR and x_type == Variable.TYPE_CAT
    x_is_corr = x_type == Variable.S_TYPE_CORR and y_type == Variable.TYPE_CAT
    assert y_is_corr or x_is_corr, "Type is wrong"

    y_points_to_x = (
        "correlation_var" in y_summary and y_summary["correlation_var"] == "x"
    )
    x_points_to_y = (
        "correlation_var" in x_summary and x_summary["correlation_var"] == "y"
    )
    assert y_points_to_x or x_points_to_y, "Values should be equal"

    expected_table = {
        "n_cells_missing": 0.0,
        Variable.S_TYPE_UNIQUE.value: 0,
        Variable.S_TYPE_CONST.value: 0,
        "nvar": 2,
        Variable.S_TYPE_REJECTED.value: 1,
        "n": 9,
        Variable.S_TYPE_RECODED.value: 0,
        Variable.S_TYPE_CORR.value: 1,
        Variable.TYPE_DATE.value: 0,
        Variable.TYPE_NUM.value: 0,
        Variable.TYPE_CAT.value: 1,
        "n_duplicates": 5,
    }
    table = results["table"]
    for key, expected in expected_table.items():
        assert table[key] == expected, "recoding error {}".format(key)
def test_describe_list():
    """``describe`` on a plain list must raise ``AttributeError`` and warn."""
    # A single ``with`` holding both managers nests them left to right.
    with pytest.raises(AttributeError), pytest.warns(UserWarning):
        describe("", [1, 2, 3])
def test_describe_empty():
    """``describe`` on a frame without rows or columns must raise ``ValueError``."""
    no_data = pd.DataFrame()

    with pytest.raises(ValueError):
        describe("", no_data)
def test_describe_list():
    """``describe`` on a plain list must raise ``TypeError``."""
    not_a_frame = [1, 2, 3]

    with pytest.raises(TypeError):
        describe(not_a_frame)
def test_describe_list(summarizer, typeset):
    """``describe`` on a plain list must raise ``AttributeError`` and warn."""
    settings = Settings()

    with pytest.raises(AttributeError):
        with pytest.warns(UserWarning):
            describe(settings, "", [1, 2, 3], summarizer, typeset)
def test_describe_list(summarizer, typeset):
    """``describe`` on a plain list must raise ``AttributeError`` and warn."""
    # A single ``with`` holding both managers nests them left to right.
    with pytest.raises(AttributeError), pytest.warns(UserWarning):
        describe("", [1, 2, 3], summarizer, typeset)
def test_describe_empty(summarizer, typeset):
    """``describe`` on a frame without rows or columns must raise ``ValueError``."""
    no_data = pd.DataFrame()

    with pytest.raises(ValueError):
        describe("", no_data, summarizer, typeset)
def test_describe_list(summarizer, typeset):
    """``describe`` on a plain list must raise ``NotImplementedError``."""
    settings = Settings()
    not_a_frame = [1, 2, 3]

    with pytest.raises(NotImplementedError):
        describe(settings, "", not_a_frame, summarizer, typeset)