def test_rolling(rolling_data):
    """Rolling-mean columns should build without nulls, with and without on/center."""
    import dtale.views as views

    df, _ = views.format_data(rolling_data)
    data_id, column_type = "1", "rolling"
    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        base_cfg = {"col": "0", "comp": "mean", "window": "5", "min_periods": 1}
        # second pass adds an "on" column and centering to the same rolling config
        for extra in ({}, {"on": "date", "center": True}):
            cfg = dict(base_cfg, **extra)
            builder = ColumnBuilder(data_id, column_type, "0_rolling_mean", cfg)
            verify_builder(builder, lambda col: col.isnull().sum() == 0)
def test_winsorize():
    """Winsorizing 0..99 with symmetric 10% limits should preserve the total sum."""

    def _data():
        # cyclical grouping columns plus a monotonically increasing value column
        for i in range(100):
            yield dict(a=i % 5, b=i % 3, c=i % 4, i=i)

    df = pd.DataFrame(list(_data()))
    data_id, column_type = "1", "winsorize"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "i", "inclusive": [True, False], "limits": [0.1, 0.1]}
        # BUG FIX: Python has no "++"; "++i" was a double unary plus (a no-op),
        # so every builder was named "Col0". Increment explicitly instead.
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.sum() == 4950)
        cfg = {"col": "i", "group": ["b"], "limits": [0.1, 0.1]}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.sum() == 4950)
        cfg = {"col": "i"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.sum() == 4950)
def test_encoder():
    """Each encoder algorithm should produce fully-populated output columns."""
    df = pd.DataFrame(
        {"car": ["Honda", "Benze", "Ford", "Honda", "Benze", "Ford", np.nan]}
    )
    data_id, column_type = "1", "encoder"
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        builder = ColumnBuilder(
            data_id, column_type, "Col1", {"col": "car", "algo": "one_hot"}
        )
        verify_builder(
            builder,
            lambda col: all(
                [col[c].isnull().sum() == 0 for c in ["car_Ford", "car_Honda"]]
            ),
        )
        # ordinal and label encoders both yield a single null-free column
        for algo in ("ordinal", "label"):
            builder = ColumnBuilder(
                data_id, column_type, "Col1", {"col": "car", "algo": algo}
            )
            verify_builder(builder, lambda col: col.isnull().sum() == 0)
        builder = ColumnBuilder(
            data_id,
            column_type,
            "Col1",
            {"col": "car", "algo": "feature_hasher", "n": 1},
        )
        verify_builder(builder, lambda col: col["car_0"].isnull().sum() == 0)
def test_encoder():
    """Encoder algorithms (one_hot/ordinal/label/feature_hasher) build clean columns."""
    df = pd.DataFrame(
        {"car": ["Honda", "Benze", "Ford", "Honda", "Benze", "Ford", np.nan]}
    )
    data_id, column_type = "1", "encoder"
    build_data_inst({data_id: df})
    one_hot_cfg = {"col": "car", "algo": "one_hot"}
    builder = ColumnBuilder(data_id, column_type, "Col1", one_hot_cfg)
    verify_builder(
        builder,
        lambda col: all(
            [col[c].isnull().sum() == 0 for c in ["car_Ford", "car_Honda"]]
        ),
    )
    # ordinal and label encoders both return one null-free column
    for algo in ("ordinal", "label"):
        builder = ColumnBuilder(
            data_id, column_type, "Col1", {"col": "car", "algo": algo}
        )
        verify_builder(builder, lambda col: col.isnull().sum() == 0)
    hasher_cfg = {"col": "car", "algo": "feature_hasher", "n": 1}
    builder = ColumnBuilder(data_id, column_type, "Col1", hasher_cfg)
    verify_builder(builder, lambda col: col["car_0"].isnull().sum() == 0)
def test_from_object():
    """Object-typed columns convert to date (with/without fmt) and bool."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "str_date", "to": "date", "from": "object", "fmt": "%Y%m%d"}
        # BUG FIX: "++i" is a double unary plus (a no-op) in Python, so every
        # builder was named "Col0". Increment explicitly for unique names.
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(
            builder,
            lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101",
        )
        cfg = {"col": "str_date2", "to": "date", "from": "object"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(
            builder,
            lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101",
        )
        cfg = {"col": "str_bool", "to": "bool", "from": "object"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0])
def test_from_date():
    """Datetime columns convert to formatted str and to int (YYYYMMDD / epoch units)."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "date", "to": "str", "from": "datetime64", "fmt": "%m/%d/%Y"}
        # BUG FIX: "++i" is a double unary plus (a no-op) in Python, so every
        # builder was named "Col0". Increment explicitly for unique names.
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == "01/01/2020")
        cfg = {"col": "date", "to": "int", "from": "datetime64", "unit": "YYYYMMDD"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 20200101)
        cfg = {"col": "date", "to": "int", "from": "datetime64", "unit": "ms"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 1577854800)
def test_from_object():
    """Object-typed columns convert to date (with/without fmt) and bool."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "str_date", "to": "date", "from": "object", "fmt": "%Y%m%d"}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python, so every
    # builder was named "Col0". Increment explicitly for unique names.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(
        builder,
        lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101",
    )
    cfg = {"col": "str_date2", "to": "date", "from": "object"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(
        builder,
        lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101",
    )
    cfg = {"col": "str_bool", "to": "bool", "from": "object"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0])
def test_rolling(rolling_data):
    """Rolling-mean columns build null-free, with and without on/center options."""
    import dtale.views as views

    df, _ = views.format_data(rolling_data)
    data_id, column_type = "1", "rolling"
    build_data_inst({data_id: df})
    base_cfg = {"col": "0", "comp": "mean", "window": "5", "min_periods": 1}
    # second pass adds an "on" column and centering to the same rolling config
    for extra in ({}, {"on": "date", "center": True}):
        builder = ColumnBuilder(
            data_id, column_type, "0_rolling_mean", dict(base_cfg, **extra)
        )
        verify_builder(builder, lambda col: col.isnull().sum() == 0)
def test_concatenate():
    """Concatenation joins either two columns or a column and a literal value."""
    df = pd.DataFrame(dict(a=["a"], b=["b"]))
    data_id, column_type = "1", "concatenate"
    build_data_inst({data_id: df})
    cases = [
        ("a_b", dict(left=dict(col="a"), right=dict(col="b"))),
        ("a_b2", dict(left=dict(col="a"), right=dict(val="b"))),
    ]
    for name, cfg in cases:
        builder = ColumnBuilder(data_id, column_type, name, cfg)
        verify_builder(builder, lambda col: col.values[0] == "ab")
def test_from_bool():
    """Bool columns convert to int (True -> 1) and str (True -> "True")."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "bool", "to": "int", "from": "bool"}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python, so every
    # builder was named "Col0". Increment explicitly for unique names.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 1)
    cfg = {"col": "bool", "to": "str", "from": "bool"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "True")
def test_from_string():
    """Numeric strings convert to int (truncating check value) and float."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "str_num", "to": "int", "from": "str"}
        # BUG FIX: "++i" is a double unary plus (a no-op) in Python, so every
        # builder was named "Col0". Increment explicitly for unique names.
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 1)
        cfg = {"col": "str_num", "to": "float", "from": "str"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 1.5)
def test_standardize():
    """Every standardization algorithm should yield a null-free column."""
    df = pd.DataFrame(dict(a=randn(1000)))
    data_id, column_type = "1", "standardize"
    build_data_inst({data_id: df})
    for algo in ("power", "quantile", "robust"):
        builder = ColumnBuilder(
            data_id, column_type, "Col1", {"col": "a", "algo": algo}
        )
        verify_builder(builder, lambda col: col.isnull().sum() == 0)
def test_drop_all_space():
    """The drop_all_space cleaner removes every space character ("a b" -> "ab")."""
    df = pd.DataFrame(dict(foo=["a b"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["drop_all_space"]}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "ab")
def test_drop_stopwords():
    """The stopwords cleaner removes user-supplied words from the text."""
    df = pd.DataFrame(dict(foo=["foo bar biz"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["stopwords"], "stopwords": ["bar"]}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "foo biz")
def test_standardize():
    """Power, quantile and robust standardization all produce null-free columns."""
    df = pd.DataFrame(dict(a=randn(1000)))
    data_id, column_type = "1", "standardize"
    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        for algo in ("power", "quantile", "robust"):
            cfg = {"col": "a", "algo": algo}
            builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
            verify_builder(builder, lambda col: col.isnull().sum() == 0)
def test_string():
    """Joining all columns with '-' stringifies every value in row order."""
    df = pd.DataFrame(dict(a=[1], b=[2], c=["a"], d=[True]))
    data_id, column_type = "1", "string"
    build_data_inst({data_id: df})
    join_cfg = dict(cols=list(df.columns), joinChar="-")
    builder = ColumnBuilder(data_id, column_type, "Col1", join_cfg)
    verify_builder(builder, lambda col: col.values[-1] == "1-2-a-True")
def test_space_vals_to_empty():
    """Whitespace-only values become empty strings (two of three rows here)."""
    df = pd.DataFrame(dict(foo=[" ", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["space_vals_to_empty"]}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: sum(col == "") == 2)
def test_hidden_chars():
    """The hidden_chars cleaner runs without introducing any nulls."""
    df = pd.DataFrame(dict(foo=[" ", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["hidden_chars"]}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: sum(col.isnull()) == 0)
def test_replace_hyphens_w_space():
    """Every unicode hyphen variant should be replaced by a plain space."""
    df = pd.DataFrame(dict(foo=["a‐b᠆c﹣d-e⁃f−g", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["replace_hyphen_w_space"]}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "a b c d e f g")
def test_string():
    """Joining all columns with '-' stringifies every value in row order."""
    df = pd.DataFrame(dict(a=[1], b=[2], c=["a"], d=[True]))
    data_id, column_type = "1", "string"
    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        join_cfg = dict(cols=list(df.columns), joinChar="-")
        builder = ColumnBuilder(data_id, column_type, "Col1", join_cfg)
        verify_builder(builder, lambda col: col.values[-1] == "1-2-a-True")
def test_nltk_stopwords():
    """NLTK stopwords cleaner drops common words ("do") from the text."""
    pytest.importorskip("nltk")
    df = pd.DataFrame(dict(foo=["foo do biz"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["nltk_stopwords"]}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "foo biz")
def test_from_float():
    """Float columns convert to int (truncated), str, and hex representation."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "float", "to": "int", "from": "float"}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python, so every
    # builder was named "Col0". Increment explicitly for unique names.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 1)
    cfg = {"col": "float", "to": "str", "from": "float"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "1.5")
    cfg = {"col": "float", "to": "hex", "from": "float"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "0x1.8000000000000p+0")
def test_update_case(unittest):
    """The update_case cleaner uppercases the column ("a b" -> "A B")."""
    df = pd.DataFrame(dict(foo=["a b"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["update_case"], "caseType": "upper"}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "A B")
def test_drop_stopwords():
    """The stopwords cleaner removes user-supplied words from the text."""
    df = pd.DataFrame(dict(foo=["foo bar biz"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["stopwords"], "stopwords": ["bar"]}
        # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == "foo biz")
def test_replace_hyphens_w_space():
    """Every unicode hyphen variant should be replaced by a plain space."""
    df = pd.DataFrame(dict(foo=["a‐b᠆c﹣d-e⁃f−g", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["replace_hyphen_w_space"]}
        # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == "a b c d e f g")
def test_hidden_chars():
    """The hidden_chars cleaner runs without introducing any nulls."""
    df = pd.DataFrame(dict(foo=[" ", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["hidden_chars"]}
        # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: sum(col.isnull()) == 0)
def test_replace():
    """Substring search/replace turns 'foo_bar' into 'foo_baz'."""
    import dtale.views as views

    df, _ = views.format_data(pd.DataFrame({"A": ["foo_bar"]}))
    data_id, column_type = "1", "replace"
    build_data_inst({data_id: df})
    replace_cfg = {"col": "A", "search": "_bar", "replacement": "_baz"}
    builder = ColumnBuilder(data_id, column_type, "A_replace", replace_cfg)
    verify_builder(builder, lambda col: col.values[0] == "foo_baz")
def test_multiple_cleaners(unittest):
    """Chained cleaners apply in sequence: digits dropped, then blanks emptied."""
    df = pd.DataFrame(dict(foo=["a999b", " "]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["drop_numbers", "space_vals_to_empty"]}
    # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(
        builder, lambda col: sum(col == "") == 1 and col.values[0] == "ab"
    )
def test_space_vals_to_empty():
    """Whitespace-only values become empty strings (two of three rows here)."""
    df = pd.DataFrame(dict(foo=[" ", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["space_vals_to_empty"]}
        # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: sum(col == "") == 2)
def test_keep_alpha(unittest):
    """The keep_alpha cleaner strips non-alphabetic characters ("a999b" -> "ab")."""
    df = pd.DataFrame(dict(foo=["a999b"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["keep_alpha"]}
        # BUG FIX: "++i" is a double unary plus (a no-op) in Python; increment for real.
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == "ab")