def test_drop_all_space():
    """Verify the 'drop_all_space' cleaner strips every space from string values."""
    df = pd.DataFrame(dict(foo=["a b"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["drop_all_space"]}
    # NOTE: the original used "Col{}".format(++i); ++i is a no-op in Python
    # (double unary plus), so the counter never advanced. Increment explicitly.
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "ab")
def test_drop_stopwords():
    """Verify the 'stopwords' cleaner removes user-supplied stopwords.

    NOTE(review): a second ``test_drop_stopwords`` is defined later in this
    file and will shadow this one under pytest collection — consider renaming.
    """
    df = pd.DataFrame(dict(foo=["foo bar biz"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["stopwords"], "stopwords": ["bar"]}
    i += 1  # ++i is a no-op in Python; increment explicitly
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "foo biz")
def test_nltk_stopwords():
    """Verify the 'nltk_stopwords' cleaner drops NLTK stopwords (skips if nltk missing)."""
    pytest.importorskip("nltk")
    df = pd.DataFrame(dict(foo=["foo do biz"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["nltk_stopwords"]}
    i += 1  # ++i is a no-op in Python; increment explicitly
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "foo biz")
def test_hidden_chars():
    """Verify the 'hidden_chars' cleaner leaves no null values behind.

    NOTE(review): a second ``test_hidden_chars`` is defined later in this
    file and will shadow this one under pytest collection — consider renaming.
    """
    df = pd.DataFrame(dict(foo=[" ", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["hidden_chars"]}
    i += 1  # ++i is a no-op in Python; increment explicitly
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: sum(col.isnull()) == 0)
def test_replace_hyphens_w_space():
    """Verify 'replace_hyphen_w_space' turns every unicode hyphen variant into a space.

    NOTE(review): a second ``test_replace_hyphens_w_space`` is defined later in
    this file and will shadow this one under pytest collection.
    """
    # input mixes several unicode hyphen codepoints (U+2010, U+1806, U+FE63, ASCII -, U+2043, U+2212)
    df = pd.DataFrame(dict(foo=["a‐b᠆c﹣d-e⁃f−g", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["replace_hyphen_w_space"]}
    i += 1  # ++i is a no-op in Python; increment explicitly
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "a b c d e f g")
def test_space_vals_to_empty():
    """Verify 'space_vals_to_empty' converts whitespace-only values to empty strings.

    NOTE(review): a second ``test_space_vals_to_empty`` is defined later in
    this file and will shadow this one under pytest collection.
    """
    df = pd.DataFrame(dict(foo=[" ", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["space_vals_to_empty"]}
    i += 1  # ++i is a no-op in Python; increment explicitly
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: sum(col == "") == 2)
def test_update_case(unittest):
    """Verify the 'update_case' cleaner upper-cases values when caseType='upper'."""
    df = pd.DataFrame(dict(foo=["a b"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["update_case"], "caseType": "upper"}
    i += 1  # ++i is a no-op in Python; increment explicitly
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "A B")
def test_hidden_chars():
    """Verify the 'hidden_chars' cleaner leaves no null values (mock.patch variant)."""
    df = pd.DataFrame(dict(foo=[" ", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["hidden_chars"]}
        i += 1  # ++i is a no-op in Python; increment explicitly
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: sum(col.isnull()) == 0)
def test_replace_hyphens_w_space():
    """Verify 'replace_hyphen_w_space' replaces unicode hyphens (mock.patch variant)."""
    # input mixes several unicode hyphen codepoints (U+2010, U+1806, U+FE63, ASCII -, U+2043, U+2212)
    df = pd.DataFrame(dict(foo=["a‐b᠆c﹣d-e⁃f−g", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["replace_hyphen_w_space"]}
        i += 1  # ++i is a no-op in Python; increment explicitly
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == "a b c d e f g")
def test_multiple_cleaners(unittest):
    """Verify chained cleaners apply in order: drop digits, then blank out spaces.

    NOTE(review): a second ``test_multiple_cleaners`` is defined later in this
    file and will shadow this one under pytest collection.
    """
    df = pd.DataFrame(dict(foo=["a999b", " "]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["drop_numbers", "space_vals_to_empty"]}
    i += 1  # ++i is a no-op in Python; increment explicitly
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: sum(col == "") == 1 and col.values[0] == "ab")
def test_drop_stopwords():
    """Verify the 'stopwords' cleaner removes supplied stopwords (mock.patch variant)."""
    df = pd.DataFrame(dict(foo=["foo bar biz"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["stopwords"], "stopwords": ["bar"]}
        i += 1  # ++i is a no-op in Python; increment explicitly
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == "foo biz")
def test_space_vals_to_empty():
    """Verify 'space_vals_to_empty' blanks whitespace-only values (mock.patch variant)."""
    df = pd.DataFrame(dict(foo=[" ", "", "a"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["space_vals_to_empty"]}
        i += 1  # ++i is a no-op in Python; increment explicitly
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: sum(col == "") == 2)
def test_keep_alpha(unittest):
    """Verify the 'keep_alpha' cleaner retains only alphabetic characters."""
    df = pd.DataFrame(dict(foo=["a999b"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["keep_alpha"]}
        i += 1  # ++i is a no-op in Python; increment explicitly
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == "ab")
def test_multiple_cleaners(unittest):
    """Verify chained cleaners apply in order (mock.patch variant)."""
    df = pd.DataFrame(dict(foo=["a999b", " "]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["drop_numbers", "space_vals_to_empty"]}
        i += 1  # ++i is a no-op in Python; increment explicitly
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(
            builder, lambda col: sum(col == "") == 1 and col.values[0] == "ab"
        )
def test_from_bool():
    """Verify bool->int and bool->str type conversions."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    build_data_inst({data_id: df})

    cfg = {"col": "bool", "to": "int", "from": "bool"}
    i += 1  # ++i is a no-op in Python; increment explicitly before each use
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 1)

    cfg = {"col": "bool", "to": "str", "from": "bool"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "True")
def test_normalize_accents(unittest):
    """Verify 'normalize_accents' strips diacritics from characters.

    NOTE(review): a second ``test_normalize_accents`` is defined later in this
    file and will shadow this one under pytest collection.
    """
    df = pd.DataFrame(dict(foo=["naive cafe"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    build_data_inst({data_id: df})
    cfg = {"col": "foo", "cleaners": ["normalize_accents"]}
    i += 1  # ++i is a no-op in Python; increment explicitly
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)

    def test(col):
        unittest.assertEqual(col.values[0], "naive cafe")
        return True

    verify_builder(builder, test)
def test_from_string():
    """Verify str->int and str->float conversions (mock.patch variant).

    NOTE(review): other ``test_from_string`` definitions exist in this file and
    the last one shadows the rest under pytest collection.
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))

        cfg = {"col": "str_num", "to": "int", "from": "str"}
        i += 1  # ++i is a no-op in Python; increment explicitly before each use
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 1)

        cfg = {"col": "str_num", "to": "float", "from": "str"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 1.5)
def test_normalize_accents(unittest):
    """Verify 'normalize_accents' strips diacritics (mock.patch variant)."""
    df = pd.DataFrame(dict(foo=["naive cafe"]))
    data_id, column_type = "1", "cleaning"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))
        cfg = {"col": "foo", "cleaners": ["normalize_accents"]}
        i += 1  # ++i is a no-op in Python; increment explicitly
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)

        def test(col):
            unittest.assertEqual(col.values[0], "naive cafe")
            return True

        verify_builder(builder, test)
def test_from_string():
    """Verify string/mixed-type conversions, including NaN handling (mock.patch variant)."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))

        cfg = {"col": "str_num", "to": "int", "from": "str"}
        i += 1  # ++i is a no-op in Python; increment explicitly before each use
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 1)

        cfg = {"col": "str_num", "to": "float", "from": "str"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 1.5)

    # mixed-type columns: blanks and junk strings should convert to NaN
    df = pd.DataFrame(
        dict(
            a=[1, 2, 3, "", 5, 6, 7, 8, 9, 10],
            b=[True, True, False, "", "False", True, False, True, False, True],
            c=["1", "00", "1.05", " ", " ", "", "02", "..", "none", "nan"],
        )
    )
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))

        cfg = {"col": "a", "to": "float", "from": "mixed-integer"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.sum() == 51)
        assert np.isnan(builder.build_column().values[3])

        cfg = {"col": "b", "to": "bool", "from": "mixed-integer"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.sum() == 5)
        assert np.isnan(builder.build_column().values[3])

        cfg = {"col": "c", "to": "float", "from": "str"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(
            builder, lambda col: col.sum() == 4.05 and col.isnull().sum() == 6
        )
def test_from_object():
    """Verify object->date (with/without fmt) and object->bool conversions (mock.patch variant).

    NOTE(review): a second ``test_from_object`` is defined later in this file
    and will shadow this one under pytest collection.
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))

        cfg = {"col": "str_date", "to": "date", "from": "object", "fmt": "%Y%m%d"}
        i += 1  # ++i is a no-op in Python; increment explicitly before each use
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(
            builder,
            lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101",
        )

        # no fmt supplied: conversion must infer the date format
        cfg = {"col": "str_date2", "to": "date", "from": "object"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(
            builder,
            lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101",
        )

        cfg = {"col": "str_bool", "to": "bool", "from": "object"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0])
def test_from_date():
    """Verify date->str, date->int(YYYYMMDD) and date->int(ms) conversions (mock.patch variant).

    NOTE(review): a second ``test_from_date`` is defined later in this file and
    will shadow this one under pytest collection.
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))

        cfg = {"col": "date", "to": "str", "from": "datetime64", "fmt": "%m/%d/%Y"}
        i += 1  # ++i is a no-op in Python; increment explicitly before each use
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == "01/01/2020")

        cfg = {"col": "date", "to": "int", "from": "datetime64", "unit": "YYYYMMDD"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 20200101)

        # NOTE(review): expected epoch value is timezone-dependent; this may
        # fail on machines outside the original author's timezone — confirm.
        cfg = {"col": "date", "to": "int", "from": "datetime64", "unit": "ms"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 1577854800)
def test_from_object():
    """Verify object->date (with/without fmt) and object->bool conversions."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    build_data_inst({data_id: df})

    cfg = {"col": "str_date", "to": "date", "from": "object", "fmt": "%Y%m%d"}
    i += 1  # ++i is a no-op in Python; increment explicitly before each use
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(
        builder,
        lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101",
    )

    # no fmt supplied: conversion must infer the date format
    cfg = {"col": "str_date2", "to": "date", "from": "object"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(
        builder,
        lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101",
    )

    cfg = {"col": "str_bool", "to": "bool", "from": "object"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0])
def test_from_float():
    """Verify float->int, float->str and float->hex conversions."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    build_data_inst({data_id: df})

    cfg = {"col": "float", "to": "int", "from": "float"}
    i += 1  # ++i is a no-op in Python; increment explicitly before each use
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 1)

    cfg = {"col": "float", "to": "str", "from": "float"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "1.5")

    # float.hex(1.5) == "0x1.8000000000000p+0"
    cfg = {"col": "float", "to": "hex", "from": "float"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "0x1.8000000000000p+0")
def test_from_category():
    """Verify category->int, category->bool and category->str conversions.

    NOTE(review): a second ``test_from_category`` is defined later in this file
    and will shadow this one under pytest collection.
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    build_data_inst({data_id: df})

    cfg = {"col": "cat_int", "to": "int", "from": "category"}
    i += 1  # ++i is a no-op in Python; increment explicitly before each use
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 1)

    # np.bool was removed in NumPy 1.24; use np.bool_ only (matches the
    # sibling test_from_category variant in this file).
    cfg = {"col": "cat_bool", "to": "bool", "from": "category"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(
        builder,
        lambda col: isinstance(col.values[0], np.bool_)
        and np.bool_(True) == col.values[0],
    )

    cfg = {"col": "cat_str", "to": "str", "from": "category"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "a")
def test_from_category():
    """Verify category->int, category->bool and category->str conversions (mock.patch variant)."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))

        cfg = {"col": "cat_int", "to": "int", "from": "category"}
        i += 1  # ++i is a no-op in Python; increment explicitly before each use
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 1)

        cfg = {"col": "cat_bool", "to": "bool", "from": "category"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(
            builder,
            lambda col: isinstance(col.values[0], np.bool_)
            and np.bool_(True) == col.values[0],
        )

        cfg = {"col": "cat_str", "to": "str", "from": "category"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == "a")
def test_from_date():
    """Verify date->str, date->int(YYYYMMDD) and date->int(ms) conversions."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    build_data_inst({data_id: df})

    cfg = {"col": "date", "to": "str", "from": "datetime64", "fmt": "%m/%d/%Y"}
    i += 1  # ++i is a no-op in Python; increment explicitly before each use
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "01/01/2020")

    cfg = {"col": "date", "to": "int", "from": "datetime64", "unit": "YYYYMMDD"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 20200101)

    cfg = {"col": "date", "to": "int", "from": "datetime64", "unit": "ms"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    # this fails due to local machine timezone. The time here is "Jan 01 2020 05:00:00 GMT+0000"
    verify_builder(builder, lambda col: col.values[0] == 1577854800)
def test_from_int():
    """Verify int conversions: float, str, category, bool, date (two units) and hex.

    NOTE(review): a second ``test_from_int`` is defined later in this file and
    will shadow this one under pytest collection.
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    build_data_inst({data_id: df})

    cfg = {"col": "int", "to": "float", "from": "int"}
    i += 1  # ++i is a no-op in Python; increment explicitly before each use
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 1.0)

    cfg = {"col": "int", "to": "str", "from": "int"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "1")

    cfg = {"col": "int", "to": "category", "from": "int"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.dtype.name == "category")

    cfg = {"col": "int", "to": "bool", "from": "int"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(
        builder,
        lambda col: isinstance(col.values[0], np.bool_)
        and np.bool_(True) == col.values[0],
    )

    # YYYYMMDD-style integers parse to calendar dates
    cfg = {"col": "int_date", "to": "date", "from": "int", "unit": "YYYYMMDD"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(
        builder,
        lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101",
    )

    # epoch seconds parse to timestamps
    cfg = {"col": "int_s", "to": "date", "from": "int", "unit": "s"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(
        builder,
        lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20170322",
    )

    cfg = {"col": "int", "to": "hex", "from": "int"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == "0x1")
def test_from_int():
    """Verify int conversions: float, str, category, bool and date (mock.patch variant)."""
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    i = 0
    with ExitStack() as stack:
        stack.enter_context(mock.patch("dtale.global_state.DATA", {data_id: df}))

        cfg = {"col": "int", "to": "float", "from": "int"}
        i += 1  # ++i is a no-op in Python; increment explicitly before each use
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == 1.0)

        cfg = {"col": "int", "to": "str", "from": "int"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.values[0] == "1")

        cfg = {"col": "int", "to": "category", "from": "int"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(builder, lambda col: col.dtype.name == "category")

        cfg = {"col": "int", "to": "bool", "from": "int"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(
            builder,
            lambda col: isinstance(col.values[0], np.bool_)
            and np.bool_(True) == col.values[0],
        )

        # YYYYMMDD-style integers parse to calendar dates
        cfg = {"col": "int_date", "to": "date", "from": "int", "unit": "YYYYMMDD"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(
            builder,
            lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101",
        )

        # epoch seconds parse to timestamps
        cfg = {"col": "int_s", "to": "date", "from": "int", "unit": "s"}
        i += 1
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
        verify_builder(
            builder,
            lambda col: pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20170322",
        )
def test_from_string():
    """Verify string conversions including hex-string parsing and mixed-type NaN handling."""
    df = conversion_data()
    # derive hex-string columns from the numeric ones for round-trip checks
    df.loc[:, "hex_int"] = df["int"].apply(hex)
    df.loc[:, "hex_float"] = df["float"].apply(float.hex)
    data_id, column_type = "1", "type_conversion"
    i = 0
    build_data_inst({data_id: df})

    cfg = {"col": "str_num", "to": "int", "from": "str"}
    i += 1  # ++i is a no-op in Python; increment explicitly before each use
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 1)

    cfg = {"col": "str_num", "to": "float", "from": "str"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 1.5)

    cfg = {"col": "hex_int", "to": "int", "from": "str"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 1)

    cfg = {"col": "hex_float", "to": "float", "from": "str"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.values[0] == 1.5)

    # mixed-type columns: blanks and junk strings should convert to NaN
    df = pd.DataFrame(
        dict(
            a=[1, 2, 3, "", 5, 6, 7, 8, 9, 10],
            b=[True, True, False, "", "False", True, False, True, False, True],
            c=["1", "00", "1.05", " ", " ", "", "02", "..", "none", "nan"],
        )
    )
    build_data_inst({data_id: df})

    cfg = {"col": "a", "to": "float", "from": "mixed-integer"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.sum() == 51)
    assert np.isnan(builder.build_column().values[3])

    cfg = {"col": "b", "to": "bool", "from": "mixed-integer"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.sum() == 5)
    assert np.isnan(builder.build_column().values[3])

    cfg = {"col": "c", "to": "float", "from": "str"}
    i += 1
    builder = ColumnBuilder(data_id, column_type, "Col{}".format(i), cfg)
    verify_builder(builder, lambda col: col.sum() == 4.05 and col.isnull().sum() == 6)