def test_rolling(rolling_data):
    """Build a rolling mean (window "5") of column "0", plain and centered.

    ``rolling_data`` (presumably a pytest fixture -- confirm in conftest) is
    run through ``dtale.views.format_data`` and exposed as data id "1" by
    patching ``dtale.global_state.DATA`` for the duration of the test.
    """
    import dtale.views as views

    formatted, _ = views.format_data(rolling_data)
    data_id, column_type = "1", "rolling"

    def has_no_nulls(col):
        return col.isnull().sum() == 0

    plain_cfg = {"col": "0", "comp": "mean", "window": "5", "min_periods": 1}
    # Same settings, additionally keyed on the "date" column and centered.
    centered_cfg = dict(plain_cfg, on="date", center=True)

    with mock.patch("dtale.global_state.DATA", {data_id: formatted}):
        for cfg in (plain_cfg, centered_cfg):
            builder = ColumnBuilder(data_id, column_type, "0_rolling_mean", cfg)
            verify_builder(builder, has_no_nulls)
Exemple #2
0
def test_winsorize():
    """Winsorize column "i" (values 0..99) with three configurations.

    The frame has 100 rows with ``a = i % 5``, ``b = i % 3``, ``c = i % 4``
    and is exposed as data id "1" by patching ``dtale.global_state.DATA``.
    Every configuration is expected to keep the column sum at 4950.
    """
    df = pd.DataFrame(
        [dict(a=n % 5, b=n % 3, c=n % 4, i=n) for n in range(100)]
    )
    data_id, column_type = "1", "winsorize"

    cfgs = [
        {"col": "i", "inclusive": [True, False], "limits": [0.1, 0.1]},
        {"col": "i", "group": ["b"], "limits": [0.1, 0.1]},
        {"col": "i"},
    ]

    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        # Number the columns explicitly ("Col1", "Col2", ...): Python has no
        # ``++`` operator, so ``++i`` is just ``+(+i)`` and never increments.
        for idx, cfg in enumerate(cfgs, start=1):
            builder = ColumnBuilder(
                data_id, column_type, "Col{}".format(idx), cfg
            )
            verify_builder(builder, lambda col: col.sum() == 4950)
Exemple #3
0
def test_encoder():
    """Run the "encoder" builder with each algorithm on a "car" column.

    The column holds three repeated makes plus one NaN.  The frame is exposed
    as data id "1" by patching ``dtale.global_state.DATA``.
    """
    cars = ["Honda", "Benze", "Ford", "Honda", "Benze", "Ford", np.nan]
    frame = pd.DataFrame({"car": cars})
    data_id, column_type = "1", "encoder"

    def no_nulls(col):
        return col.isnull().sum() == 0

    def one_hot_no_nulls(col):
        # Both encoded columns are inspected before the results are combined.
        null_counts = [
            col[name].isnull().sum() for name in ["car_Ford", "car_Honda"]
        ]
        return all([count == 0 for count in null_counts])

    def hashed_no_nulls(col):
        return col["car_0"].isnull().sum() == 0

    scenarios = [
        ({"col": "car", "algo": "one_hot"}, one_hot_no_nulls),
        ({"col": "car", "algo": "ordinal"}, no_nulls),
        ({"col": "car", "algo": "label"}, no_nulls),
        ({"col": "car", "algo": "feature_hasher", "n": 1}, hashed_no_nulls),
    ]

    with mock.patch("dtale.global_state.DATA", {data_id: frame}):
        for cfg, check in scenarios:
            builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
            verify_builder(builder, check)
def test_encoder():
    """Run the "encoder" builder with each algorithm on a "car" column.

    The column holds three repeated makes plus one NaN.  The frame is
    registered under data id "1" through ``build_data_inst``.
    """
    data_id, column_type = "1", "encoder"
    frame = pd.DataFrame(
        {"car": ["Honda", "Benze", "Ford", "Honda", "Benze", "Ford", np.nan]}
    )
    build_data_inst({data_id: frame})

    def verify(cfg, check):
        verify_builder(ColumnBuilder(data_id, column_type, "Col1", cfg), check)

    def series_has_no_nulls(col):
        return col.isnull().sum() == 0

    def one_hot_has_no_nulls(col):
        # Both encoded columns are inspected before the results are combined.
        flags = [col[c].isnull().sum() == 0 for c in ["car_Ford", "car_Honda"]]
        return all(flags)

    verify({"col": "car", "algo": "one_hot"}, one_hot_has_no_nulls)
    verify({"col": "car", "algo": "ordinal"}, series_has_no_nulls)
    verify({"col": "car", "algo": "label"}, series_has_no_nulls)
    verify(
        {"col": "car", "algo": "feature_hasher", "n": 1},
        lambda col: col["car_0"].isnull().sum() == 0,
    )
def test_from_object():
    """Convert object columns to dates and booleans via "type_conversion".

    The frame comes from ``conversion_data()`` and is exposed as data id "1"
    by patching ``dtale.global_state.DATA``.  Each conversion gets its own
    column name ("Col1", "Col2", "Col3").
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"

    def first_is_20200101(col):
        return pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101"

    steps = [
        (
            {"col": "str_date", "to": "date", "from": "object", "fmt": "%Y%m%d"},
            first_is_20200101,
        ),
        ({"col": "str_date2", "to": "date", "from": "object"}, first_is_20200101),
        ({"col": "str_bool", "to": "bool", "from": "object"}, lambda col: col.values[0]),
    ]

    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        # Number the columns explicitly: Python has no ``++`` operator, so
        # ``++i`` is just ``+(+i)`` and never increments.
        for idx, (cfg, check) in enumerate(steps, start=1):
            builder = ColumnBuilder(
                data_id, column_type, "Col{}".format(idx), cfg
            )
            verify_builder(builder, check)
def test_from_date():
    """Convert the "date" column to string and integer representations.

    The frame comes from ``conversion_data()`` and is exposed as data id "1"
    by patching ``dtale.global_state.DATA``.  Each conversion gets its own
    column name ("Col1", "Col2", "Col3").
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"

    steps = [
        (
            {"col": "date", "to": "str", "from": "datetime64", "fmt": "%m/%d/%Y"},
            lambda col: col.values[0] == "01/01/2020",
        ),
        (
            {"col": "date", "to": "int", "from": "datetime64", "unit": "YYYYMMDD"},
            lambda col: col.values[0] == 20200101,
        ),
        # NOTE(review): unit is "ms" yet the expected value has epoch-seconds
        # magnitude and looks timezone-shifted (1577854800 is 05:00 UTC on
        # 2020-01-01) -- confirm against the builder's implementation.
        (
            {"col": "date", "to": "int", "from": "datetime64", "unit": "ms"},
            lambda col: col.values[0] == 1577854800,
        ),
    ]

    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        # Number the columns explicitly: Python has no ``++`` operator, so
        # ``++i`` is just ``+(+i)`` and never increments.
        for idx, (cfg, check) in enumerate(steps, start=1):
            builder = ColumnBuilder(
                data_id, column_type, "Col{}".format(idx), cfg
            )
            verify_builder(builder, check)
Exemple #7
0
def test_from_object():
    """Convert object columns to dates and booleans via "type_conversion".

    The frame comes from ``conversion_data()`` and is registered under data
    id "1" through ``build_data_inst``.  Each conversion gets its own column
    name ("Col1", "Col2", "Col3").
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    build_data_inst({data_id: df})

    def first_is_20200101(col):
        return pd.Timestamp(col.values[0]).strftime("%Y%m%d") == "20200101"

    steps = [
        (
            {"col": "str_date", "to": "date", "from": "object", "fmt": "%Y%m%d"},
            first_is_20200101,
        ),
        ({"col": "str_date2", "to": "date", "from": "object"}, first_is_20200101),
        ({"col": "str_bool", "to": "bool", "from": "object"}, lambda col: col.values[0]),
    ]

    # Number the columns explicitly: Python has no ``++`` operator, so
    # ``++i`` is just ``+(+i)`` and never increments.
    for idx, (cfg, check) in enumerate(steps, start=1):
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(idx), cfg)
        verify_builder(builder, check)
def test_rolling(rolling_data):
    """Build a rolling mean (window "5") of column "0", plain and centered.

    ``rolling_data`` (presumably a pytest fixture -- confirm in conftest) is
    run through ``dtale.views.format_data`` and registered under data id "1"
    through ``build_data_inst``.
    """
    import dtale.views as views

    data_id, column_type = "1", "rolling"
    formatted = views.format_data(rolling_data)[0]
    build_data_inst({data_id: formatted})

    def verify(cfg):
        builder = ColumnBuilder(data_id, column_type, "0_rolling_mean", cfg)
        verify_builder(builder, lambda col: col.isnull().sum() == 0)

    verify({"col": "0", "comp": "mean", "window": "5", "min_periods": 1})
    # Same computation, additionally keyed on the "date" column and centered.
    verify(
        {
            "col": "0",
            "comp": "mean",
            "window": "5",
            "min_periods": 1,
            "on": "date",
            "center": True,
        }
    )
Exemple #9
0
def test_concatenate():
    """Concatenate column "a" with column "b" and with the literal "b".

    A one-row frame (a="a", b="b") is registered under data id "1" through
    ``build_data_inst``; both configurations are expected to yield "ab".
    """
    frame = pd.DataFrame({"a": ["a"], "b": ["b"]})

    data_id, column_type = "1", "concatenate"
    build_data_inst({data_id: frame})

    def yields_ab(col):
        return col.values[0] == "ab"

    scenarios = (
        # right operand taken from another column
        ("a_b", {"left": {"col": "a"}, "right": {"col": "b"}}),
        # right operand given as a fixed value
        ("a_b2", {"left": {"col": "a"}, "right": {"val": "b"}}),
    )
    for name, cfg in scenarios:
        verify_builder(ColumnBuilder(data_id, column_type, name, cfg), yields_ab)
Exemple #10
0
def test_from_bool():
    """Convert the "bool" column to int and to str via "type_conversion".

    The frame comes from ``conversion_data()`` and is registered under data
    id "1" through ``build_data_inst``.  Each conversion gets its own column
    name ("Col1", "Col2").
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    build_data_inst({data_id: df})

    steps = [
        ({"col": "bool", "to": "int", "from": "bool"}, lambda col: col.values[0] == 1),
        (
            {"col": "bool", "to": "str", "from": "bool"},
            lambda col: col.values[0] == "True",
        ),
    ]

    # Number the columns explicitly: Python has no ``++`` operator, so
    # ``++i`` is just ``+(+i)`` and never increments.
    for idx, (cfg, check) in enumerate(steps, start=1):
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(idx), cfg)
        verify_builder(builder, check)
Exemple #11
0
def test_from_string():
    """Convert the "str_num" column to int and to float via "type_conversion".

    The frame comes from ``conversion_data()`` and is exposed as data id "1"
    by patching ``dtale.global_state.DATA``.  Each conversion gets its own
    column name ("Col1", "Col2").
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"

    steps = [
        ({"col": "str_num", "to": "int", "from": "str"}, lambda col: col.values[0] == 1),
        (
            {"col": "str_num", "to": "float", "from": "str"},
            lambda col: col.values[0] == 1.5,
        ),
    ]

    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        # Number the columns explicitly: Python has no ``++`` operator, so
        # ``++i`` is just ``+(+i)`` and never increments.
        for idx, (cfg, check) in enumerate(steps, start=1):
            builder = ColumnBuilder(
                data_id, column_type, "Col{}".format(idx), cfg
            )
            verify_builder(builder, check)
def test_standardize():
    """Run the "standardize" builder with power, quantile and robust algos.

    A frame of 1000 ``randn`` samples in column "a" is registered under data
    id "1" through ``build_data_inst``; each result must contain no nulls.
    """
    samples = pd.DataFrame({"a": randn(1000)})
    data_id, column_type = "1", "standardize"
    build_data_inst({data_id: samples})

    def no_nulls(col):
        return col.isnull().sum() == 0

    for algo in ("power", "quantile", "robust"):
        cfg = {"col": "a", "algo": algo}
        verify_builder(ColumnBuilder(data_id, column_type, "Col1", cfg), no_nulls)
Exemple #13
0
def test_drop_all_space():
    """Check that the "drop_all_space" cleaner turns "a b" into "ab".

    The one-row frame is registered under data id "1" through
    ``build_data_inst``.
    """
    df = pd.DataFrame(dict(foo=["a b"]))
    data_id, column_type = "1", "cleaning"
    build_data_inst({data_id: df})

    cfg = {"col": "foo", "cleaners": ["drop_all_space"]}
    # Named explicitly: ``"Col{}".format(++i)`` does not increment in Python
    # (``++i`` is ``+(+i)``), so the counter it relied on was dead code.
    builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
    verify_builder(builder, lambda col: col.values[0] == "ab")
Exemple #14
0
def test_drop_stopwords():
    """Check that the "stopwords" cleaner removes "bar" from "foo bar biz".

    The one-row frame is registered under data id "1" through
    ``build_data_inst``.
    """
    df = pd.DataFrame(dict(foo=["foo bar biz"]))
    data_id, column_type = "1", "cleaning"
    build_data_inst({data_id: df})

    cfg = {"col": "foo", "cleaners": ["stopwords"], "stopwords": ["bar"]}
    # Named explicitly: ``"Col{}".format(++i)`` does not increment in Python
    # (``++i`` is ``+(+i)``), so the counter it relied on was dead code.
    builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
    verify_builder(builder, lambda col: col.values[0] == "foo biz")
def test_standardize():
    """Run the "standardize" builder with power, quantile and robust algos.

    A frame of 1000 ``randn`` samples in column "a" is exposed as data id "1"
    by patching ``dtale.global_state.DATA``; each result must have no nulls.
    """
    samples = pd.DataFrame({"a": randn(1000)})
    data_id, column_type = "1", "standardize"

    def no_nulls(col):
        return col.isnull().sum() == 0

    cfgs = [{"col": "a", "algo": algo} for algo in ("power", "quantile", "robust")]

    with mock.patch("dtale.global_state.DATA", {data_id: samples}):
        for cfg in cfgs:
            builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
            verify_builder(builder, no_nulls)
def test_string():
    """Join every column of a one-row frame with "-" via the "string" builder.

    The frame (a=1, b=2, c="a", d=True) is registered under data id "1"
    through ``build_data_inst``; the last value must be "1-2-a-True".
    """
    frame = pd.DataFrame({"a": [1], "b": [2], "c": ["a"], "d": [True]})
    data_id, column_type = "1", "string"
    build_data_inst({data_id: frame})

    def joined_as_expected(col):
        return col.values[-1] == "1-2-a-True"

    all_columns = list(frame.columns)
    builder = ColumnBuilder(
        data_id, column_type, "Col1", {"cols": all_columns, "joinChar": "-"}
    )
    verify_builder(builder, joined_as_expected)
Exemple #17
0
def test_space_vals_to_empty():
    """Check that "space_vals_to_empty" leaves two empty strings.

    The input column holds a whitespace-only value, an empty string and "a";
    the frame is registered under data id "1" through ``build_data_inst``.
    """
    df = pd.DataFrame(dict(foo=["  ", "", "a"]))
    data_id, column_type = "1", "cleaning"
    build_data_inst({data_id: df})

    cfg = {"col": "foo", "cleaners": ["space_vals_to_empty"]}
    # Named explicitly: ``"Col{}".format(++i)`` does not increment in Python
    # (``++i`` is ``+(+i)``), so the counter it relied on was dead code.
    builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
    verify_builder(builder, lambda col: sum(col == "") == 2)
Exemple #18
0
def test_hidden_chars():
    """Check that the "hidden_chars" cleaner produces no null values.

    The input column holds a whitespace-only value, an empty string and "a";
    the frame is registered under data id "1" through ``build_data_inst``.
    """
    df = pd.DataFrame(dict(foo=["  ", "", "a"]))
    data_id, column_type = "1", "cleaning"
    build_data_inst({data_id: df})

    cfg = {"col": "foo", "cleaners": ["hidden_chars"]}
    # Named explicitly: ``"Col{}".format(++i)`` does not increment in Python
    # (``++i`` is ``+(+i)``), so the counter it relied on was dead code.
    builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
    verify_builder(builder, lambda col: sum(col.isnull()) == 0)
Exemple #19
0
def test_replace_hyphens_w_space():
    """Check that "replace_hyphen_w_space" swaps hyphen variants for spaces.

    The first value mixes several Unicode hyphen-like characters with the
    ASCII hyphen; the expected result is "a b c d e f g".  The frame is
    registered under data id "1" through ``build_data_inst``.
    """
    df = pd.DataFrame(dict(foo=["a‐b᠆c﹣d-e⁃f−g", "", "a"]))
    data_id, column_type = "1", "cleaning"
    build_data_inst({data_id: df})

    cfg = {"col": "foo", "cleaners": ["replace_hyphen_w_space"]}
    # Named explicitly: ``"Col{}".format(++i)`` does not increment in Python
    # (``++i`` is ``+(+i)``), so the counter it relied on was dead code.
    builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
    verify_builder(builder, lambda col: col.values[0] == "a b c d e f g")
def test_string():
    """Join every column of a one-row frame with "-" via the "string" builder.

    The frame (a=1, b=2, c="a", d=True) is exposed as data id "1" by patching
    ``dtale.global_state.DATA``; the last value must be "1-2-a-True".
    """
    frame = pd.DataFrame({"a": [1], "b": [2], "c": ["a"], "d": [True]})
    data_id, column_type = "1", "string"

    def joined_as_expected(col):
        return col.values[-1] == "1-2-a-True"

    with mock.patch("dtale.global_state.DATA", {data_id: frame}):
        join_cfg = {"cols": list(frame.columns), "joinChar": "-"}
        verify_builder(
            ColumnBuilder(data_id, column_type, "Col1", join_cfg),
            joined_as_expected,
        )
Exemple #21
0
def test_nltk_stopwords():
    """Check that "nltk_stopwords" turns "foo do biz" into "foo biz".

    Skipped via ``pytest.importorskip`` when ``nltk`` is not importable.  The
    one-row frame is registered under data id "1" through
    ``build_data_inst``.
    """
    pytest.importorskip("nltk")
    df = pd.DataFrame(dict(foo=["foo do biz"]))
    data_id, column_type = "1", "cleaning"
    build_data_inst({data_id: df})

    cfg = {"col": "foo", "cleaners": ["nltk_stopwords"]}
    # Named explicitly: ``"Col{}".format(++i)`` does not increment in Python
    # (``++i`` is ``+(+i)``), so the counter it relied on was dead code.
    builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
    verify_builder(builder, lambda col: col.values[0] == "foo biz")
Exemple #22
0
def test_from_float():
    """Convert the "float" column to int, str and hex via "type_conversion".

    The frame comes from ``conversion_data()`` and is registered under data
    id "1" through ``build_data_inst``.  Each conversion gets its own column
    name ("Col1", "Col2", "Col3").
    """
    df = conversion_data()
    data_id, column_type = "1", "type_conversion"
    build_data_inst({data_id: df})

    steps = [
        ({"col": "float", "to": "int", "from": "float"}, lambda col: col.values[0] == 1),
        (
            {"col": "float", "to": "str", "from": "float"},
            lambda col: col.values[0] == "1.5",
        ),
        (
            {"col": "float", "to": "hex", "from": "float"},
            lambda col: col.values[0] == "0x1.8000000000000p+0",
        ),
    ]

    # Number the columns explicitly: Python has no ``++`` operator, so
    # ``++i`` is just ``+(+i)`` and never increments.
    for idx, (cfg, check) in enumerate(steps, start=1):
        builder = ColumnBuilder(data_id, column_type, "Col{}".format(idx), cfg)
        verify_builder(builder, check)
Exemple #23
0
def test_update_case(unittest):
    """Check that "update_case" with caseType "upper" turns "a b" into "A B".

    The one-row frame is registered under data id "1" through
    ``build_data_inst``.  The ``unittest`` argument is accepted but not used
    in the body.
    """
    df = pd.DataFrame(dict(foo=["a b"]))
    data_id, column_type = "1", "cleaning"
    build_data_inst({data_id: df})

    cfg = {"col": "foo", "cleaners": ["update_case"], "caseType": "upper"}
    # Named explicitly: ``"Col{}".format(++i)`` does not increment in Python
    # (``++i`` is ``+(+i)``), so the counter it relied on was dead code.
    builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
    verify_builder(builder, lambda col: col.values[0] == "A B")
Exemple #24
0
def test_drop_stopwords():
    """Check that the "stopwords" cleaner removes "bar" from "foo bar biz".

    The one-row frame is exposed as data id "1" by patching
    ``dtale.global_state.DATA``.
    """
    df = pd.DataFrame(dict(foo=["foo bar biz"]))
    data_id, column_type = "1", "cleaning"

    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        cfg = {"col": "foo", "cleaners": ["stopwords"], "stopwords": ["bar"]}
        # Named explicitly: ``"Col{}".format(++i)`` does not increment in
        # Python (``++i`` is ``+(+i)``), so its counter was dead code.
        builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
        verify_builder(builder, lambda col: col.values[0] == "foo biz")
Exemple #25
0
def test_replace_hyphens_w_space():
    """Check that "replace_hyphen_w_space" swaps hyphen variants for spaces.

    The first value mixes several Unicode hyphen-like characters with the
    ASCII hyphen; the expected result is "a b c d e f g".  The frame is
    exposed as data id "1" by patching ``dtale.global_state.DATA``.
    """
    df = pd.DataFrame(dict(foo=["a‐b᠆c﹣d-e⁃f−g", "", "a"]))
    data_id, column_type = "1", "cleaning"

    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        cfg = {"col": "foo", "cleaners": ["replace_hyphen_w_space"]}
        # Named explicitly: ``"Col{}".format(++i)`` does not increment in
        # Python (``++i`` is ``+(+i)``), so its counter was dead code.
        builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
        verify_builder(builder, lambda col: col.values[0] == "a b c d e f g")
Exemple #26
0
def test_hidden_chars():
    """Check that the "hidden_chars" cleaner produces no null values.

    The input column holds a whitespace-only value, an empty string and "a";
    the frame is exposed as data id "1" by patching
    ``dtale.global_state.DATA``.
    """
    df = pd.DataFrame(dict(foo=["  ", "", "a"]))
    data_id, column_type = "1", "cleaning"

    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        cfg = {"col": "foo", "cleaners": ["hidden_chars"]}
        # Named explicitly: ``"Col{}".format(++i)`` does not increment in
        # Python (``++i`` is ``+(+i)``), so its counter was dead code.
        builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
        verify_builder(builder, lambda col: sum(col.isnull()) == 0)
Exemple #27
0
def test_replace():
    """Replace "_bar" with "_baz" in column "A" via the "replace" builder.

    A one-row frame holding "foo_bar" is run through
    ``dtale.views.format_data`` and registered under data id "1" through
    ``build_data_inst``; the first value must become "foo_baz".
    """
    import dtale.views as views

    raw = pd.DataFrame({"A": ["foo_bar"]})
    formatted = views.format_data(raw)[0]
    data_id, column_type = "1", "replace"
    build_data_inst({data_id: formatted})

    def replaced(col):
        return col.values[0] == "foo_baz"

    builder = ColumnBuilder(
        data_id,
        column_type,
        "A_replace",
        {"col": "A", "search": "_bar", "replacement": "_baz"},
    )
    verify_builder(builder, replaced)
Exemple #28
0
def test_multiple_cleaners(unittest):
    """Chain "drop_numbers" and "space_vals_to_empty" on one column.

    Input values are "a999b" and " "; the result must have exactly one empty
    string and "ab" as its first value.  The frame is registered under data
    id "1" through ``build_data_inst``.  The ``unittest`` argument is
    accepted but not used in the body.
    """
    df = pd.DataFrame(dict(foo=["a999b", " "]))
    data_id, column_type = "1", "cleaning"
    build_data_inst({data_id: df})

    cfg = {"col": "foo", "cleaners": ["drop_numbers", "space_vals_to_empty"]}
    # Named explicitly: ``"Col{}".format(++i)`` does not increment in Python
    # (``++i`` is ``+(+i)``), so the counter it relied on was dead code.
    builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
    verify_builder(
        builder, lambda col: sum(col == "") == 1 and col.values[0] == "ab"
    )
Exemple #29
0
def test_space_vals_to_empty():
    """Check that "space_vals_to_empty" leaves two empty strings.

    The input column holds a whitespace-only value, an empty string and "a";
    the frame is exposed as data id "1" by patching
    ``dtale.global_state.DATA``.
    """
    df = pd.DataFrame(dict(foo=["  ", "", "a"]))
    data_id, column_type = "1", "cleaning"

    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        cfg = {"col": "foo", "cleaners": ["space_vals_to_empty"]}
        # Named explicitly: ``"Col{}".format(++i)`` does not increment in
        # Python (``++i`` is ``+(+i)``), so its counter was dead code.
        builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
        verify_builder(builder, lambda col: sum(col == "") == 2)
Exemple #30
0
def test_keep_alpha(unittest):
    """Check that the "keep_alpha" cleaner turns "a999b" into "ab".

    The one-row frame is exposed as data id "1" by patching
    ``dtale.global_state.DATA``.  The ``unittest`` argument is accepted but
    not used in the body.
    """
    df = pd.DataFrame(dict(foo=["a999b"]))
    data_id, column_type = "1", "cleaning"

    with mock.patch("dtale.global_state.DATA", {data_id: df}):
        cfg = {"col": "foo", "cleaners": ["keep_alpha"]}
        # Named explicitly: ``"Col{}".format(++i)`` does not increment in
        # Python (``++i`` is ``+(+i)``), so its counter was dead code.
        builder = ColumnBuilder(data_id, column_type, "Col1", cfg)
        verify_builder(builder, lambda col: col.values[0] == "ab")