예제 #1
0
def test_guess_type_of_variable_boolean(sparse):
    """Boolean Series, with or without missing values, must be guessed as CAT.

    'sparse' is forwarded untouched to ``_convert_sparse``
    (presumably a pytest parameter/fixture toggling a sparse representation -- confirm at the decorator).
    """
    boolean_patterns = (
        [True, False, True, None],  # booleans mixed with missing values
        [True, False, True],  # booleans only
    )

    for pattern in boolean_patterns:
        series = _convert_sparse(pd.Series(pattern * 10), sparse)
        assert guess_type_of_variable(series) == TypeOfVariables.CAT
예제 #2
0
def test_ColumnsSelector__get_list_of_columns():
    """Exercise every branch of ``ColumnsSelector._get_list_of_columns``.

    Covered cases: the "all" keyword, explicit lists, None, a dtype string,
    a type-of-variable string, and the error paths (X not a DataFrame,
    2-dimensional array of columns, unsupported type, regex_match with a type string).
    """
    X = pd.DataFrame({"a": [0, 1, 2], "b": ["AAA", "BBB", "CCC"], "c": ["xx", "yy", "zz"], "d": [0.1, 0.2, 0.3]})

    # "all" is signalled by None (no explicit list of columns)
    assert ColumnsSelector._get_list_of_columns("all", X, regex_match=False) is None

    # explicit lists are returned as given
    assert ColumnsSelector._get_list_of_columns(["a"], X, regex_match=False) == ["a"]
    assert ColumnsSelector._get_list_of_columns(["a", "b"], X, regex_match=False) == ["a", "b"]
    assert ColumnsSelector._get_list_of_columns([0, 1, 2], X, regex_match=False) == [0, 1, 2]

    # None means no column at all
    assert ColumnsSelector._get_list_of_columns(None, X, regex_match=False) == []

    # a dtype string selects the columns having that dtype
    assert ColumnsSelector._get_list_of_columns("object", X, regex_match=False) == ["b", "c"]

    with pytest.raises(TypeError):
        # error : because no DataFrame ('columns' comes first, then X)
        ColumnsSelector._get_list_of_columns("object", X.values, regex_match=False)

    with pytest.raises(TypeError):
        # error : because an array of columns must be 1 dimensional
        ColumnsSelector._get_list_of_columns(X.values, X, regex_match=False)

    with pytest.raises(TypeError):
        ColumnsSelector._get_list_of_columns({"type": "not recognized"}, X, regex_match=False)

    with pytest.raises(ValueError):
        ColumnsSelector._get_list_of_columns("object", X, regex_match=True)  # error : because regex_match

    # a type-of-variable string keeps the columns guessed as being of that type
    for columns in TypeOfVariables.alls:
        assert ColumnsSelector._get_list_of_columns(columns, X) == [
            c for c in X.columns if guess_type_of_variable(X[c]) == columns
        ]
예제 #3
0
    def guess_columns_to_encode(X):
        """Return the columns of X that should be encoded.

        A column is kept when ``guess_type_of_variable`` applied to it returns
        ``TypeOfVariables.CAT``; the order of ``X.columns`` is preserved.
        """
        return [
            column for column in X.columns if guess_type_of_variable(X[column]) == TypeOfVariables.CAT
        ]
예제 #4
0
def test_guess_type_of_variable():
    """Check the guessed type of each sample column, with and without a 'category' dtype."""
    df = get_sample_df(100)
    # build a column from the first three characters of the text column
    df["cat_col_1"] = df["text_col"].apply(lambda text: text[:3])

    expected_types = {
        "float_col": "NUM",
        "int_col": "NUM",
        "text_col": "TEXT",
        "cat_col_1": "CAT",
    }
    for column, expected in expected_types.items():
        assert guess_type_of_variable(df[column]) == expected

    # casting to 'category' must change neither the guessed types nor the values
    df_with_cat = df.copy()
    df_with_cat["cat_col_1"] = df_with_cat["cat_col_1"].astype("category")

    for col in df.columns:
        assert guess_type_of_variable(df[col]) == guess_type_of_variable(df_with_cat[col])

    assert (df.values == df_with_cat.values).all()
예제 #5
0
    def _get_list_of_columns(columns, X, regex_match=False):
        """Translate the user-supplied 'columns' specification into the columns to keep.

        Parameters
        ----------
        columns : None, str, list or np.ndarray
            * None : no column, an empty list is returned
            * "all" : every column is kept, which is signalled by returning None
            * other str : either a member of ``TypeOfVariables.alls`` (columns for which
              ``guess_type_of_variable`` returns that value are kept) or a value handed
              to ``X.select_dtypes(include=...)``
            * list or 1 dimensional array : returned as is

        X : pd.DataFrame
            only inspected when 'columns' is a string other than "all"

        regex_match : bool, default False
            must be falsy when 'columns' is a string other than "all"

        Returns
        -------
        list, np.ndarray or None

        Raises
        ------
        TypeError
            if 'columns' is a string (other than "all") and X is not a DataFrame,
            if 'columns' is an array that is not 1 dimensional,
            or if 'columns' has an unsupported type
        ValueError
            if 'columns' is a string (other than "all") and regex_match is truthy
        """
        if columns is None:
            return []

        if isinstance(columns, str):
            if columns == "all":
                # 'columns' == 'all' ==> we keep all the columns
                return None

            # any other string designates a type : X is needed to resolve it
            if not isinstance(X, pd.DataFrame):
                raise TypeError(
                    "X should be a DataFrame when 'columns_to_use' or 'columns_to_drop' is a string : %s" % columns
                )

            if regex_match:
                raise ValueError(
                    "regex_match is True doesn't mean anything when 'columns' is a type : %s" % str(columns)
                )

            if columns in TypeOfVariables.alls:
                return [name for name in X.columns if guess_type_of_variable(X[name]) == columns]

            return list(X.select_dtypes(include=columns).columns)

        if isinstance(columns, list):
            return columns

        if isinstance(columns, np.ndarray):
            if columns.ndim != 1:
                raise TypeError("'columns_to_use' or 'columns_to_drop' should be a 1 dimensional array")

            return columns

        raise TypeError(
            "'columns_to_use' or 'columns_to_drop' should be either a string or a list and not %s"
            % str(type(columns))
        )
예제 #6
0
def test_guess_type_of_variable_boolean():
    """Boolean Series, with or without missing values, must be guessed as CAT."""
    boolean_patterns = (
        [True, False, True, None],  # booleans mixed with missing values
        [True, False, True],  # booleans only
    )

    for pattern in boolean_patterns:
        series = pd.Series(pattern * 10)
        assert guess_type_of_variable(series) == TypeOfVariables.CAT