def test_generic_hstack():
    df1 = pd.DataFrame({"a": list(range(10)), "b": ["aaaa", "bbbbb", "cccc"] * 3 + ["ezzzz"]})
    df2 = pd.DataFrame({"c": list(range(10)), "d": ["aaaa", "bbbbb", "cccc"] * 3 + ["ezzzz"]})

    df12 = generic_hstack((df1, df2))
    assert get_type(df12) == DataTypes.DataFrame
    assert df12.shape == (10, 4)
    assert list(df12.columns) == ["a", "b", "c", "d"]

    df1 = pd.DataFrame({"a": list(range(10)), "b": ["aaaa", "bbbbb", "cccc"] * 3 + ["ezzzz"]})
    df2 = pd.DataFrame(
        {"c": list(range(10)), "d": ["aaaa", "bbbbb", "cccc"] * 3 + ["ezzzz"]},
        index=[1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
    )

    df12 = generic_hstack((df1, df2))
    assert np.array_equal(df12.index.values, np.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19]))
    assert get_type(df12) == DataTypes.DataFrame
    assert df12.shape == (10, 4)
    assert list(df12.columns) == ["a", "b", "c", "d"]

    df12 = generic_hstack((df1, df2), output_type=DataTypes.NumpyArray)
    assert get_type(df12) == DataTypes.NumpyArray
    assert df12.shape == (10, 4)

    with pytest.raises(ValueError):
        generic_hstack((df1.head(3), df2.head(4)))

    with pytest.raises(ValueError):
        generic_hstack((df1.head(3).values, df2.head(4)))

    with pytest.raises(ValueError):
        generic_hstack((df1.head(3).values, df2.head(4).values))
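
# A minimal sketch of the behavior test_generic_hstack relies on, assuming
# generic_hstack validates row counts and concatenates column-wise. This is an
# illustration under those assumptions, not aikit's actual implementation.
def _sketch_generic_hstack(all_Xs):
    nrows = {X.shape[0] for X in all_Xs}
    if len(nrows) != 1:
        raise ValueError("all inputs must have the same number of rows")
    if all(isinstance(X, pd.DataFrame) for X in all_Xs):
        result = pd.concat([X.reset_index(drop=True) for X in all_Xs], axis=1)
        result.index = all_Xs[-1].index  # assumption: one shared index is re-applied
        return result
    return np.hstack([np.asarray(X) for X in all_Xs])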
Example #2
def test_conversion():

    np.random.seed(123)

    array1 = np.random.randn(10, 3)

    all_objects = {
        "a1": (array1, DataTypes.NumpyArray),
        "a2": (1 * (array1 > 0), DataTypes.NumpyArray),
        "a3": (array1[:, 1], DataTypes.NumpyArray),
        "df1": (pd.DataFrame(array1, columns=["A", "B", "C"]), DataTypes.DataFrame),
        "df2": (pd.DataFrame(1 * (array1 > 0), columns=["a", "b", "c"]), DataTypes.DataFrame),
        "s1": (sparse.csr_matrix(array1), DataTypes.SparseArray),
        "s2": (sparse.csr_matrix(1 * (array1 > 0)), DataTypes.SparseArray),
        # "dfs1":(pd.SparseDataFrame(sparse.csr_matrix(array1),columns=["A","B","C"]) , data_type.SparseDataFrame)
        # "dfs2":(pd.SparseDataFrame(sparse.csr_matrix(1*(array1 > 0)),columns=["a","b","c"]), data_type.SparseDataFrame)
    }

    for name, (obj, expected_type) in all_objects.items():
        assert get_type(obj) == expected_type

        converted = convert_to_dataframe(obj)
        assert get_type(converted) == DataTypes.DataFrame

        converted = convert_to_array(obj)
        assert get_type(converted) == DataTypes.NumpyArray

        converted = convert_to_sparsearray(obj)
        assert get_type(converted) == DataTypes.SparseArray

        # converted = convert_to_sparsedataframe(obj)
        # assert get_type(converted) == DataTypes.SparseDataFrame

    assert np.array_equal(convert_to_array(all_objects["df1"][0]), all_objects["a1"][0])
    assert np.array_equal(convert_to_array(all_objects["s1"][0]), all_objects["a1"][0])
Example #3
def test_get_type():
    df = pd.DataFrame({"a": np.arange(10)})
    dfs = pd.SparseDataFrame({"a": [0, 0, 0, 1, 1]})  # pd.SparseDataFrame was removed in pandas 1.0, so this requires pandas < 1.0

    assert get_type(df) == DataTypes.DataFrame
    assert get_type(df["a"]) == DataTypes.Serie
    assert get_type(df.values) == DataTypes.NumpyArray
    assert get_type(sparse.coo_matrix(df.values)) == DataTypes.SparseArray
    assert get_type(dfs) == DataTypes.SparseDataFrame
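
# A minimal sketch of a get_type-like dispatcher consistent with the assertions
# above (assumed, not aikit's actual code). In pandas < 1.0 a real implementation
# would need to check pd.SparseDataFrame before pd.DataFrame (it is a subclass).
def _sketch_get_type(obj):
    if isinstance(obj, pd.DataFrame):
        return DataTypes.DataFrame
    elif isinstance(obj, pd.Series):
        return DataTypes.Serie
    elif sparse.issparse(obj):
        return DataTypes.SparseArray
    elif isinstance(obj, np.ndarray):
        return DataTypes.NumpyArray
    else:
        raise TypeError("unknown type %s" % type(obj))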
Example #4
    def fit_transform(self, X, y=None, **fit_params):
        if self.verbose:
            print("withing 'DebugPassThrought' fit_transform named %s" % self.name)
            if fit_params:
                print("fit_params given")
                print(fit_params)

        if self.debug:
            self._expected_type = dsh.get_type(X)
            self._expected_nbcols = dsh._nbcols(X)
            if self._expected_type in (dsh.DataTypes.DataFrame, dsh.DataTypes.SparseDataFrame):
                self._expected_columns = list(X.columns)

            self.fit_params = fit_params  # stored, just to help test

        Xres = X
        if self.column_prefix is not None:
            Xres = X.copy()
            Xres.columns = [self.column_prefix + "_" + c for c in Xres.columns]

        self._features = getattr(Xres, "columns", None)
        if self._features is not None:
            self._features = list(self._features)

        return Xres
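
# A hedged usage sketch for the fit_transform above, assuming DebugPassThrough
# accepts the attributes it reads (name, verbose, debug, column_prefix):
#   dpt = DebugPassThrough(name="step1", verbose=True, column_prefix="PRE")
#   Xres = dpt.fit_transform(pd.DataFrame({"a": [1, 2], "b": [3, 4]}))
#   list(Xres.columns)  # -> ["PRE_a", "PRE_b"], also stored in dpt._features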
Example #5
    def transform(self, X):

        check_is_fitted(self, "_all_mapping")

        if get_type(X) != DataTypes.DataFrame:
            raise TypeError("This transformer only works for DataFrame")

        if X.shape[1] != self._nb_columns:
            raise ValueError("X doesn't have the correct number of columns")

        all_res = []
        for j in range(X.shape[1]):

            index_line = self._all_mapping[j].loc[X.iloc[:, j]].values
            index_col = np.arange(len(self._all_mapping[j]) - 1, dtype=np.int32)

            assert index_line.ndim == 1
            assert index_col.ndim == 1

            res_j = (index_line[:, np.newaxis] > index_col[np.newaxis, :]).astype(self.dtype)

            all_res.append(res_j)

        result = np.concatenate(all_res, axis=1)

        return pd.DataFrame(result, columns=self.get_feature_names(), index=X.index)
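
# Worked mini-example of the broadcast in the transform above (a thermometer
# style encoding): with 3 sorted modalities mapped to codes 0..2, a code k
# becomes k leading ones over n_modalities - 1 output columns.
#   codes = np.array([0, 2, 1])                # index_line for one column
#   cols = np.arange(3 - 1, dtype=np.int32)    # index_col: [0, 1]
#   (codes[:, np.newaxis] > cols[np.newaxis, :]).astype(np.int32)
#   # -> array([[0, 0],
#   #           [1, 1],
#   #           [1, 0]], dtype=int32)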
Example #6
    def fit(self, X, y=None, **fit_params):
        if self.verbose:
            print("within 'DebugPassThrough' fit named %s" % self.name)
            if fit_params:
                print("fit_params given")
                print(fit_params)

        if self.debug:
            self._expected_type = dsh.get_type(X)
            self._expected_nbcols = dsh._nbcols(X)
            if self._expected_type in (dsh.DataTypes.DataFrame, dsh.DataTypes.SparseDataFrame):
                self._expected_columns = list(X.columns)

            self.fit_params = fit_params  # stored, just to help test

        if self.column_prefix is None:
            self._features = getattr(X, "columns", None)
            if self._features is not None:
                self._features = list(self._features)
        else:
            if hasattr(X, "columns"):
                self._features = [self.column_prefix + "_" + c for c in X.columns]
            else:
                self._features = None

        return self
Example #7
def test_conversion():

    np.random.seed(123)

    array1 = np.random.randn(10, 3)

    all_objects = {
        "a1": (array1, DataTypes.NumpyArray),
        "a2": (1 * (array1 > 0), DataTypes.NumpyArray),
        "a3": (array1[:, 1], DataTypes.NumpyArray),
        "df1": (pd.DataFrame(array1, columns=["A", "B",
                                              "C"]), DataTypes.DataFrame),
        "df2": (pd.DataFrame(1 * (array1 > 0),
                             columns=["a", "b", "c"]), DataTypes.DataFrame),
        "s1": (sparse.csr_matrix(array1), DataTypes.SparseArray),
        "s2": (sparse.csr_matrix(1 * (array1 > 0)), DataTypes.SparseArray),
        # "dfs1":(pd.SparseDataFrame(sparse.csr_matrix(array1),columns=["A","B","C"]) , data_type.SparseDataFrame)
        # "dfs2":(pd.SparseDataFrame(sparse.csr_matrix(1*(array1 > 0)),columns=["a","b","c"]), data_type.SparseDataFrame)
    }

    if _IS_PD1:
        df1_cat = all_objects["df1"][0].copy()
        df1_cat["A"] = df1_cat["A"].astype("category")

        all_objects["df1_cat"] = (df1_cat, DataTypes.DataFrame)

    for name, (obj, expected_type) in all_objects.items():

        assert get_type(obj) == expected_type

        converted = convert_to_dataframe(obj)
        assert get_type(converted) == DataTypes.DataFrame

        converted = convert_to_array(obj)
        assert get_type(converted) == DataTypes.NumpyArray
        assert converted.dtype.kind in ("i", "f")

        converted = convert_to_sparsearray(obj)
        assert get_type(converted) == DataTypes.SparseArray

        # converted = convert_to_sparsedataframe(obj)
        # assert get_type(converted) == DataTypes.SparseDataFrame

    assert np.array_equal(convert_to_array(all_objects["df1"][0]),
                          all_objects["a1"][0])
    assert np.array_equal(convert_to_array(all_objects["s1"][0]),
                          all_objects["a1"][0])
Example #8
    def fit(self, X, y=None):

        if get_type(X) != DataTypes.DataFrame:
            raise TypeError("This transformer only works for DataFrame")

        if X.isnull().sum().sum() > 0:
            raise ValueError("This transformer doesn't handle null")

        self._nb_columns = X.shape[1]

        is_auto = isinstance(self.categories, str) and self.categories == "auto"

        if not is_auto:
            if len(self.categories) != X.shape[1]:
                raise TypeError("categories should be 'auto' or a list of the same size as X.shape[1]")

        all_mappings = []
        all_inv_mappings = []
        categories = []
        for j in range(X.shape[1]):

            current_category = None
            if not is_auto:  # not automatic
                try:
                    current_category = self.categories[j]  # try to find by position
                except (KeyError, IndexError):
                    pass

                if current_category is None:
                    try:
                        current_category = self.categories[X.columns[j]]  # try again with the name of the column
                    except KeyError:
                        pass
                    
            if current_category is None or is_auto or (isinstance(current_category, str) and current_category == "auto"):
                target_classes_j = np.sort(np.unique(X.iloc[:, j].values))
            else:

                target_classes_j = np.array(current_category)
                uy_j = set(X.iloc[:, j].values)
                if len(uy_j.difference(target_classes_j)) > 0:
                    raise ValueError("Found a modality that isn't in the provided categories, please check")

            integers = np.arange(len(target_classes_j)).astype(self.dtype)
            mapping = pd.Series(integers, index=target_classes_j)
            inv_mapping = pd.Series(target_classes_j, index=integers)
            
            all_mappings.append(mapping)
            all_inv_mappings.append(inv_mapping)
            categories.append(target_classes_j)


        self.categories_ = categories
        self._all_mapping = all_mappings
        self._all_inv_mapping = all_inv_mappings
        
        self._input_features = list(X.columns)
        
        return self
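
# Mini-example of the per-column mappings built in the fit above: sorted unique
# values get integer codes, and inv_mapping goes back from code to value.
#   target_classes_j = np.array(["a", "b", "c"])
#   mapping = pd.Series([0, 1, 2], index=["a", "b", "c"])      # value -> code
#   inv_mapping = pd.Series(["a", "b", "c"], index=[0, 1, 2])  # code -> value
#   mapping.loc[["c", "a"]].values                             # -> array([2, 0])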
Example #9
    def transform(self, X):

        self._check_is_fitted()

        Xtype = dsh.get_type(X)
        Xnbcols = dsh._nbcols(X)

        if self._expected_type != Xtype:
            raise ValueError(
                "I don't have the correct type as input, expected : %s, got : %s"
                % (self._expected_type, Xtype))

        if self._expected_nbcols != Xnbcols:
            raise ValueError(
                "I don't have the correct number of columns, expected : %d, got : %d"
                % (self._expected_nbcols, Xnbcols))

        if self._expected_type in (DataTypes.DataFrame,
                                   DataTypes.SparseDataFrame):
            if self._columns_to_use_is_integer:

                set_col = set(range(X.shape[1]))
                for l in self._final_columns_to_use:
                    if l not in set_col:
                        raise ValueError(
                            "Column %d isn't in the columns of the DataFrame" %
                            l)

                return X.iloc[:, self._final_columns_to_use]
            else:

                set_col = set(X.columns)
                for l in self._final_columns_to_use:
                    if l not in set_col:
                        raise ValueError(
                            "Column %s isn't in the columns of the DataFrame" %
                            l)

                return X.loc[:, self._final_columns_to_use]

        else:
            if self._columns_to_use_is_integer:

                set_col = set(range(X.shape[1]))
                for l in self._final_columns_to_use:
                    if l not in set_col:
                        raise ValueError(
                            "Column %d isn't in the columns of the DataFrame" %
                            l)

                return X[:, self._final_columns_to_use]

            else:
                raise ValueError(
                    "columns_to_use must be integers when type is array or sparseArray"
                )
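
# A minimal sketch of the dsh._nbcols helper used above (assumed behavior,
# not aikit's actual code): the number of columns, counting 1-d arrays and
# Series as a single column.
def _sketch_nbcols(X):
    shape = getattr(X, "shape", None)
    if shape is None:
        raise TypeError("can't compute the number of columns of %s" % type(X))
    return shape[1] if len(shape) > 1 else 1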
Example #10
    def fit(self, X, y=None):

        Xtype = get_type(X)
        if Xtype != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")

        Xcolumns = list(X.columns)

        self._columns_to_encode = Xcolumns  # Force to encode everything now

        X = get_rid_of_categories(X)

        # Verif:
        if not isinstance(self._columns_to_encode, list):
            raise TypeError("_columns_to_encode should be a list")

        for c in self._columns_to_encode:
            if c not in Xcolumns:
                raise ValueError("column %s isn't in the DataFrame" % c)

        self.variable_modality_mapping = {col: self.modalities_filter(X[col]) for col in self._columns_to_encode}

        # Rmk : if we don't want an encoding where modalities are numbered in increasing order, we could randomize the numbers here

        if self.encoding_type == "num":
            self._feature_names = self._columns_to_encode

            self.columns_mapping = {c: [c] for c in self._feature_names}

        elif self.encoding_type == "dummy":

            self.columns_mapping = {}

            index_column = {}
            self._variable_shift = {}
            cum_max = 0
            for col in self._columns_to_encode:

                self.columns_mapping[col] = []

                for i, (mod, ind) in enumerate(self.variable_modality_mapping[col].items()):
                    index_column[ind + cum_max] = col + "__" + str(mod)

                    self.columns_mapping[col].append(col + "__" + str(mod))

                self._variable_shift[col] = cum_max
                cum_max += i + 1

            self._dummy_size = cum_max
            self._dummy_feature_names = [index_column[i] for i in range(cum_max)]
            self._feature_names = self._dummy_feature_names

        else:
            raise NotImplementedError("I don't know this type of encoding: %s" % self.encoding_type)

        return self
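
# Mini-example of the "dummy" bookkeeping in the fit above: with two columns
# whose modality mappings are {"x": 0, "y": 1} and {"u": 0, "v": 1, "w": 2},
# the shifts come out as {"col1": 0, "col2": 2}, _dummy_size is 5, and the
# global feature names are:
#   ["col1__x", "col1__y", "col2__u", "col2__v", "col2__w"]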
Example #11
    def transform(self, X):

        if get_type(X) != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")
        X = get_rid_of_categories(X)

        result = self._transform_aggregat(X, self._target_aggregat, self._target_aggregat_global)
        assert result.shape[1] == len(self.get_feature_names())

        return result
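
# A minimal sketch of the get_rid_of_categories helper used in these examples
# (assumed behavior, not aikit's actual code): categorical columns are turned
# back into plain object columns so value-based mappings behave uniformly.
def _sketch_get_rid_of_categories(X):
    result = X.copy()
    for col in result.columns:
        if str(result[col].dtype) == "category":
            result[col] = result[col].astype("object")
    return result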
Example #12
    def transform(self, X):

        if get_type(X) != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")

        X = get_rid_of_categories(X)

        result = self._transform_to_encode(X)

        return result
Example #13
def gen_slice(ob, sl):
    """ generic column slicer """
    t = get_type(ob)
    if t in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        return ob.iloc[:, sl]
    elif t == DataTypes.SparseArray:
        if isinstance(ob, scipy.sparse.coo_matrix):
            ob = scipy.sparse.csc_matrix(ob.copy())
        return ob[:, sl]
    else:
        return ob[:, sl]
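
# Usage mini-example for gen_slice across container types:
#   df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
#   gen_slice(df, slice(1, None))         # DataFrame with columns b and c
#   gen_slice(df.values, slice(1, None))  # ndarray of shape (2, 2)
#   gen_slice(scipy.sparse.coo_matrix(df.values), slice(1, None))  # sparse, columns 1..2
# The COO input is converted to CSC first because COO matrices do not support
# column indexing.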
Example #14
    def transform(self, X):

        if get_type(X) != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")

        result = self._transform_to_encode(X)

        if len(self._columns_to_keep) > 0:
            result_other = X.loc[:, self._columns_to_keep]
            return generic_hstack([result_other, result])
        else:
            return result
Example #15
    def transform(self, X):

        self._check_is_fitted()

        Xtype = dsh.get_type(X)
        Xnbcols = dsh._nbcols(X)

        if self._expected_type != Xtype:
            raise ValueError(
                "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
            )

        if self.raise_if_shape_differs and self._expected_nbcols != Xnbcols:
            raise ValueError(
                "I don't have the correct number of columns, expected : %d, got : %d" % (self._expected_nbcols, Xnbcols)
            )
            # TODO : remove that check in some cases

        if self._return_data_as_inputed:
            return X  # So no copy is made

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            if self._columns_to_use_is_integer:

                set_col = set(range(X.shape[1]))
                for l in self._final_columns_to_use:
                    if l not in set_col:
                        raise ValueError("Column %d isn't in the column of the DataFrame" % l)

                return X.iloc[:, self._final_columns_to_use]
            else:

                set_col = set(X.columns)
                for l in self._final_columns_to_use:
                    if l not in set_col:
                        raise ValueError("Column %s isn't in the column of the DataFrame" % l)

                return X.loc[:, self._final_columns_to_use]

        else:
            if self._columns_to_use_is_integer:

                set_col = set(range(X.shape[1]))
                for l in self._final_columns_to_use:
                    if l not in set_col:
                        raise ValueError("Column %d isn't in the column of the DataFrame" % l)
                if isinstance(X, sps.coo_matrix):
                    return X.tocsc()[:, self._final_columns_to_use].tocoo()  # because COO matrix are not subscriptable
                else:
                    return X[:, self._final_columns_to_use]

            else:
                raise ValueError("columns_to_use must be integers when type if array or sparseArray")
Example #16
def test_generic_hstack_sparse_and_category(with_cat, force_sparse):
    # presumably parametrized over the two booleans with @pytest.mark.parametrize,
    # the decorators having been stripped by the snippet extraction

    df = pd.DataFrame({"a": 10 + np.arange(10), "b": np.random.randn(10)})
    if with_cat:
        df["a"] = df["a"].astype("category")

    xx = convert_to_sparsearray(np.random.randint(0, 1, size=(10, 2)))

    concat = generic_hstack((df, xx), max_number_of_cells_for_non_sparse=10 + (1 - force_sparse) * 1000000)

    assert concat.shape == (df.shape[0], df.shape[1] + xx.shape[1])
    if force_sparse:
        assert get_type(concat) == DataTypes.SparseArray

    elif with_cat:
        assert concat.dtypes["a"] == "category"
        assert isinstance(concat, pd.DataFrame)
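
# The max_number_of_cells_for_non_sparse knob above presumably compares the
# total cell count (rows * columns) of the result against the threshold: the
# concatenated output has 10 * 4 = 40 cells, so a threshold of 10 forces a
# sparse result while 10 + 1000000 keeps a dense DataFrame.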
Example #17
    def transform(self, X):
        if self._scaler is None or self.components_ is None:
            raise NotFittedError("You should fit the model first")

        if get_type(X) != self._input_type:
            raise TypeError(
                "X should be the same type as when fitted: %s, instead I got %s" % (self._input_type, type(X))
            )

        if X.shape[1] != self._nb_cols:
            raise ValueError(
                "X should have the same number of columns as when fitted (%d), instead I got %d"
                % (self._nb_cols, X.shape[1])
            )

        Xz = self._scaler.transform(X)
        Xzk_rot = np.dot(Xz, self.components_)

        return Xzk_rot
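
# Mini-example of the projection in the transform above: standardized data
# times a (n_features, n_components) loading matrix gives the rotated scores.
# rng, Xz and components are stand-ins, not values from the fitted model.
rng = np.random.RandomState(0)
Xz = rng.randn(5, 3)                           # already-scaled data (stand-in)
components = np.linalg.qr(rng.randn(3, 2))[0]  # orthonormal columns (stand-in)
scores = np.dot(Xz, components)
assert scores.shape == (5, 2)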
Example #18
def test_TruncatedSVDWrapper():

    df = get_sample_df(100, seed=123)
    cols = []
    for j in range(10):
        cols.append("num_col_%d" % j)
        df["num_col_%d" % j] = np.random.randn(df.shape[0])

    # 1) regular case : drop other columns
    svd = TruncatedSVDWrapper(n_components=5, columns_to_use=cols)
    res1 = svd.fit_transform(df)

    assert res1.shape == (100, 5)
    assert get_type(res1) == DataTypes.DataFrame
    assert list(res1.columns) == ["SVD__%d" % j for j in range(5)]
    assert not res1.isnull().any().any()
    assert svd.get_feature_names() == list(res1.columns)

    # 2) we keep the original columns as well
    svd = TruncatedSVDWrapper(n_components=5,
                              columns_to_use=cols,
                              drop_used_columns=False,
                              drop_unused_columns=False)
    res2 = svd.fit_transform(df)

    assert res2.shape == (100, 5 + df.shape[1])

    assert get_type(res2) == DataTypes.DataFrame
    assert list(
        res2.columns) == list(df.columns) + ["SVD__%d" % j for j in range(5)]
    assert svd.get_feature_names() == list(
        df.columns) + ["SVD__%d" % j for j in range(5)]
    assert not res2.isnull().any().any()
    assert (res2.loc[:, list(df.columns)] == df).all().all()

    # 3) we keep only the untouched columns
    svd = TruncatedSVDWrapper(n_components=5,
                              columns_to_use=cols,
                              drop_used_columns=True,
                              drop_unused_columns=False)
    res3 = svd.fit_transform(df)
    assert res3.shape == (100, 3 + 5)
    assert list(res3.columns) == ["float_col", "int_col", "text_col"
                                  ] + ["SVD__%d" % j for j in range(5)]
    assert svd.get_feature_names() == ["float_col", "int_col", "text_col"
                                       ] + ["SVD__%d" % j for j in range(5)]
    assert ((res3.loc[:, ["float_col", "int_col", "text_col"]] ==
             df.loc[:, ["float_col", "int_col", "text_col"]]).all().all())

    ###################################
    ###  same thing but with regex  ###
    ###################################

    # 1) Regular case : 'drop' other columns
    svd = TruncatedSVDWrapper(n_components=5,
                              columns_to_use=["num_col_"],
                              regex_match=True)
    res1 = svd.fit_transform(df)
    assert res1.shape == (100, 5)
    assert get_type(res1) == DataTypes.DataFrame
    assert list(res1.columns) == ["SVD__%d" % j for j in range(5)]
    assert not res1.isnull().any().any()
    assert svd.get_feature_names() == list(res1.columns)

    # 2) Keep original columns
    svd = TruncatedSVDWrapper(
        n_components=5,
        columns_to_use=["num_col_"],
        drop_used_columns=False,
        drop_unused_columns=False,
        regex_match=True,
    )
    res2 = svd.fit_transform(df)

    assert res2.shape == (100, 5 + df.shape[1])

    assert get_type(res2) == DataTypes.DataFrame
    assert list(
        res2.columns) == list(df.columns) + ["SVD__%d" % j for j in range(5)]
    assert svd.get_feature_names() == list(
        df.columns) + ["SVD__%d" % j for j in range(5)]
    assert not res2.isnull().any().any()
    assert (res2.loc[:, list(df.columns)] == df).all().all()

    # 3) Keep only the untouched columns
    svd = TruncatedSVDWrapper(n_components=5,
                              columns_to_use=["num_col_"],
                              drop_used_columns=True,
                              drop_unused_columns=False,
                              regex_match=True)
    res3 = svd.fit_transform(df)
    assert res3.shape == (100, 3 + 5)
    assert list(res3.columns) == ["float_col", "int_col", "text_col"
                                  ] + ["SVD__%d" % j for j in range(5)]
    assert svd.get_feature_names() == ["float_col", "int_col", "text_col"
                                       ] + ["SVD__%d" % j for j in range(5)]
    assert ((res3.loc[:, ["float_col", "int_col", "text_col"]] ==
             df.loc[:, ["float_col", "int_col", "text_col"]]).all().all())

    # Delta with numpy ###
    xx = df.values
    columns_to_use = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    svd = TruncatedSVDWrapper(n_components=5,
                              columns_to_use=columns_to_use,
                              drop_used_columns=True,
                              drop_unused_columns=False)
    res4 = svd.fit_transform(xx)
    assert list(res4.columns) == [0, 1, 2] + ["SVD__%d" % i for i in range(5)]
    assert svd.get_feature_names() == [0, 1, 2
                                       ] + ["SVD__%d" % i for i in range(5)]

    input_features = ["COL_%d" % i for i in range(xx.shape[1])]
    assert svd.get_feature_names(input_features) == [
        "COL_0", "COL_1", "COL_2"
    ] + ["SVD__%d" % i for i in range(5)]

    # Keep
    svd = TruncatedSVDWrapper(n_components=5,
                              columns_to_use=columns_to_use,
                              drop_used_columns=False,
                              drop_unused_columns=False)
    res2 = svd.fit_transform(xx)
    assert list(res2.columns) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
                                  ] + ["SVD__%d" % i for i in range(5)]
    assert svd.get_feature_names() == [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
    ] + ["SVD__%d" % i for i in range(5)]
    assert svd.get_feature_names(
        input_features) == input_features + ["SVD__%d" % i for i in range(5)]
Example #19
    def fit(self, X, y=None):
        self._expected_type = dsh.get_type(X)
        self._expected_nbcols = dsh._nbcols(X)

        ### Columns to use ###
        if self.columns_to_use is None:
            list_columns_to_use = None  # [i for i in range(self._expected_nbcols)]
        else:
            list_columns_to_use = self.convert_to_list(cols_list=self.columns_to_use)

        ### Columns to drop ###
        if self.columns_to_drop is None:
            list_columns_to_drop = None
        else:
            list_columns_to_drop = self.convert_to_list(cols_list=self.columns_to_drop)

        if list_columns_to_use is not None and len(list_columns_to_use) == 0:
            raise ValueError("columns_to_use is empty")

        ### What is the type of columns_to_use and columns_to_drop :
        if list_columns_to_use is not None:
            is_int = "int" in str(type(list_columns_to_use[0]))
        else:
            is_int = None

        if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
            is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
        else:
            is_int_to_drop = is_int

        ### Verify type:
        if is_int is not None and is_int_to_drop is not None:
            if is_int != is_int_to_drop:
                raise ValueError(
                    "Please be consistent between columns_to_use and columns_to_drop: both can be integers or strings, but they must have the same type"
                )

        if is_int is None and is_int_to_drop is None:
            is_int = True
            is_int_to_drop = True

        if is_int is None and is_int_to_drop is not None:
            is_int = is_int_to_drop
        if is_int_to_drop is None and is_int is not None:
            is_int_to_drop = is_int

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            if is_int:

                ##############################################
                ### Case 1 : DataFrame + Integer selection ###
                ##############################################

                if self.regex_match:
                    #######################
                    ## Case 1a : + Regex ##
                    #######################
                    raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

                cols_set = set(range(self._expected_nbcols))
                if list_columns_to_use is not None:

                    # Check all column are available

                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = list_columns_to_use
                    # final_columns_to_use = intersect( list_columns_to_use  , list(range(self._expected_nbcols)) )
                else:
                    final_columns_to_use = list(range(self._expected_nbcols))

                if list_columns_to_drop is not None:

                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

            else:

                #############################################
                ### Case 2 : DataFrame + String selection ###
                #############################################
                if self.regex_match:
                    #######################
                    ## Case 2a : + Regex ##
                    #######################
                    if list_columns_to_use is not None:
                        cols_that_match = []
                        for col in list(X.columns):
                            for r in list_columns_to_use:
                                if re.search(r, col) is not None:
                                    cols_that_match.append(col)
                                    break

                    if list_columns_to_drop is not None:
                        cols_that_match_drop = []
                        for col in list(X.columns):
                            for r in list_columns_to_drop:
                                if re.search(r, col) is not None:
                                    cols_that_match_drop.append(col)
                                    break

                    if list_columns_to_use is not None:
                        final_columns_to_use = cols_that_match
                        # final_columns_to_use = intersect(cols_that_match ,  list(X.columns)) # technically the intersect is useless
                    else:
                        final_columns_to_use = list(X.columns)

                    if list_columns_to_drop is not None:
                        final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

                else:
                    ########################
                    ## Case 2b : no Regex ##
                    ########################
                    cols_set = set(X.columns)
                    if list_columns_to_use is not None:

                        for l in list_columns_to_use:
                            if l not in cols_set:
                                raise ValueError("Column %s isn't in the columns of the DataFrame" % l)
                        final_columns_to_use = list_columns_to_use  # intersect(list_columns_to_use, list(X.columns))

                    else:
                        final_columns_to_use = list(X.columns)

                    if list_columns_to_drop is not None:

                        for l in list_columns_to_drop:
                            if l not in cols_set:
                                raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                        final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

        else:

            if is_int:
                ##########################################
                ### Case 3 : Array + Integer selection ###
                ##########################################
                if self.regex_match:

                    ########################
                    ## Case 3a  : + Regex ##
                    ########################
                    raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

                ########################
                ## Case 3b : no Regex ##
                ########################
                cols_set = set(range(self._expected_nbcols))
                if list_columns_to_use is not None:

                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
                else:
                    final_columns_to_use = list(range(self._expected_nbcols))

                if list_columns_to_drop is not None:

                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

            else:
                #########################################
                ### Case 4 : Array + String selection ###
                #########################################
                raise ValueError("columns_to_use must be integers when type is array or sparseArray")

        self._columns_to_use_is_integer = is_int
        self._final_columns_to_use = final_columns_to_use

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))

        ## TODO : here make a simplification into a slice when it is possible

        self._already_fitted = True

        return self
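
# Hedged sketches of the diff/intersect helpers used in the fit above
# (assumed to be order-preserving list operations, not aikit's actual code):
def _sketch_intersect(list1, list2):
    keep = set(list2)
    return [x for x in list1 if x in keep]

def _sketch_diff(list1, list2):
    drop = set(list2)
    return [x for x in list1 if x not in drop]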
Example #20
    def fit(self, X, y=None):
        self._expected_type = dsh.get_type(X)
        self._expected_nbcols = dsh._nbcols(X)

        ######################################
        ### Special case : keep everything ###
        ######################################
        self._return_data_as_inputed = False
        if isinstance(self.columns_to_use, str) and self.columns_to_use == "all" and self.columns_to_drop is None:
            self._already_fitted = True
            self._columns_to_use_is_integer = True
            self._final_columns_to_use = list(range(X.shape[1]))
            self._return_data_as_inputed = True
            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
                self._Xcolumns = list(X.columns)
            else:
                self._Xcolumns = list(range(self._expected_nbcols))

        ### Columns to use ###
        list_columns_to_use = self._get_list_of_columns(columns=self.columns_to_use, X=X, regex_match=self.regex_match)
        list_columns_to_drop = self._get_list_of_columns(
            columns=self.columns_to_drop, X=X, regex_match=self.regex_match
        )

        #################################
        ### Special case : no columns ###
        #################################
        if list_columns_to_use is not None and len(list_columns_to_use) == 0:
            # This means that there is nothing to do : no columns will be kept
            self._already_fitted = True
            self._columns_to_use_is_integer = True
            self._final_columns_to_use = []

            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
                self._Xcolumns = list(X.columns)
            else:
                self._Xcolumns = list(range(self._expected_nbcols))

            return self

        ### What is the type of columns_to_use and columns_to_drop :
        if list_columns_to_use is not None:
            is_int = "int" in str(type(list_columns_to_use[0]))
        else:
            is_int = None

        if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
            is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
        else:
            is_int_to_drop = is_int

        ### Verify type:
        if is_int is not None and is_int_to_drop is not None:
            if is_int != is_int_to_drop:
                raise ValueError(
                    "Please be consistent between 'columns_to_use' and 'columns_to_drop': both can be integers or strings, but they must have the same type"
                )

        if is_int is None and is_int_to_drop is None:
            is_int = True
            is_int_to_drop = True

        if is_int is None and is_int_to_drop is not None:
            is_int = is_int_to_drop
        if is_int_to_drop is None and is_int is not None:
            is_int_to_drop = is_int

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            if is_int:

                ##############################################
                ### Case 1 : DataFrame + Integer selection ###
                ##############################################

                if self.regex_match:
                    #######################
                    ## Case 1a : + Regex ##
                    #######################
                    raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

                cols_set = set(range(self._expected_nbcols))
                if list_columns_to_use is not None:

                    # Check all column are available

                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = list_columns_to_use
                    # final_columns_to_use = intersect( list_columns_to_use  , list(range(self._expected_nbcols)) )
                else:
                    final_columns_to_use = list(range(self._expected_nbcols))

                if list_columns_to_drop is not None:

                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)


            else:

                #############################################
                ### Case 2 : DataFrame + String selection ###
                #############################################
                if self.regex_match:
                    #######################
                    ## Case 2a : + Regex ##
                    #######################
                    if list_columns_to_use is not None:
                        cols_that_match = []
                        for col in list(X.columns):
                            for r in list_columns_to_use:
                                if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                    cols_that_match.append(col)
                                    break

                    if list_columns_to_drop is not None:
                        cols_that_match_drop = []
                        for col in list(X.columns):
                            for r in list_columns_to_drop:
                                if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                    cols_that_match_drop.append(col)
                                    break

                    if list_columns_to_use is not None:
                        final_columns_to_use = cols_that_match
                        # final_columns_to_use = intersect(cols_that_match ,  list(X.columns)) # technically the intersect is useless
                    else:
                        final_columns_to_use = list(X.columns)

                    if list_columns_to_drop is not None:
                        final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

                else:
                    ########################
                    ## Case 2b : no Regex ##
                    ########################
                    cols_set = set(X.columns)
                    if list_columns_to_use is not None:

                        for l in list_columns_to_use:
                            if l not in cols_set:
                                raise ValueError("Column %s isn't in the columns of the DataFrame" % l)
                        final_columns_to_use = list_columns_to_use  # intersect(list_columns_to_use, list(X.columns))

                    else:
                        final_columns_to_use = list(X.columns)

                    if list_columns_to_drop is not None:

                        for l in list_columns_to_drop:
                            if l not in cols_set:
                                raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                        final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)


        else:

            if is_int or is_int is None:
                ##########################################
                ### Case 3 : Array + Integer selection ###
                ##########################################
                if self.regex_match:

                    ########################
                    ## Case 3a  : + Regex ##
                    ########################
                    raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

                ########################
                ## Case 3b : no Regex ##
                ########################
                cols_set = set(range(self._expected_nbcols))
                if list_columns_to_use is not None:

                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
                else:
                    final_columns_to_use = list(range(self._expected_nbcols))

                if list_columns_to_drop is not None:

                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

            else:
                #########################################
                ### Case 4 : Array + String selection ###
                #########################################
                raise ValueError("columns_to_use must be integers when type is array or sparseArray")

        self._columns_to_use_is_integer = is_int
        self._final_columns_to_use = final_columns_to_use

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))

        ## TODO : here make a simplification into a slice when it is possible

        self._already_fitted = True

        return self
Example #21
    def fit(self, X, y):

        if y is None:
            raise ValueError("I need a value for 'y'")

        self._random_gen = check_random_state(self.random_state)

        Xtype = get_type(X)
        if Xtype != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")
        Xcolumns = list(X.columns)

        if not isinstance(y, pd.Series):
            sy = pd.Series(y)
        else:
            sy = y

        # Columns to encode and to keep

        self._columns_to_encode = list(X.columns)

        X = get_rid_of_categories(X)

        # Verif:
        if not isinstance(self._columns_to_encode, list):
            raise TypeError("_columns_to_encode should be a list")

        for c in self._columns_to_encode:
            if c not in Xcolumns:
                raise ValueError("column %s isn't in the DataFrame" % c)

        self._columns_to_keep = []

        # Verif:
        if not isinstance(self._columns_to_keep, list):
            raise TypeError("_columns_to_keep should be a list")

        for c in self._columns_to_keep:
            if c not in Xcolumns:
                raise ValueError("column %s isn't in the DataFrame" % c)

        # Target information
        if self.is_regression:

            self.target_classes = None  # No target classes for Regressor
            self.global_std = np.std(sy)

        else:
            # For classification I need to store it
            self.global_std = None
            self.target_classes = list(np.unique(sy))

            if len(self.target_classes) == 2:
                self.target_classes = self.target_classes[1:]

        # Columns on which we want None to be a special modality
        self._na_to_null = dict()
        for col in self._columns_to_encode:
            ii_null = X[col].isnull()
            self._na_to_null[col] = ii_null.sum() >= self.max_na_percentage * len(X)

        self._target_aggregat, self._target_aggregat_global = self._fit_aggregat(X, sy, noise_level=None)

        # Features names
        self._feature_names = [c for c in self._columns_to_keep]  # copy
        for col in self._columns_to_encode:
            self._feature_names += self._get_output_column_name(col=col, target_classes=self.target_classes)
            # self._feature_names += ["%s__target_%s" % (col,str(t)) for t in self.target_classes]

        return self
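
# Mini-example of the _na_to_null rule in the fit above: with
# max_na_percentage = 0.05 and 100 rows, a column gets its own "missing"
# modality as soon as it contains at least 5 nulls
# (ii_null.sum() >= 0.05 * 100).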
Example #22
def verif_encoder_with_data(klass, enc_kwargs, df1, df2, y1, fit_type,
                            additional_conversion_fun, extended_all_types):
    """ verification of the behavior of a transform on data """
    # Conversion of input into a different type
    df1_conv = convert_generic(df1, output_type=fit_type)
    df2_conv = convert_generic(df2, output_type=fit_type)

    if additional_conversion_fun is not None:
        df1_conv = additional_conversion_fun(df1_conv)
        df2_conv = additional_conversion_fun(df2_conv)

    if y1 is None:
        encoder = klass(**enc_kwargs)
        df1_transformed_a = encoder.fit_transform(
            df1_conv)  # 1st test without explicitly passing a y
        df2_transformed_a = encoder.transform(df2_conv)

    encoder_a = klass(**enc_kwargs)
    params_0 = encoder_a.get_params()

    df1_transformed_a = encoder_a.fit_transform(
        df1_conv, y=y1)  # Other test with a y (might be None or not)
    df2_transformed_a = encoder_a.transform(df2_conv)

    params_3 = encoder_a.get_params()
    # Rmk : might not be enforced on all transformers
    rec_assert_equal(params_0,
                     params_3)  # Verify that get_params didn't change after fit

    assert df1_transformed_a is not None  # verify that something was created
    assert df2_transformed_a is not None  # verify that something was created

    encoder_cloned = clone(encoder_a)  # Clone again ...

    assert_raise_not_fitted(
        encoder_cloned, df2_conv
    )  # ... and verify that the clone isn't fitted, even if encoder_a is fitted

    # Same thing but using ... fit and then... transformed
    encoder_b = klass(**enc_kwargs)
    encoder_b.fit(df1_conv, y=y1)
    df1_transformed_b = encoder_b.transform(df1_conv)
    df2_transformed_b = encoder_b.transform(df2_conv)

    assert df1_transformed_b is not None
    assert df2_transformed_b is not None

    # Same thing but using clone
    encoder_c = clone(encoder_a)
    df1_transformed_c = encoder_c.fit_transform(df1_conv, y=y1)
    df2_transformed_c = encoder_c.transform(df2_conv)

    # Same thing but using an empty class + set_params
    encoder_d = klass()
    encoder_d.set_params(**enc_kwargs)
    df1_transformed_d = encoder_d.fit_transform(df1_conv, y=y1)
    df2_transformed_d = encoder_d.transform(df2_conv)

    # Verify that an error is raised when passed the wrong number of columns
    assert_raise_value_error(encoder_a, gen_slice(df1_conv, slice(1, None)))
    assert_raise_value_error(encoder_b, gen_slice(df1_conv, slice(1, None)))
    assert_raise_value_error(encoder_c, gen_slice(df1_conv, slice(1, None)))
    assert_raise_value_error(encoder_d, gen_slice(df1_conv, slice(1, None)))

    for fit_type2, additional_conversion_fun2 in extended_all_types:

        if fit_type == fit_type2:
            continue

        df1_conv2 = convert_generic(df1_conv, output_type=fit_type2)

        # Verify that if I pass a different type than the one seen during fit, an error is raised

        assert_raise_value_error(encoder_a, df1_conv2)
        assert_raise_value_error(encoder_b, df1_conv2)
        assert_raise_value_error(encoder_c, df1_conv2)
        assert_raise_value_error(encoder_d, df1_conv2)

    # Verif shape
    # Nb of rows ...
    assert df1_transformed_a.shape[0] == df1_conv.shape[0]
    assert df1_transformed_b.shape[0] == df1_conv.shape[0]
    assert df1_transformed_c.shape[0] == df1_conv.shape[0]
    assert df1_transformed_d.shape[0] == df1_conv.shape[0]

    assert df2_transformed_a.shape[0] == df2_conv.shape[0]
    assert df2_transformed_b.shape[0] == df2_conv.shape[0]
    assert df2_transformed_c.shape[0] == df2_conv.shape[0]
    assert df2_transformed_d.shape[0] == df2_conv.shape[0]

    # Nb of columns : all the same
    assert df1_transformed_b.shape[1] == df1_transformed_a.shape[1]
    assert df1_transformed_c.shape[1] == df1_transformed_a.shape[1]
    assert df1_transformed_d.shape[1] == df1_transformed_a.shape[1]

    assert df2_transformed_a.shape[1] == df1_transformed_a.shape[1]
    assert df2_transformed_b.shape[1] == df1_transformed_a.shape[1]
    assert df2_transformed_c.shape[1] == df1_transformed_a.shape[1]
    assert df2_transformed_d.shape[1] == df1_transformed_a.shape[1]

    # Verif type
    assert get_type(df2_transformed_a) == get_type(df1_transformed_a)

    assert get_type(df1_transformed_b) == get_type(df1_transformed_a)
    assert get_type(df2_transformed_b) == get_type(df1_transformed_a)

    assert get_type(df1_transformed_c) == get_type(df1_transformed_a)
    assert get_type(df2_transformed_c) == get_type(df1_transformed_a)

    assert get_type(df1_transformed_d) == get_type(df1_transformed_a)
    assert get_type(df2_transformed_d) == get_type(df1_transformed_a)

    # if 'desired_output_type' is present, check that the output type matches
    if "desired_output_type" in enc_kwargs:
        assert get_type(df1_transformed_a) == enc_kwargs["desired_output_type"]

    if getattr(encoder_a, "desired_output_type", None) is not None:
        assert get_type(df1_transformed_a) == encoder_a.desired_output_type

    # Verif columns
    if get_type(df1_transformed_b) in (DataTypes.DataFrame,
                                       DataTypes.SparseDataFrame):
        assert list(df2_transformed_a.columns) == list(
            df1_transformed_a.columns)

        assert list(df1_transformed_b.columns) == list(
            df1_transformed_a.columns)
        assert list(df2_transformed_b.columns) == list(
            df1_transformed_a.columns)

        assert list(df1_transformed_c.columns) == list(
            df1_transformed_a.columns)
        assert list(df2_transformed_c.columns) == list(
            df1_transformed_a.columns)

        assert list(df2_transformed_d.columns) == list(
            df1_transformed_a.columns)
        assert list(df1_transformed_d.columns) == list(
            df1_transformed_a.columns)

        assert encoder_a.get_feature_names() == list(df1_transformed_a.columns)
        assert encoder_b.get_feature_names() == list(df1_transformed_a.columns)
        assert encoder_c.get_feature_names() == list(df1_transformed_a.columns)
        assert encoder_d.get_feature_names() == list(df1_transformed_a.columns)

    # Verif index
    if get_type(df1_transformed_b) in (DataTypes.DataFrame,
                                       DataTypes.SparseDataFrame):
        assert (df1_transformed_b.index == df1_transformed_a.index).all()
        assert (df2_transformed_b.index == df2_transformed_a.index).all()

        assert (df1_transformed_c.index == df1_transformed_a.index).all()
        assert (df2_transformed_c.index == df2_transformed_a.index).all()

        assert (df1_transformed_d.index == df1_transformed_a.index).all()
        assert (df2_transformed_d.index == df2_transformed_a.index).all()

        if fit_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            assert (df1_transformed_a.index == df1_conv.index).all()
            assert (df2_transformed_a.index == df2_conv.index).all()
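
# Hedged sketches of the assertion helpers used above (assumed behavior; the
# real helpers may check more exception types). NotFittedError is assumed to
# come from sklearn.exceptions.
def _sketch_assert_raise_not_fitted(encoder, df):
    with pytest.raises(NotFittedError):
        encoder.transform(df)

def _sketch_assert_raise_value_error(encoder, df):
    with pytest.raises(ValueError):
        encoder.transform(df)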
Example #23
def test__NumImputer():

    xx, xxd, xxs = get_sample_data(add_na=True)
    xxd.index = np.array([0, 1, 2, 3, 4, 10, 11, 12, 12, 14])

    # DataFrame entry
    for inp in (_NumImputer(), NumImputer(), _NumImputer(add_is_null=False),
                NumImputer(add_is_null=False)):
        xx_out = inp.fit_transform(xxd)
        assert (xx_out.index == xxd.index).all()
        assert pd.isnull(xxd.loc[0, "col1"])  # Verify that it is still null
        assert xx_out.isnull().sum().sum() == 0
        assert xx_out["col1"][0] == xxd.loc[~xxd["col1"].isnull(),
                                            "col1"].mean()

        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xxd)

        if inp.add_is_null:
            assert inp.get_feature_names() == [
                "col0", "col1", "col2", "col3", "col4", "col5", "col6",
                "col1_isnull"
            ]
            assert xx_out.shape[1] == 1 + xxd.shape[1]
            assert xx_out["col1_isnull"].iloc[0] == 1
            assert xx_out["col1_isnull"].iloc[5] == 1
            assert (xx_out["col1_isnull"].iloc[np.array(
                [1, 2, 3, 4, 6, 7, 8, 9])] == 0).all()

        else:
            assert xx_out.shape[1] == xxd.shape[1]
            assert inp.get_feature_names() == [
                "col0", "col1", "col2", "col3", "col4", "col5", "col6"
            ]

        inp = _NumImputer(add_is_null=False, allow_unseen_null=False)
        inp.fit(xxd)
        xxd2 = xxd.copy()
        xxd2.iloc[0, 3] = np.nan
        try:
            inp.transform(xxd2)
            raise AssertionError("Model should have failed its transformation")
        except ValueError:
            pass

    input_features = ["COL_%d" % i for i in range(xx.shape[1])]
    # Numpy array
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xx)
        assert pd.isnull(xx[0, 1])
        assert pd.isnull(xx_out).sum() == 0
        assert xx_out.shape[1] == 1 + xx.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xx)
        assert inp.get_feature_names() == [
            "0", "1", "2", "3", "4", "5", "6", "1_isnull"
        ]
        assert inp.get_feature_names(
            input_features) == input_features + ["COL_1_isnull"]
        assert xx_out[0, 7] == 1
        assert xx_out[5, 7] == 1
        assert (xx_out[np.array([1, 2, 3, 4, 6, 7, 8, 9]), 7] == 0).all()

    # Sparse Array
    for inp in (_NumImputer(), NumImputer()):
        for f in (sps.coo_matrix, sps.csc_matrix, sps.csr_matrix):
            xxsf = f(xxs.copy())
            xx_out = inp.fit_transform(xxsf)
            assert pd.isnull(xxs[0, 1])
            assert pd.isnull(xx_out.todense()).sum() == 0
            assert get_type(xx_out) == get_type(xxs)
            assert xx_out.shape[1] == 1 + xxs.shape[1]
            assert xx_out.shape[0] == xx.shape[0]
            assert inp.get_feature_names() == [
                "0", "1", "2", "3", "4", "5", "6", "1_isnull"
            ]
            assert inp.get_feature_names(
                input_features) == input_features + ["COL_1_isnull"]
            assert xx_out.todense()[0, 7] == 1
            assert xx_out.todense()[5, 7] == 1
            assert (xx_out.todense()[np.array([1, 2, 3, 4, 6, 7, 8, 9]),
                                     7] == 0).all()

    xx, xxd, xxs = get_sample_data(add_na=False)
    xxd.index = np.array([0, 1, 2, 3, 4, 10, 11, 12, 12, 14])

    # DataFrame entry
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xxd)
        assert (xx_out.index == xxd.index).all()
        assert xx_out.isnull().sum().sum() == 0
        assert xx_out.shape[1] == xxd.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xxd)
        assert inp.get_feature_names() == [
            "col0", "col1", "col2", "col3", "col4", "col5", "col6"
        ]

    # Numpy array
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xx)
        assert pd.isnull(xx_out).sum() == 0
        assert xx_out.shape[1] == xx.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xx)
        assert inp.get_feature_names() == ["0", "1", "2", "3", "4", "5", "6"]
        assert inp.get_feature_names(
            input_features=input_features) == input_features

    # Sparse Array
    for inp in (_NumImputer(), NumImputer()):
        for f in (sps.coo_matrix, sps.csc_matrix, sps.csr_matrix):
            xxs_f = f(xxs.copy())
            xx_out = inp.fit_transform(xxs_f)
            assert pd.isnull(xx_out.todense()).sum() == 0
            assert get_type(xx_out) == get_type(xxs)
            assert xx_out.shape[1] == xxs.shape[1]
            assert xx_out.shape[0] == xx.shape[0]
            assert inp.get_feature_names() == [
                "0", "1", "2", "3", "4", "5", "6"
            ]
            assert inp.get_feature_names(
                input_features=input_features) == input_features
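A minimal usage sketch, not part of the scraped test above: it assumes NumImputer exposes the same add_is_null flag as _NumImputer, and that np/pd are imported as in the tests. With add_is_null=True, NaNs are filled and one "<col>_isnull" 0/1 indicator is appended per column that contained nulls at fit time.

def example_num_imputer_usage():
    # toy DataFrame, an assumption for illustration only
    toy = pd.DataFrame({"col0": [1.0, np.nan, 3.0], "col1": [4.0, 5.0, 6.0]})
    imp = NumImputer(add_is_null=True)
    out = imp.fit_transform(toy)
    assert out.shape == (3, 3)                    # col0, col1, col0_isnull
    assert list(out["col0_isnull"]) == [0, 1, 0]  # 1 exactly where col0 was null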
Example #24
def test_PCAWrapper():
    df = get_sample_df(100, seed=123)
    cols = []
    for j in range(10):
        cols.append("num_col_%d" % j)
        df["num_col_%d" % j] = np.random.randn(df.shape[0])

    # 0) n_components > n_features
    pca = PCAWrapper(n_components=15, columns_to_use=cols)
    res0 = pca.fit_transform(df)

    assert res0.shape == (100, len(cols) - 1)
    assert get_type(res0) == DataTypes.DataFrame
    assert list(res0.columns) == ["PCA__%d" % j for j in range(len(cols) - 1)]
    assert not res0.isnull().any().any()
    assert pca.get_feature_names() == list(res0.columns)

    # 1) regular case : drop other columns
    pca = PCAWrapper(n_components=5, columns_to_use=cols)
    res1 = pca.fit_transform(df)

    assert res1.shape == (100, 5)
    assert get_type(res1) == DataTypes.DataFrame
    assert list(res1.columns) == ["PCA__%d" % j for j in range(5)]
    assert not res1.isnull().any().any()
    assert pca.get_feature_names() == list(res1.columns)

    # 2) we keep the original columns as well
    pca = PCAWrapper(n_components=5,
                     columns_to_use=cols,
                     keep_other_columns="keep")
    res2 = pca.fit_transform(df)

    assert res2.shape == (100, 5 + df.shape[1])

    assert get_type(res2) == DataTypes.DataFrame
    assert list(
        res2.columns) == list(df.columns) + ["PCA__%d" % j for j in range(5)]
    assert pca.get_feature_names() == list(
        df.columns) + ["PCA__%d" % j for j in range(5)]
    assert not res2.isnull().any().any()
    assert (res2.loc[:, list(df.columns)] == df).all().all()

    # 3) Keep only the un-touch column
    pca = PCAWrapper(n_components=5,
                     columns_to_use=["num_col_"],
                     keep_other_columns="delta",
                     regex_match=True)
    res3 = pca.fit_transform(df)
    assert res3.shape == (100, 3 + 5)
    assert list(res3.columns) == ["float_col", "int_col", "text_col"
                                  ] + ["PCA__%d" % j for j in range(5)]
    assert pca.get_feature_names() == ["float_col", "int_col", "text_col"
                                       ] + ["PCA__%d" % j for j in range(5)]
    assert ((res3.loc[:, ["float_col", "int_col", "text_col"]] ==
             df.loc[:, ["float_col", "int_col", "text_col"]]).all().all())

    # Delta with numpy ###
    xx = df.values
    columns_to_use = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    pca = PCAWrapper(n_components=5,
                     columns_to_use=columns_to_use,
                     keep_other_columns="delta")
    res4 = pca.fit_transform(xx)
    assert list(res4.columns) == [0, 1, 2] + ["PCA__%d" % i for i in range(5)]
    assert pca.get_feature_names() == [0, 1, 2
                                       ] + ["PCA__%d" % i for i in range(5)]

    input_features = ["COL_%d" % i for i in range(xx.shape[1])]
    assert pca.get_feature_names(input_features) == [
        "COL_0", "COL_1", "COL_2"
    ] + ["PCA__%d" % i for i in range(5)]

    # Keep
    pca = PCAWrapper(n_components=5,
                     columns_to_use=columns_to_use,
                     keep_other_columns="keep")
    res2 = pca.fit_transform(xx)
    assert list(res2.columns) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
                                  ] + ["PCA__%d" % i for i in range(5)]
    assert pca.get_feature_names() == [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
    ] + ["PCA__%d" % i for i in range(5)]
    assert pca.get_feature_names(
        input_features) == input_features + ["PCA__%d" % i for i in range(5)]
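A compact recap of the keep_other_columns behaviours exercised above, as a sketch under the assumption that "drop" is the accepted name for the default behaviour of case 1; df and cols are as constructed at the top of test_PCAWrapper (13 columns in total, 10 of which feed the PCA).

def example_pca_wrapper_modes(df, cols):
    n_other = df.shape[1] - len(cols)  # columns not used by the PCA
    for mode, width in [("drop", 5), ("keep", 5 + df.shape[1]), ("delta", 5 + n_other)]:
        pca = PCAWrapper(n_components=5, columns_to_use=cols, keep_other_columns=mode)
        assert pca.fit_transform(df).shape[1] == width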
Example #25
def verif_model(df1, df2, y1, klass, model_kwargs, all_types, is_classifier):
    """ helper function that check (using asserts) a bunch a thing on a model klass
    
    Parameters
    ----------
    
    df1 : array like
        data on which model will be trained
    
    df2 : array like
        data on which model will be tested
        
    klass : type
        type of the model to test
        
    model_kwargs : dict
        kwargs to be passed to klass to create a model
        
    all_types : list of type
        list of input type to test the models on
        
    is_classifier : boolean
        if True the model is a Classifier otherwise a Regressor
   
    
    """

    if not isinstance(all_types, (list, tuple)):
        all_types = (all_types, )

    model0 = klass(**model_kwargs)  # Create an object ...
    model1 = clone(model0)  # then try to clone it

    model2 = klass()  # Create an empty object and then set its params
    model2.set_params(**model_kwargs)

    # Verify the types are identical
    assert type(model0) == type(model1)
    assert type(model0) == type(model2)

    assert hasattr(klass, "fit")
    assert hasattr(klass, "predict")
    if is_classifier:
        assert hasattr(klass, "predict_proba")

    # Verify get_params are identical
    params_0 = model0.get_params()
    params_1 = model1.get_params()
    params_2 = model2.get_params()

    rec_assert_equal(params_0, params_1)
    rec_assert_equal(params_0, params_2)

    rec_assert_equal({k: v
                      for k, v in params_0.items() if k in model_kwargs},
                     model_kwargs)
    rec_assert_equal({k: v
                      for k, v in params_1.items() if k in model_kwargs},
                     model_kwargs)
    rec_assert_equal({k: v
                      for k, v in params_2.items() if k in model_kwargs},
                     model_kwargs)

    extended_all_types = extend_all_type(all_types)

    if is_classifier:
        yclasses = list(set(np.unique(y1)))
        nb_classes = len(yclasses)

    for fit_type, additional_conversion_fun in extended_all_types:

        # Convert inputs into several type ..
        df1_conv = convert_generic(df1, output_type=fit_type)
        df2_conv = convert_generic(df2, output_type=fit_type)

        if additional_conversion_fun is not None:
            df1_conv = additional_conversion_fun(df1_conv)
            df2_conv = additional_conversion_fun(df2_conv)

        model_a = klass(**model_kwargs)
        model_a.fit(df1_conv, y=y1)

        y1_hat_a = model_a.predict(df1_conv)  # predictions on the training data (works whether or not y was given)
        y2_hat_a = model_a.predict(df2_conv)  # ... and on the test data

        if is_classifier:
            y1_hatproba_a = model_a.predict_proba(df1_conv)
            y2_hatproba_a = model_a.predict_proba(df2_conv)

        params_3 = model_a.get_params()  # verify that get_params didn't change after fit
        # Remark: this might not be enforced on all transformers

        rec_assert_equal(params_0, params_3)

        assert y1_hat_a is not None  # verify that something was created
        assert y2_hat_a is not None  # verify that something was created

        model_cloned = clone(model_a)  # Clone again ...
        assert_raise_not_fitted(
            model_cloned, df2_conv
        )  # ... and verify that the clone isn't fitted, even if model_a is fitted

        # Same thing but using clone
        model_b = clone(model_a)
        model_b.fit(df1_conv, y=y1)

        y1_hat_b = model_b.predict(df1_conv)
        y2_hat_b = model_b.predict(df2_conv)
        if is_classifier:
            y1_hatproba_b = model_b.predict_proba(df1_conv)
            y2_hatproba_b = model_b.predict_proba(df2_conv)

        # Same thing but with set_params
        model_c = klass()
        model_c.set_params(**model_kwargs)
        model_c.fit(df1_conv, y=y1)

        y1_hat_c = model_c.predict(df1_conv)
        y2_hat_c = model_c.predict(df2_conv)

        if is_classifier:
            y1_hatproba_c = model_c.predict_proba(df1_conv)
            y2_hatproba_c = model_c.predict_proba(df2_conv)

        # check error when call with too few columns
        assert_raise_value_error(model_a, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(model_b, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(model_c, gen_slice(df1_conv, slice(1, None)))

        assert y1_hat_a.shape[0] == df1_conv.shape[0]
        assert y1_hat_b.shape[0] == df1_conv.shape[0]
        assert y1_hat_c.shape[0] == df1_conv.shape[0]

        assert y2_hat_a.shape[0] == df2_conv.shape[0]
        assert y2_hat_b.shape[0] == df2_conv.shape[0]
        assert y2_hat_c.shape[0] == df2_conv.shape[0]

        assert y1_hat_a.ndim == y1.ndim
        assert y1_hat_b.ndim == y1.ndim
        assert y1_hat_c.ndim == y1.ndim

        assert y2_hat_a.ndim == y1.ndim
        assert y2_hat_b.ndim == y1.ndim
        assert y2_hat_c.ndim == y1.ndim

        if is_classifier:
            assert y1_hatproba_a.ndim == 2
            assert y1_hatproba_b.ndim == 2
            assert y1_hatproba_c.ndim == 2
            assert y2_hatproba_a.ndim == 2
            assert y2_hatproba_b.ndim == 2
            assert y2_hatproba_c.ndim == 2

            assert y1_hatproba_a.shape[1] == nb_classes
            assert y1_hatproba_b.shape[1] == nb_classes
            assert y1_hatproba_c.shape[1] == nb_classes

            assert y2_hatproba_a.shape[1] == nb_classes
            assert y2_hatproba_b.shape[1] == nb_classes
            assert y2_hatproba_c.shape[1] == nb_classes

            assert hasattr(model_a, "classes_")
            assert hasattr(model_b, "classes_")
            assert hasattr(model_c, "classes_")

            assert list(set(model_a.classes_)) == list(set(yclasses))
            assert list(set(model_b.classes_)) == list(set(yclasses))
            assert list(set(model_c.classes_)) == list(set(yclasses))

            for f in (check_all_numerical, check_between_01, check_no_null):

                f(y1_hatproba_a)
                f(y1_hatproba_b)
                f(y1_hatproba_c)

                f(y2_hatproba_a)
                f(y2_hatproba_b)
                f(y2_hatproba_c)

        # Verify the output types are consistent
        assert get_type(y1_hat_b) == get_type(y1_hat_a)
        assert get_type(y1_hat_c) == get_type(y1_hat_a)
        assert get_type(y2_hat_a) == get_type(y1_hat_a)
        assert get_type(y2_hat_b) == get_type(y1_hat_a)
        assert get_type(y2_hat_c) == get_type(y1_hat_a)
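A hypothetical invocation of verif_model; the model class, kwargs and data names below are placeholders, not from this file:

# verif_model(
#     df1=df_train, df2=df_test, y1=y_train,      # placeholder data and target
#     klass=SomeClassifierWrapper,                # placeholder model class
#     model_kwargs={"n_estimators": 10},          # placeholder params
#     all_types=(DataTypes.DataFrame, DataTypes.NumpyArray),
#     is_classifier=True,
# )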
Example #26
    def _fit_transform(self, X, y, is_fit, is_transform, fit_params=None):
        """ internal method that handle the fit and the transform """

        if fit_params is None:
            fit_params = {}

        if is_fit:
            if isinstance(self.columns_to_use, str) and self.columns_to_use == "auto":
                columns = self._get_default_columns_to_use(X, y)
                self.selector = ColumnsSelector(columns_to_use=columns)
            else:
                self.selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

        if hasattr(X, "shape"):
            if X.shape[0] == 0:
                raise ValueError("the X object has 0 rows")

        Xindex = dsh._get_index(X)  # if X has an index retrieve it
        #        if self.columns_to_use is not None:
        if is_fit:
            Xsubset = self.selector.fit_transform(X)
        else:
            Xsubset = self.selector.transform(X)
        # TODO (maybe): here allow a preprocessing pipeline
        #        if self.has_preprocessing:
        #            if is_fit:
        #                self.preprocessing = self._get_preprocessing()
        #                Xsubset = self.preprocessing.fit_transform(Xsubset)
        #            else:
        #                Xsubset = self.preprocessing.transform(Xsubset)

        # Store columns and shape BEFORE any modification
        if self.selector is not None:
            Xsubset_columns = self.selector.get_feature_names()
        else:
            raise NotImplementedError("should not go there anymore")
            # Xsubset_columns = getattr(Xsubset, "columns", None)

        Xsubset_shape = getattr(Xsubset, "shape", None)
        # TODO: somehow make use of
        # https://github.com/scikit-learn/scikit-learn/issues/6425 here

        if is_fit:
            self._expected_type = dsh.get_type(Xsubset)
            self._expected_nbcols = dsh._nbcols(Xsubset)
            self._expected_columns = dsh._get_columns(Xsubset)

        else:
            Xtype = dsh.get_type(Xsubset)
            if Xtype != self._expected_type:
                raise ValueError(
                    "wrong input type, expected: %s, got: %s" % (self._expected_type, Xtype)
                )

            nbcols = dsh._nbcols(Xsubset)
            if nbcols != self._expected_nbcols:
                raise ValueError(
                    "wrong number of input columns, expected: %d, got: %d"
                    % (self._expected_nbcols, nbcols)
                )

            columns = dsh._get_columns(Xsubset)
            expected_columns = getattr(self, "_expected_columns", None)  # to allow pickle compatibility

            if expected_columns is not None and columns is not None and columns != self._expected_columns:
                raise ValueError("I don't have the correct names of columns")

        if self.accepted_input_types is not None and self._expected_type not in self.accepted_input_types:
            Xsubset = dsh.convert_generic(
                Xsubset, mapped_type=self._expected_type, output_type=self.accepted_input_types[0]
            )

        if is_fit:
            self._verif_params()
            self._empty_data = False
            s = getattr(Xsubset, "shape", None)
            if s is not None and len(s) > 1 and s[1] == 0:
                self._empty_data = True

        if self.all_columns_at_once or self._empty_data:

            if is_fit:
                self._model = self._get_model(Xsubset, y)

            ##############################################
            ### Apply the model on ALL columns at ONCE ###
            ##############################################

            if self.work_on_one_column_only:
                Xsubset = dsh.make1dimension(Xsubset)  # will generate an error if 2 dimensions
            else:
                Xsubset = dsh.make2dimensions(Xsubset)

            # Call to underlying model
            Xres = None
            if is_fit and is_transform:
                ##############################
                ###  fit_transform method  ###
                ##############################
                # test if the data to transform actually has some columns

                if not self._empty_data:
                    # normal case
                    Xres = self._model.fit_transform(Xsubset, y, **fit_params)
                else:
                    # It means there is no columns to transform
                    Xres = Xsubset  # don't do anything

            elif is_fit and not is_transform:
                ####################
                ###  fit method  ###
                ####################
                if self.must_transform_to_get_features_name:
                    Xres = self._model.fit_transform(Xsubset, y, **fit_params)
                else:
                    self._model.fit(Xsubset, y, **fit_params)
            else:
                ####################
                ###  transform   ###
                ####################
                if not self._empty_data:
                    Xres = self._model.transform(Xsubset)
                else:
                    Xres = Xsubset

            if is_fit:
                self._columns_informations = {
                    "output_columns": getattr(Xres, "columns", None),  # names of the transformed columns, if any
                    "output_shape": getattr(Xres, "shape", None),  # shape of the transformed result, if any
                    "input_columns": Xsubset_columns,  # names of the input columns
                    "input_shape": Xsubset_shape,  # shape of the input data
                }

                self._feature_names_for_transform = self.try_to_find_feature_names_all_at_once(
                    output_columns=self._columns_informations["output_columns"],
                    output_shape=self._columns_informations["output_shape"],
                    input_columns=self._columns_informations["input_columns"],
                    input_shape=self._columns_informations["input_shape"],
                )

                # self.kept_features_names = None  # for now

            if is_transform:
                Xres = dsh.convert_generic(Xres, output_type=self.desired_output_type)
                Xres = dsh._set_index(Xres, Xindex)

        else:
            ########################################
            ### Apply the model COLUMN BY COLUMN ###
            ########################################
            if is_fit:
                self._models = []

            if is_transform or self.must_transform_to_get_features_name:
                all_Xres = []
            else:
                all_Xres = None

            Xsubset = dsh.make2dimensions(Xsubset)

            for j in range(self._expected_nbcols):

                if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame, DataTypes.Serie):
                    Xsubset_j = Xsubset.iloc[:, j]
                else:
                    Xsubset_j = Xsubset[:, j]

                if is_fit:
                    sub_model = self._get_model(Xsubset, y)
                    self._models.append(sub_model)
                else:
                    sub_model = self._models[j]

                if not self.work_on_one_column_only:
                    Xsubset_j = dsh.make2dimensions(Xsubset_j)

                if is_fit and is_transform:
                    # fit_transform method
                    Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)

                    all_Xres.append(Xres_j)

                elif is_fit and not is_transform:
                    # fit method
                    if self.must_transform_to_get_features_name:
                        Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                        all_Xres.append(Xres_j)

                    else:
                        sub_model.fit(Xsubset_j, y, **fit_params)

                elif is_transform:
                    # transform method

                    Xres_j = sub_model.transform(Xsubset_j)
                    all_Xres.append(Xres_j)

            if is_fit:

                self._columns_informations = {
                    "all_output_columns": None
                    if all_Xres is None
                    else [getattr(Xres, "columns", None) for Xres in all_Xres],
                    "all_output_shape": None
                    if all_Xres is None
                    else [getattr(Xres, "shape", None) for Xres in all_Xres],
                    "input_columns": Xsubset_columns,  # name of input columns
                    "input_shape": Xsubset_shape,  # shape of input data
                }

                self._feature_names_for_transform = list(
                    self.try_to_find_feature_names_separate(
                        all_output_columns=self._columns_informations["all_output_columns"],
                        all_output_shape=self._columns_informations["all_output_shape"],
                        input_columns=self._columns_informations["input_columns"],
                        input_shape=self._columns_informations["input_shape"],
                    )
                )

                # self.kept_features_names = None  # for now

            if is_transform:
                Xres = dsh.generic_hstack(all_Xres, output_type=self.desired_output_type)
                Xres = dsh._set_index(Xres, Xindex)

        if is_transform:
            if self._feature_names_for_transform is not None:
                ### NOTE: this does not work in transform !!!
                Xres = dsh._set_columns(Xres, self._feature_names_for_transform)

        if is_transform:
            return Xres
        else:
            return self
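A plausible sketch of how the public sklearn-style entry points could delegate to _fit_transform; the enclosing class is not shown in this example, so these wrappers are an assumption, written to be consistent with the signature and return values above (self when only fitting, Xres when transforming).

    def fit(self, X, y=None, **fit_params):
        # fit only: _fit_transform returns self in this branch
        return self._fit_transform(X, y, is_fit=True, is_transform=False, fit_params=fit_params)

    def fit_transform(self, X, y=None, **fit_params):
        # fit and transform in one pass: returns the transformed Xres
        return self._fit_transform(X, y, is_fit=True, is_transform=True, fit_params=fit_params)

    def transform(self, X):
        # transform only: requires a previous fit
        return self._fit_transform(X, y=None, is_fit=False, is_transform=True)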
Example #27
    def fit(self, X, y=None, **fit_params):

        # * save type
        self._input_type = get_type(X)
        self._nb_cols = X.shape[1]

        NF = X.shape[1]

        # * scale features (no centering for sparse inputs, which don't support it)
        self._scaler = StandardScaler(
            with_mean=self._input_type not in (DataTypes.SparseArray, DataTypes.SparseDataFrame)
        )
        Xz = self._scaler.fit_transform(X)

        # * random generator
        random_state = check_random_state(self.random_state)

        # Number of groups (interpreted as a fraction of the features if < 1)
        if self.max_nb_groups < 1:
            high = int(NF * self.max_nb_groups)
        else:
            high = min(int(self.max_nb_groups), NF - 1)

        self._nb_of_groups = random_state.randint(low=1, high=high)

        # Maximum group size (interpreted as a fraction of the features if < 1)
        if self.max_group_size < 1:
            high_f = max(int(NF * self.max_group_size), 5)
        else:
            high_f = min(int(self.max_group_size), NF)

        FK = np.zeros((self._nb_of_groups, NF))
        for k in range(self._nb_of_groups):
            num_features = random_state.randint(1, high_f)
            rp = random_state.permutation(NF)  # use the seeded generator for reproducibility
            FK[k, rp[0:num_features]] = 1

        components_ = np.zeros((NF, NF), dtype=Xz.dtype)

        n_samples = Xz.shape[0]

        for k in range(self._nb_of_groups):
            pos = np.nonzero(FK[k, :])[0]

            Xzk = Xz[:, pos]
            # TODO : subsample of class

            pca = PCA(n_components=len(pos), whiten=False, copy=True, random_state=self.random_state)

            if self.bootstrap:
                while True:
                    # probability that a given index appears in a bootstrap sample:
                    # 1 - (1 - 1/N)**N -> 1 - 1/e ~= 0.632 as N -> inf
                    ii_to_keep = random_state.rand(n_samples) <= 0.632
                    index_to_keep = np.where(ii_to_keep)[0]
                    if len(index_to_keep) > 0:
                        # guard against the (very unlikely) case where nothing is selected
                        break

                Xzk_bootstrap = Xzk[index_to_keep, :]
            else:
                Xzk_bootstrap = Xzk

            pca.fit(Xzk_bootstrap)

            rot = pca.components_.T
            assert rot.shape[0] == len(pos)
            if rot.shape[1] < len(pos):
                rot = np.hstack((rot, np.zeros((rot.shape[0], len(pos) - rot.shape[1]), dtype=rot.dtype)))

            assert rot.shape[0] == rot.shape[1]
            assert rot.shape[0] == len(pos)

            components_[pos.reshape(len(pos), 1), pos.reshape(1, len(pos))] = rot

        features_to_keep = np.any(components_ != 0, axis=0)
        self.components_ = components_[:, features_to_keep].astype(Xz.dtype)

        self._feature_names = ["RPCA_%d" % i for i in range(self.components_.shape[1])]

        return self
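A minimal sketch of the matching transform step, assuming only the attributes fitted above (self._scaler and self.components_); it is not part of the original example.

    def transform(self, X):
        Xz = self._scaler.transform(X)  # apply the same scaling as during fit
        # project onto the block-diagonal rotation built in fit;
        # one output column per kept feature (see self._feature_names)
        return Xz @ self.components_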