def test_generic_hstack():
    """Check generic_hstack on DataFrames: output type, shape, columns, index and errors."""
    numbers = list(range(10))
    labels = ["aaaa", "bbbbb", "cccc"] * 3 + ["ezzzz"]
    expected_columns = ["a", "b", "c", "d"]

    # Two frames sharing the default index
    left = pd.DataFrame({"a": numbers, "b": labels})
    right = pd.DataFrame({"c": numbers, "d": labels})
    stacked = generic_hstack((left, right))
    assert get_type(stacked) == DataTypes.DataFrame
    assert stacked.shape == (10, 4)
    assert list(stacked.columns) == expected_columns

    # Second frame carries its own (odd numbers) index
    odd_index = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
    left = pd.DataFrame({"a": numbers, "b": labels})
    right = pd.DataFrame({"c": numbers, "d": labels}, index=odd_index)
    stacked = generic_hstack((left, right))
    assert np.array_equal(stacked.index.values, np.array(odd_index))
    assert get_type(stacked) == DataTypes.DataFrame
    assert stacked.shape == (10, 4)
    assert list(stacked.columns) == expected_columns

    # Explicitly requested numpy output
    stacked = generic_hstack((left, right), output_type=DataTypes.NumpyArray)
    assert get_type(stacked) == DataTypes.NumpyArray
    assert stacked.shape == (10, 4)

    # Mismatching number of rows must be rejected, whatever the container types
    short_left, short_right = left.head(3), right.head(4)
    with pytest.raises(ValueError):
        generic_hstack((short_left, short_right))

    with pytest.raises(ValueError):
        generic_hstack((short_left.values, short_right))

    with pytest.raises(ValueError):
        generic_hstack((short_left.values, short_right.values))
def test_conversion():
    """Every supported container converts to DataFrame, numpy array and sparse array."""
    np.random.seed(123)
    array1 = np.random.randn(10, 3)

    def positive_mask():
        # fresh 0/1 integer array on each call, so the objects below share no buffer
        return 1 * (array1 > 0)

    all_objects = {}
    all_objects["a1"] = (array1, DataTypes.NumpyArray)
    all_objects["a2"] = (positive_mask(), DataTypes.NumpyArray)
    all_objects["a3"] = (array1[:, 1], DataTypes.NumpyArray)
    all_objects["df1"] = (pd.DataFrame(array1, columns=["A", "B", "C"]), DataTypes.DataFrame)
    all_objects["df2"] = (pd.DataFrame(positive_mask(), columns=["a", "b", "c"]), DataTypes.DataFrame)
    all_objects["s1"] = (sparse.csr_matrix(array1), DataTypes.SparseArray)
    all_objects["s2"] = (sparse.csr_matrix(positive_mask()), DataTypes.SparseArray)
    # NOTE: SparseDataFrame inputs ("dfs1" / "dfs2") and the conversion to
    # SparseDataFrame are deliberately left out of this test.

    for obj, expected_type in all_objects.values():
        assert get_type(obj) == expected_type

        as_frame = convert_to_dataframe(obj)
        assert get_type(as_frame) == DataTypes.DataFrame

        as_array = convert_to_array(obj)
        assert get_type(as_array) == DataTypes.NumpyArray

        as_sparse = convert_to_sparsearray(obj)
        assert get_type(as_sparse) == DataTypes.SparseArray

    # Round trips must give back the original values
    reference = all_objects["a1"][0]
    assert np.array_equal(convert_to_array(all_objects["df1"][0]), reference)
    assert np.array_equal(convert_to_array(all_objects["s1"][0]), reference)
def test_get_type():
    """Check that get_type recognises each supported container.

    The SparseDataFrame case only runs when the installed pandas still
    provides ``pd.SparseDataFrame`` (the class no longer exists in
    pandas >= 1.0), so the rest of the test keeps running on recent versions.
    """
    df = pd.DataFrame({"a": np.arange(10)})

    assert get_type(df) == DataTypes.DataFrame
    assert get_type(df["a"]) == DataTypes.Serie
    assert get_type(df.values) == DataTypes.NumpyArray
    assert get_type(sparse.coo_matrix(df.values)) == DataTypes.SparseArray

    if hasattr(pd, "SparseDataFrame"):
        dfs = pd.SparseDataFrame({"a": [0, 0, 0, 1, 1]})
        assert get_type(dfs) == DataTypes.SparseDataFrame
def fit_transform(self, X, y=None, **fit_params):
    """Record debugging information about ``X`` and pass it through.

    Parameters
    ----------
    X : object
        Input data. When ``self.column_prefix`` is set and ``X`` exposes a
        ``columns`` attribute, a copy with prefixed column names is returned;
        otherwise ``X`` itself is returned untouched.
    y : ignored
    **fit_params
        Stored as-is on ``self.fit_params`` (only there to help testing).

    Returns
    -------
    The (possibly renamed) data.
    """
    if self.verbose:
        print("within 'DebugPassThrough' fit_transform named %s" % self.name)
        if fit_params:
            print("fit_params given")
            print(fit_params)

    if self.debug:
        self._expected_type = dsh.get_type(X)
        self._expected_nbcols = dsh._nbcols(X)
        if self._expected_type in (dsh.DataTypes.DataFrame, dsh.DataTypes.SparseDataFrame):
            self._expected_columns = list(X.columns)

    self.fit_params = fit_params  # stored, just to help test

    Xres = X
    # Only objects with named columns can be prefixed; anything else (e.g. a
    # numpy array) goes through unchanged, which matches what `fit` records.
    if self.column_prefix is not None and hasattr(X, "columns"):
        Xres = X.copy()
        Xres.columns = [self.column_prefix + "_" + c for c in Xres.columns]

    self._features = getattr(Xres, "columns", None)
    if self._features is not None:
        self._features = list(self._features)

    return Xres
def transform(self, X):
    """Encode every column of ``X`` as cumulative ("greater than") indicators.

    For column ``j`` each value is first mapped to its integer code through
    ``self._all_mapping[j]``; a column with ``n`` known categories then
    produces ``n - 1`` output columns, where output column ``c`` holds
    ``code > c`` cast to ``self.dtype``.

    Parameters
    ----------
    X : pd.DataFrame
        Must have the same number of columns as seen during ``fit``.

    Returns
    -------
    pd.DataFrame
        Indexed like ``X``, with columns named by ``self.get_feature_names()``.

    Raises
    ------
    TypeError
        If ``X`` is not a DataFrame.
    ValueError
        If the number of columns differs from the one recorded at fit time.
    """
    check_is_fitted(self, "_all_mapping")

    if get_type(X) != DataTypes.DataFrame:
        raise TypeError("This transformer only works for DataFrame")

    if X.shape[1] != self._nb_columns:
        raise ValueError("X doesn't have the correct number of columns")

    all_res = []
    for j in range(X.shape[1]):
        mapping_j = self._all_mapping[j]

        # integer code of every row for that column
        index_line = mapping_j.loc[X.iloc[:, j]].values
        # thresholds 0 .. n_categories - 2
        index_col = np.arange(len(mapping_j) - 1, dtype=np.int32)

        # both operands must be 1-D for the broadcast below to give (n_rows, n_categories - 1)
        assert index_line.ndim == 1
        assert index_col.ndim == 1

        res_j = (index_line[:, np.newaxis] > index_col[np.newaxis, :]).astype(self.dtype)
        all_res.append(res_j)

    result = np.concatenate(all_res, axis=1)
    return pd.DataFrame(result, columns=self.get_feature_names(), index=X.index)
def fit(self, X, y=None, **fit_params):
    """Record what the input looks like; nothing is learnt from the data.

    Stores ``fit_params`` on the instance and the output feature names in
    ``self._features`` (``None`` when ``X`` has no ``columns`` attribute).
    When ``self.debug`` is set, the input type, its number of columns and
    (for DataFrame-like inputs) its column names are remembered as well.

    Returns ``self``.
    """
    if self.verbose:
        print("within 'DebugPassThrough' fit named %s" % self.name)
        if fit_params:
            print("fit_params given")
            print(fit_params)

    if self.debug:
        self._expected_type = dsh.get_type(X)
        self._expected_nbcols = dsh._nbcols(X)
        frame_like = (dsh.DataTypes.DataFrame, dsh.DataTypes.SparseDataFrame)
        if self._expected_type in frame_like:
            self._expected_columns = list(X.columns)

    # kept on the instance only so that tests can inspect it
    self.fit_params = fit_params

    if self.column_prefix is not None:
        if hasattr(X, "columns"):
            names = [self.column_prefix + "_" + c for c in X.columns]
        else:
            names = None
    else:
        raw_columns = getattr(X, "columns", None)
        names = None if raw_columns is None else list(raw_columns)
    self._features = names

    return self
def test_conversion():
    """Every supported container converts to DataFrame, numeric numpy array and sparse array."""
    np.random.seed(123)
    array1 = np.random.randn(10, 3)

    def positive_mask():
        # fresh 0/1 integer array on each call, so the objects below share no buffer
        return 1 * (array1 > 0)

    all_objects = {}
    all_objects["a1"] = (array1, DataTypes.NumpyArray)
    all_objects["a2"] = (positive_mask(), DataTypes.NumpyArray)
    all_objects["a3"] = (array1[:, 1], DataTypes.NumpyArray)
    all_objects["df1"] = (pd.DataFrame(array1, columns=["A", "B", "C"]), DataTypes.DataFrame)
    all_objects["df2"] = (pd.DataFrame(positive_mask(), columns=["a", "b", "c"]), DataTypes.DataFrame)
    all_objects["s1"] = (sparse.csr_matrix(array1), DataTypes.SparseArray)
    all_objects["s2"] = (sparse.csr_matrix(positive_mask()), DataTypes.SparseArray)
    # NOTE: SparseDataFrame inputs ("dfs1" / "dfs2") and the conversion to
    # SparseDataFrame are deliberately left out of this test.

    if _IS_PD1:
        # DataFrame holding a categorical column
        with_category = all_objects["df1"][0].copy()
        with_category["A"] = with_category["A"].astype("category")
        all_objects["df1_cat"] = (with_category, DataTypes.DataFrame)

    for obj, expected_type in all_objects.values():
        assert get_type(obj) == expected_type

        as_frame = convert_to_dataframe(obj)
        assert get_type(as_frame) == DataTypes.DataFrame

        as_array = convert_to_array(obj)
        assert get_type(as_array) == DataTypes.NumpyArray
        assert as_array.dtype.kind in ("i", "f")

        as_sparse = convert_to_sparsearray(obj)
        assert get_type(as_sparse) == DataTypes.SparseArray

    # Round trips must give back the original values
    reference = all_objects["a1"][0]
    assert np.array_equal(convert_to_array(all_objects["df1"][0]), reference)
    assert np.array_equal(convert_to_array(all_objects["s1"][0]), reference)
def fit(self, X, y=None):
    """Learn, for every column of ``X``, the ordered list of categories.

    ``self.categories`` is either the string ``"auto"`` or a container with
    one entry per column. An entry is looked up by position first, then by
    column name; a missing entry, ``None`` or ``"auto"`` means the categories
    of that column are the sorted unique values found in ``X``.

    Sets ``categories_``, ``_all_mapping`` (category -> integer code),
    ``_all_inv_mapping`` (integer code -> category), ``_nb_columns`` and
    ``_input_features``.

    Raises
    ------
    TypeError
        If ``X`` is not a DataFrame, or ``categories`` has not one entry per column.
    ValueError
        If ``X`` contains nulls, or a column holds a value absent from the
        categories explicitly given for it.
    """
    if get_type(X) != DataTypes.DataFrame:
        raise TypeError("This transformer only works for DataFrame")

    if X.isnull().sum().sum() > 0:
        raise ValueError("This transformer doesn't handle null")

    self._nb_columns = X.shape[1]

    is_auto = isinstance(self.categories, str) and self.categories == "auto"
    if not is_auto:
        if len(self.categories) != X.shape[1]:
            raise TypeError("categories should be 'auto' or a list the same size as 'X.shape[1]'")

    # A failed lookup may surface as KeyError (mapping), IndexError (sequence,
    # integer key out of range) or TypeError (sequence indexed by a column name).
    lookup_errors = (KeyError, IndexError, TypeError)

    all_mappings = []
    all_inv_mappings = []
    categories = []
    for j in range(X.shape[1]):
        column_values = X.iloc[:, j].values

        current_category = None
        if not is_auto:
            try:
                current_category = self.categories[j]  # lookup by position
            except lookup_errors:
                pass

            if current_category is None:
                try:
                    current_category = self.categories[X.columns[j]]  # retry with the column name
                except lookup_errors:
                    pass

        if current_category is None or is_auto or (isinstance(current_category, str) and current_category == "auto"):
            target_classes_j = np.sort(np.unique(column_values))
        else:
            target_classes_j = np.array(current_category)
            # every observed value must belong to the categories that were given
            if set(column_values).difference(target_classes_j):
                raise ValueError("I have a categories that doesn't exist, please check")

        integers = np.arange(len(target_classes_j)).astype(self.dtype)
        mapping = pd.Series(integers, index=target_classes_j)
        inv_mapping = pd.Series(target_classes_j, index=integers)

        all_mappings.append(mapping)
        all_inv_mappings.append(inv_mapping)
        categories.append(target_classes_j)

    self.categories_ = categories
    self._all_mapping = all_mappings
    self._all_inv_mapping = all_inv_mappings
    self._input_features = list(X.columns)

    return self
def transform(self, X):
    """Return the columns of ``X`` that were selected during ``fit``.

    ``X`` must have the same data type and the same number of columns as the
    data seen at fit time. DataFrame-like inputs are sliced with ``iloc``
    (integer selection) or ``loc`` (name selection); arrays and sparse arrays
    only accept an integer selection. A COO sparse matrix does not support
    indexing, so it is converted to CSC for the slice and back to COO.

    Raises
    ------
    ValueError
        On a type or shape mismatch, when a selected column is missing from
        ``X``, or when a selection by name is applied to an array.
    """
    self._check_is_fitted()

    Xtype = dsh.get_type(X)
    Xnbcols = dsh._nbcols(X)

    if self._expected_type != Xtype:
        raise ValueError(
            "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
        )

    if self._expected_nbcols != Xnbcols:
        raise ValueError(
            "I don't have the correct number of columns, expected : %d, got : %d" % (self._expected_nbcols, Xnbcols)
        )

    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        if self._columns_to_use_is_integer:
            set_col = set(range(X.shape[1]))
            for l in self._final_columns_to_use:
                if l not in set_col:
                    raise ValueError("Column %d isn't in the column of the DataFrame" % l)

            return X.iloc[:, self._final_columns_to_use]

        set_col = set(X.columns)
        for l in self._final_columns_to_use:
            if l not in set_col:
                raise ValueError("Column %s isn't in the column of the DataFrame" % l)

        return X.loc[:, self._final_columns_to_use]

    # Array / sparse array input
    if not self._columns_to_use_is_integer:
        raise ValueError("columns_to_use must be integers when type if array or sparseArray")

    set_col = set(range(X.shape[1]))
    for l in self._final_columns_to_use:
        if l not in set_col:
            raise ValueError("Column %d isn't in the column of the DataFrame" % l)

    import scipy.sparse  # local import: only needed to recognise COO matrices

    if isinstance(X, scipy.sparse.coo_matrix):
        # COO matrices are not subscriptable: slice in CSC, hand back a COO
        return X.tocsc()[:, self._final_columns_to_use].tocoo()

    return X[:, self._final_columns_to_use]
def fit(self, X, y=None):
    """Learn the modalities of every column and the resulting feature names.

    All the columns of ``X`` are encoded. For each one,
    ``self.modalities_filter`` gives the mapping "modality -> integer index"
    stored in ``self.variable_modality_mapping``.

    * ``encoding_type == "num"``: one output column per input column.
    * ``encoding_type == "dummy"``: one output column per (column, modality),
      named ``"<column>__<modality>"``; ``self._variable_shift[col]`` is the
      offset of the first dummy of ``col`` in the output.

    Raises
    ------
    TypeError
        If ``X`` is not a DataFrame.
    NotImplementedError
        If ``encoding_type`` is neither ``"num"`` nor ``"dummy"``.
    """
    Xtype = get_type(X)
    if Xtype != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")

    Xcolumns = list(X.columns)

    self._columns_to_encode = Xcolumns  # Force to encode everything now

    X = get_rid_of_categories(X)

    # Sanity checks
    if not isinstance(self._columns_to_encode, list):
        raise TypeError("_columns_to_encode should be a list")

    for c in self._columns_to_encode:
        if c not in Xcolumns:
            raise ValueError("column %s isn't in the DataFrame" % c)

    self.variable_modality_mapping = {col: self.modalities_filter(X[col]) for col in self._columns_to_encode}
    # Remark: if we do not want an encoding where modalities are numbered in
    # increasing order, the numbers could be shuffled here.

    if self.encoding_type == "num":
        self._feature_names = self._columns_to_encode
        self.columns_mapping = {c: [c] for c in self._feature_names}

    elif self.encoding_type == "dummy":
        self.columns_mapping = {}
        index_column = {}
        self._variable_shift = {}
        cum_max = 0
        for col in self._columns_to_encode:
            modality_mapping = self.variable_modality_mapping[col]

            dummy_names = []
            for mod, ind in modality_mapping.items():
                dummy_name = col + "__" + str(mod)
                index_column[ind + cum_max] = dummy_name
                dummy_names.append(dummy_name)

            self.columns_mapping[col] = dummy_names
            self._variable_shift[col] = cum_max
            # Shift by the number of modalities of this column. Using the size
            # of the mapping (rather than the last loop index) stays correct
            # when a column has no modality at all.
            cum_max += len(modality_mapping)

        self._dummy_size = cum_max
        self._dummy_feature_names = [index_column[i] for i in range(cum_max)]
        self._feature_names = self._dummy_feature_names

    else:
        raise NotImplementedError("I don't know that type of encoding %s" % self.encoding_type)

    return self
def transform(self, X):
    """Encode ``X`` with the aggregates learnt during ``fit``.

    Raises TypeError when ``X`` is not a DataFrame. The number of output
    columns is checked against ``self.get_feature_names()``.
    """
    input_type = get_type(X)
    if input_type != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")

    cleaned = get_rid_of_categories(X)
    encoded = self._transform_aggregat(cleaned, self._target_aggregat, self._target_aggregat_global)

    expected_names = self.get_feature_names()
    assert encoded.shape[1] == len(expected_names)

    return encoded
def transform(self, X):
    """Encode the DataFrame ``X``; raises TypeError for any other input type."""
    input_type = get_type(X)
    if input_type != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")

    cleaned = get_rid_of_categories(X)
    return self._transform_to_encode(cleaned)
def gen_slice(ob, sl):
    """Select the columns ``sl`` of ``ob``, whatever its container type.

    DataFrame-like objects are sliced positionally with ``iloc``. Everything
    else is sliced with ``ob[:, sl]``; a sparse COO matrix is first copied
    into a CSC matrix.
    """
    kind = get_type(ob)

    if kind in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        return ob.iloc[:, sl]

    if kind == DataTypes.SparseArray and isinstance(ob, scipy.sparse.coo_matrix):
        ob = scipy.sparse.csc_matrix(ob.copy())

    return ob[:, sl]
def transform(self, X):
    """Encode ``X`` and put the columns listed in ``self._columns_to_keep`` in front.

    Raises TypeError when ``X`` is not a DataFrame. When there is no column
    to keep, the encoded data is returned alone.
    """
    input_type = get_type(X)
    if input_type != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")

    encoded = self._transform_to_encode(X)

    if len(self._columns_to_keep) == 0:
        return encoded

    kept = X.loc[:, self._columns_to_keep]
    return generic_hstack([kept, encoded])
def transform(self, X):
    """Return the columns of ``X`` chosen during ``fit``.

    The input must have the type seen at fit time and, when
    ``self.raise_if_shape_differs`` is set, the same number of columns.
    When everything is kept, ``X`` is returned as-is (no copy). Otherwise
    DataFrame-like inputs are sliced by position or by name, arrays and
    sparse arrays by position only.

    Raises ValueError on a type/shape mismatch, on a selected column missing
    from ``X``, or on a selection by name applied to an array.
    """
    self._check_is_fitted()

    observed_type = dsh.get_type(X)
    observed_nbcols = dsh._nbcols(X)

    if self._expected_type != observed_type:
        raise ValueError(
            "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, observed_type)
        )

    # TODO : remove that check in some cases
    if self.raise_if_shape_differs and self._expected_nbcols != observed_nbcols:
        raise ValueError(
            "I don't have the correct number of columns, expected : %d, got : %d"
            % (self._expected_nbcols, observed_nbcols)
        )

    if self._return_data_as_inputed:
        # nothing to select: hand back the very same object, so no copy is made
        return X

    is_frame = self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame)
    selection = self._final_columns_to_use

    if self._columns_to_use_is_integer:
        valid_positions = set(range(X.shape[1]))
        for position in selection:
            if position not in valid_positions:
                raise ValueError("Column %d isn't in the column of the DataFrame" % position)

        if is_frame:
            return X.iloc[:, selection]

        if isinstance(X, sps.coo_matrix):
            # COO matrices are not subscriptable: go through CSC and back
            return X.tocsc()[:, selection].tocoo()

        return X[:, selection]

    if not is_frame:
        raise ValueError("columns_to_use must be integers when type if array or sparseArray")

    valid_names = set(X.columns)
    for column_name in selection:
        if column_name not in valid_names:
            raise ValueError("Column %s isn't in the column of the DataFrame" % column_name)

    return X.loc[:, selection]
def test_generic_hstack_sparse_and_category(with_cat, force_sparse):
    """Stack a DataFrame (optionally with a categorical column) and a sparse array.

    ``force_sparse`` lowers the cell budget given to generic_hstack to 10, so
    that a sparse result is expected; otherwise the budget is 1000010.
    """
    frame = pd.DataFrame({"a": 10 + np.arange(10), "b": np.random.randn(10)})
    if with_cat:
        frame["a"] = frame["a"].astype("category")

    # NOTE(review): randint(0, 1) can only draw zeros -- confirm this is intended
    sparse_part = convert_to_sparsearray(np.random.randint(0, 1, size=(10, 2)))

    cell_budget = 10 + (1 - force_sparse) * 1000000
    stacked = generic_hstack((frame, sparse_part), max_number_of_cells_for_non_sparse=cell_budget)

    expected_shape = (frame.shape[0], frame.shape[1] + sparse_part.shape[1])
    assert stacked.shape == expected_shape

    if force_sparse:
        assert get_type(stacked) == DataTypes.SparseArray
    elif with_cat:
        # the categorical dtype must survive the concatenation
        assert stacked.dtypes["a"] == "category"
        assert isinstance(stacked, pd.DataFrame)
def transform(self, X):
    """Scale ``X`` with the fitted scaler, then project it on ``self.components_``.

    Raises
    ------
    NotFittedError
        If the scaler or the components are missing.
    TypeError
        If ``X`` is not of the type seen during fit.
    ValueError
        If ``X`` does not have the number of columns seen during fit.
    """
    is_unfitted = self._scaler is None or self.components_ is None
    if is_unfitted:
        raise NotFittedError("You should fit the model first")

    if get_type(X) != self._input_type:
        type_message = "X should be a the same type as when fitted : %s, instead I got %s"
        raise TypeError(type_message % (self._input_type, type(X)))

    if X.shape[1] != self._nb_cols:
        shape_message = "X should have the same number of columns has when fitted (%d), instead I got %d"
        raise ValueError(shape_message % (self._nb_cols, X.shape[1]))

    scaled = self._scaler.transform(X)
    return np.dot(scaled, self.components_)
def test_TruncatedSVDWrapper():
    """Check TruncatedSVDWrapper outputs for the different column-handling options.

    The same three scenarios are run twice on a DataFrame (columns given by
    name, then by regex), followed by two scenarios on a numpy array with an
    integer selection.
    """
    df = get_sample_df(100, seed=123)
    cols = ["num_col_%d" % j for j in range(10)]
    for col in cols:
        df[col] = np.random.randn(df.shape[0])

    svd_names = ["SVD__%d" % j for j in range(5)]
    untouched = ["float_col", "int_col", "text_col"]

    column_selections = (
        {"columns_to_use": cols},  # explicit column names
        {"columns_to_use": ["num_col_"], "regex_match": True},  # same columns, through a regex
    )

    for selection in column_selections:
        # 1) regular case : the other columns are dropped
        svd = TruncatedSVDWrapper(n_components=5, **selection)
        res1 = svd.fit_transform(df)

        assert res1.shape == (100, 5)
        assert get_type(res1) == DataTypes.DataFrame
        assert list(res1.columns) == svd_names
        assert not res1.isnull().any().any()
        assert svd.get_feature_names() == list(res1.columns)

        # 2) the original columns are kept as well
        svd = TruncatedSVDWrapper(n_components=5, drop_used_columns=False, drop_unused_columns=False, **selection)
        res2 = svd.fit_transform(df)

        assert res2.shape == (100, 5 + df.shape[1])
        assert get_type(res2) == DataTypes.DataFrame
        assert list(res2.columns) == list(df.columns) + svd_names
        assert svd.get_feature_names() == list(df.columns) + svd_names
        assert not res2.isnull().any().any()
        assert (res2.loc[:, list(df.columns)] == df).all().all()

        # 3) only the untouched columns are kept
        svd = TruncatedSVDWrapper(n_components=5, drop_used_columns=True, drop_unused_columns=False, **selection)
        res3 = svd.fit_transform(df)

        assert res3.shape == (100, 3 + 5)
        assert list(res3.columns) == untouched + svd_names
        assert svd.get_feature_names() == untouched + svd_names
        assert (res3.loc[:, untouched] == df.loc[:, untouched]).all().all()

    # Same checks on a numpy array, with an integer selection
    xx = df.values
    columns_to_use = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    input_features = ["COL_%d" % i for i in range(xx.shape[1])]

    # used columns are dropped
    svd = TruncatedSVDWrapper(
        n_components=5, columns_to_use=columns_to_use, drop_used_columns=True, drop_unused_columns=False
    )
    res4 = svd.fit_transform(xx)

    assert list(res4.columns) == [0, 1, 2] + svd_names
    assert svd.get_feature_names() == [0, 1, 2] + svd_names
    assert svd.get_feature_names(input_features) == ["COL_0", "COL_1", "COL_2"] + svd_names

    # everything is kept
    svd = TruncatedSVDWrapper(
        n_components=5, columns_to_use=columns_to_use, drop_used_columns=False, drop_unused_columns=False
    )
    res2 = svd.fit_transform(xx)

    all_positions = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    assert list(res2.columns) == all_positions + svd_names
    assert svd.get_feature_names() == all_positions + svd_names
    assert svd.get_feature_names(input_features) == input_features + svd_names
def fit(self, X, y=None):
    """Resolve ``columns_to_use`` / ``columns_to_drop`` against ``X``.

    Records the type and number of columns of ``X``, then computes the final
    list of columns to select (stored in ``self._final_columns_to_use``)
    together with a flag telling whether that list holds integer positions
    (``self._columns_to_use_is_integer``).

    Four cases are handled:

    1. DataFrame-like input + integer selection (regex not allowed)
    2. DataFrame-like input + selection by name (2a: regex, 2b: exact names)
    3. array-like input + integer selection (regex not allowed)
    4. array-like input + selection by name: always an error

    Raises
    ------
    ValueError
        If ``columns_to_use`` is empty, if the two lists mix integers and
        strings, if ``regex_match`` is used with an integer selection, if a
        requested column does not exist, or in case 4.

    Returns
    -------
    self
    """
    self._expected_type = dsh.get_type(X)
    self._expected_nbcols = dsh._nbcols(X)

    ### Columns to use ###
    if self.columns_to_use is None:
        # None means "no restriction": every column is a candidate
        list_columns_to_use = None  # [i for i in range(self._expected_nbcols)]
    else:
        list_columns_to_use = self.convert_to_list(cols_list=self.columns_to_use)

    ### Columns to drop ###
    if self.columns_to_drop is None:
        list_columns_to_drop = None
    else:
        list_columns_to_drop = self.convert_to_list(cols_list=self.columns_to_drop)

    if list_columns_to_use is not None and len(list_columns_to_use) == 0:
        raise ValueError("columns_to_use is empty")

    ### What is the type of columns_to_use and columns_to_drop :
    # The kind of selection is inferred from the first element only: it counts
    # as an integer when the name of its type contains "int".
    if list_columns_to_use is not None:
        is_int = "int" in str(type(list_columns_to_use[0]))
    else:
        is_int = None

    if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
        is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
    else:
        # nothing to drop: follow whatever 'columns_to_use' says
        is_int_to_drop = is_int

    ### Verify type:
    if is_int is not None and is_int_to_drop is not None:
        if is_int != is_int_to_drop:
            raise ValueError(
                "Please be consistent between columns_to_use and columns_to_drop, both can be integer or str, but they should have the same type"
            )

    # Nothing specified at all: default to an integer selection
    if is_int is None and is_int_to_drop is None:
        is_int = True
        is_int_to_drop = True

    # Only one of the two is known: the other one follows
    if is_int is None and is_int_to_drop is not None:
        is_int = is_int_to_drop

    if is_int_to_drop is None and is_int is not None:
        is_int_to_drop = is_int

    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        if is_int:
            ##############################################
            ### Case 1 : DataFrame + Integer selection ###
            ##############################################
            if self.regex_match:
                #######################
                ## Case 1a : + Regex ##
                #######################
                raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

            cols_set = set(range(self._expected_nbcols))
            if list_columns_to_use is not None:
                # Check all column are available
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = list_columns_to_use
                # final_columns_to_use = intersect( list_columns_to_use , list(range(self._expected_nbcols)) )
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                # NOTE(review): `diff` is defined elsewhere; presumably a list
                # difference (first minus second) -- confirm
                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

        else:
            #############################################
            ### Case 2 : DataFrame + String selection ###
            #############################################
            if self.regex_match:
                #######################
                ## Case 2a : + Regex ##
                #######################
                # a column matches as soon as one of the patterns is found in its name
                if list_columns_to_use is not None:
                    cols_that_match = []
                    for col in list(X.columns):
                        for r in list_columns_to_use:
                            if re.search(r, col) is not None:
                                cols_that_match.append(col)
                                break

                if list_columns_to_drop is not None:
                    cols_that_match_drop = []
                    for col in list(X.columns):
                        for r in list_columns_to_drop:
                            if re.search(r, col) is not None:
                                cols_that_match_drop.append(col)
                                break

                if list_columns_to_use is not None:
                    final_columns_to_use = cols_that_match
                    # final_columns_to_use = intersect(cols_that_match , list(X.columns)) # technically the intersect is useless
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

            else:
                ########################
                ## Case 2b : no Regex ##
                ########################
                cols_set = set(X.columns)

                if list_columns_to_use is not None:
                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = list_columns_to_use  # intersect(list_columns_to_use, list(X.columns))
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

    else:
        if is_int:
            ##########################################
            ### Case 3 : Array + Integer selection ###
            ##########################################
            if self.regex_match:
                ########################
                ## Case 3a : + Regex ##
                ########################
                raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

            ########################
            ## Case 3b : no Regex ##
            ########################
            cols_set = set(range(self._expected_nbcols))

            if list_columns_to_use is not None:
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                # NOTE(review): `intersect` is defined elsewhere; unlike case 1
                # the requested list goes through it here -- confirm the
                # resulting order is the intended one
                final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

        else:
            #########################################
            ### Case 4 : Array + String selection ###
            #########################################
            raise ValueError("columns_to_use must be integers when type is array or sparseArray")

    self._columns_to_use_is_integer = is_int
    self._final_columns_to_use = final_columns_to_use

    # Names of the input columns: real names for DataFrame-like inputs, positions otherwise
    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        self._Xcolumns = list(X.columns)
    else:
        self._Xcolumns = list(range(self._expected_nbcols))

    ## TODO : here make a simplification into a slice when it is possible

    self._already_fitted = True

    return self
def fit(self, X, y=None):
    """Resolve ``columns_to_use`` / ``columns_to_drop`` against ``X``.

    Records the type and number of columns of ``X``, then computes the final
    list of columns to select (``self._final_columns_to_use``) and whether
    that list holds integer positions (``self._columns_to_use_is_integer``).

    Two shortcuts exist:

    * ``columns_to_use == "all"`` with no ``columns_to_drop``: the instance is
      flagged (``self._return_data_as_inputed``) so the data can later be
      returned as-is.
    * the resolved list of columns to use is empty: nothing is selected and
      the method returns immediately.

    Otherwise four cases are handled:

    1. DataFrame-like input + integer selection (regex not allowed)
    2. DataFrame-like input + selection by name (2a: regex, 2b: exact names)
    3. array-like input + integer selection (regex not allowed)
    4. array-like input + selection by name: always an error

    In these cases a resolved list equal to ``None`` means "every column" for
    ``columns_to_use``; for ``columns_to_drop`` it empties the selection in
    cases 1, 2b and 3.

    Raises
    ------
    ValueError
        If the two lists mix integers and strings, if ``regex_match`` is used
        with an integer selection, if a requested column does not exist, or
        in case 4.

    Returns
    -------
    self
    """
    self._expected_type = dsh.get_type(X)
    self._expected_nbcols = dsh._nbcols(X)

    is_frame = self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame)

    ######################################
    ### Special case : keep everything ###
    ######################################
    self._return_data_as_inputed = False
    if isinstance(self.columns_to_use, str) and self.columns_to_use == "all" and self.columns_to_drop is None:
        self._already_fitted = True
        self._columns_to_use_is_integer = True
        # positions of *all the columns*: sized by the number of columns, not of rows
        self._final_columns_to_use = list(range(self._expected_nbcols))
        self._return_data_as_inputed = True

        if is_frame:
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))
        # NOTE(review): there is no early return here, so the generic
        # resolution below still runs and overwrites these attributes
        # -- confirm this is intended

    ### Columns to use ###
    list_columns_to_use = self._get_list_of_columns(columns=self.columns_to_use, X=X, regex_match=self.regex_match)
    list_columns_to_drop = self._get_list_of_columns(
        columns=self.columns_to_drop, X=X, regex_match=self.regex_match
    )

    #################################
    ### Special case : no columns ###
    #################################
    if list_columns_to_use is not None and len(list_columns_to_use) == 0:
        # This means that there is nothing to do : no columns will be kept
        self._already_fitted = True
        self._columns_to_use_is_integer = True
        self._final_columns_to_use = []

        if is_frame:
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))

        return self

    ### What is the type of columns_to_use and columns_to_drop :
    # The kind of selection is inferred from the first element only: it counts
    # as an integer when the name of its type contains "int".
    if list_columns_to_use is not None:
        is_int = "int" in str(type(list_columns_to_use[0]))
    else:
        is_int = None

    if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
        is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
    else:
        # nothing usable in 'columns_to_drop': follow whatever 'columns_to_use' says
        is_int_to_drop = is_int

    ### Verify type:
    if is_int is not None and is_int_to_drop is not None:
        if is_int != is_int_to_drop:
            raise ValueError(
                "Please be consistent between 'columns_to_use' and 'columns_to_drop', both can be integer or str, but they should have the same type"
            )

    # Nothing specified at all: default to an integer selection
    if is_int is None and is_int_to_drop is None:
        is_int = True
        is_int_to_drop = True

    # Only one of the two is known: the other one follows
    if is_int is None and is_int_to_drop is not None:
        is_int = is_int_to_drop

    if is_int_to_drop is None and is_int is not None:
        is_int_to_drop = is_int

    if is_frame:
        if is_int:
            ##############################################
            ### Case 1 : DataFrame + Integer selection ###
            ##############################################
            if self.regex_match:
                #######################
                ## Case 1a : + Regex ##
                #######################
                raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

            cols_set = set(range(self._expected_nbcols))
            if list_columns_to_use is not None:
                # Check all column are available
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = list_columns_to_use
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)
            else:
                final_columns_to_use = []

        else:
            #############################################
            ### Case 2 : DataFrame + String selection ###
            #############################################
            if self.regex_match:
                #######################
                ## Case 2a : + Regex ##
                #######################
                # a column matches as soon as one of the patterns is found in its name
                if list_columns_to_use is not None:
                    cols_that_match = []
                    for col in list(X.columns):
                        for r in list_columns_to_use:
                            if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                cols_that_match.append(col)
                                break

                if list_columns_to_drop is not None:
                    cols_that_match_drop = []
                    for col in list(X.columns):
                        for r in list_columns_to_drop:
                            if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                cols_that_match_drop.append(col)
                                break

                if list_columns_to_use is not None:
                    final_columns_to_use = cols_that_match
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

            else:
                ########################
                ## Case 2b : no Regex ##
                ########################
                cols_set = set(X.columns)

                if list_columns_to_use is not None:
                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = list_columns_to_use
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)
                else:
                    final_columns_to_use = []

    else:
        if is_int or is_int is None:
            ##########################################
            ### Case 3 : Array + Integer selection ###
            ##########################################
            if self.regex_match:
                ########################
                ## Case 3a : + Regex ##
                ########################
                raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

            ########################
            ## Case 3b : no Regex ##
            ########################
            cols_set = set(range(self._expected_nbcols))

            if list_columns_to_use is not None:
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)
            else:
                final_columns_to_use = []

        else:
            #########################################
            ### Case 4 : Array + String selection ###
            #########################################
            raise ValueError("columns_to_use must be integers when type is array or sparseArray")

    self._columns_to_use_is_integer = is_int
    self._final_columns_to_use = final_columns_to_use

    # Names of the input columns: real names for DataFrame-like inputs, positions otherwise
    if is_frame:
        self._Xcolumns = list(X.columns)
    else:
        self._Xcolumns = list(range(self._expected_nbcols))

    ## TODO : here make a simplification into a slice when it is possible

    self._already_fitted = True

    return self
def fit(self, X, y):
    """Learn the per-column target aggregates used to encode ``X``.

    Every column of ``X`` is encoded and none is kept as-is. Besides the
    aggregates (``_target_aggregat`` / ``_target_aggregat_global``), the
    method stores the target information (``target_classes`` and
    ``global_std``), a per-column flag ``_na_to_null`` telling whether nulls
    are frequent enough to be treated as a modality of their own, and the
    output feature names.

    Raises
    ------
    ValueError
        If ``y`` is None.
    TypeError
        If ``X`` is not a DataFrame.
    """
    if y is None:
        raise ValueError("I need a value for 'y'")

    self._random_gen = check_random_state(self.random_state)

    if get_type(X) != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")

    known_columns = list(X.columns)
    target = y if isinstance(y, pd.Series) else pd.Series(y)

    # Columns to encode: all of them
    self._columns_to_encode = list(X.columns)

    X = get_rid_of_categories(X)

    # Sanity checks on the columns to encode
    if not isinstance(self._columns_to_encode, list):
        raise TypeError("_columns_to_encode should be a list")
    for name in self._columns_to_encode:
        if name not in known_columns:
            raise ValueError("column %s isn't in the DataFrame" % name)

    # Columns to keep: none
    self._columns_to_keep = []

    # Sanity checks on the columns to keep
    if not isinstance(self._columns_to_keep, list):
        raise TypeError("_columns_to_keep should be a list")
    for name in self._columns_to_keep:
        if name not in known_columns:
            raise ValueError("column %s isn't in the DataFrame" % name)

    # Target information
    if self.is_regression:
        self.target_classes = None  # a regressor has no target classes
        self.global_std = np.std(target)
    else:
        self.global_std = None
        classes = list(np.unique(target))
        if len(classes) == 2:
            # two classes: only the second one is kept
            classes = classes[1:]
        self.target_classes = classes

    # Columns for which null should be a special modality
    null_threshold = self.max_na_percentage * len(X)
    self._na_to_null = {col: X[col].isnull().sum() >= null_threshold for col in self._columns_to_encode}

    aggregates = self._fit_aggregat(X, target, noise_level=None)
    self._target_aggregat, self._target_aggregat_global = aggregates

    # Feature names: kept columns first, then the names produced for each encoded column
    self._feature_names = list(self._columns_to_keep)
    for col in self._columns_to_encode:
        self._feature_names.extend(self._get_output_column_name(col=col, target_classes=self.target_classes))

    return self
def verif_encoder_with_data(klass, enc_kwargs, df1, df2, y1, fit_type, additional_conversion_fun, extended_all_types):
    """ check (using asserts) the behavior of a transformer on data

    The transformer is built and fitted in four different ways (fit_transform,
    fit then transform, clone, empty constructor + set_params) and the four
    results are required to agree on shape, type, columns and index.

    Parameters
    ----------
    klass : type
        type of the transformer to test

    enc_kwargs : dict
        kwargs passed to klass to create the transformer

    df1 : array like
        data on which the transformer is fitted

    df2 : array like
        data that is only transformed

    y1 : array like or None
        target passed to fit, when None a fit_transform without 'y' is also tried

    fit_type : DataTypes
        type into which df1 and df2 are converted before being used

    additional_conversion_fun : callable or None
        optional extra conversion applied after the type conversion

    extended_all_types : iterable of (type, callable or None)
        every (type, conversion) couple, used to check that a type different
        from the one seen during fit is rejected
    """
    frame_types = (DataTypes.DataFrame, DataTypes.SparseDataFrame)

    # Put the inputs in the container type under test
    df1_conv = convert_generic(df1, output_type=fit_type)
    df2_conv = convert_generic(df2, output_type=fit_type)
    if additional_conversion_fun is not None:
        df1_conv = additional_conversion_fun(df1_conv)
        df2_conv = additional_conversion_fun(df2_conv)

    if y1 is None:
        # First try : 'y' isn't passed at all, only checks that nothing breaks
        encoder = klass(**enc_kwargs)
        encoder.fit_transform(df1_conv)
        encoder.transform(df2_conv)

    # a) fit_transform with an explicit y (which might be None)
    encoder_a = klass(**enc_kwargs)
    params_0 = encoder_a.get_params()
    df1_transformed_a = encoder_a.fit_transform(df1_conv, y=y1)
    df2_transformed_a = encoder_a.transform(df2_conv)
    params_3 = encoder_a.get_params()

    # Fitting shouldn't modify the params (Rmk : might not be enforced on all transformers)
    rec_assert_equal(params_0, params_3)

    assert df1_transformed_a is not None  # something was created
    assert df2_transformed_a is not None  # something was created

    # A clone of a fitted object must not be fitted
    encoder_cloned = clone(encoder_a)
    assert_raise_not_fitted(encoder_cloned, df2_conv)

    # b) fit and then transform
    encoder_b = klass(**enc_kwargs)
    encoder_b.fit(df1_conv, y=y1)
    df1_transformed_b = encoder_b.transform(df1_conv)
    df2_transformed_b = encoder_b.transform(df2_conv)
    assert df1_transformed_b is not None
    assert df2_transformed_b is not None

    # c) clone of the first encoder
    encoder_c = clone(encoder_a)
    df1_transformed_c = encoder_c.fit_transform(df1_conv, y=y1)
    df2_transformed_c = encoder_c.transform(df2_conv)

    # d) empty object + set_params
    encoder_d = klass()
    encoder_d.set_params(**enc_kwargs)
    df1_transformed_d = encoder_d.fit_transform(df1_conv, y=y1)
    df2_transformed_d = encoder_d.transform(df2_conv)

    encoders = (encoder_a, encoder_b, encoder_c, encoder_d)
    results_1 = (df1_transformed_a, df1_transformed_b, df1_transformed_c, df1_transformed_d)
    results_2 = (df2_transformed_a, df2_transformed_b, df2_transformed_c, df2_transformed_d)
    others = results_1[1:] + results_2  # everything that is compared to df1_transformed_a

    # Wrong number of columns => error
    for enc in encoders:
        assert_raise_value_error(enc, gen_slice(df1_conv, slice(1, None)))

    # Type different from the one present during the fit => error
    for other_type, _ in extended_all_types:
        if fit_type == other_type:
            continue

        df1_conv2 = convert_generic(df1_conv, output_type=other_type)
        for enc in encoders:
            assert_raise_value_error(enc, df1_conv2)

    # Shape : number of rows is the one of the input ...
    for res in results_1:
        assert res.shape[0] == df1_conv.shape[0]
    for res in results_2:
        assert res.shape[0] == df2_conv.shape[0]

    # ... and the number of columns is the same everywhere
    for res in others:
        assert res.shape[1] == df1_transformed_a.shape[1]

    # Type : the same everywhere
    for res in others:
        assert get_type(res) == get_type(df1_transformed_a)

    # When a 'desired_output_type' exists the output must match it
    if "desired_output_type" in enc_kwargs:
        assert get_type(df1_transformed_a) == enc_kwargs["desired_output_type"]

    if getattr(encoder_a, "desired_output_type", None) is not None:
        assert get_type(df1_transformed_a) == encoder_a.desired_output_type

    # Columns : identical everywhere and equal to the feature names
    if get_type(df1_transformed_b) in frame_types:
        reference_columns = list(df1_transformed_a.columns)
        for res in others:
            assert list(res.columns) == reference_columns
        for enc in encoders:
            assert enc.get_feature_names() == reference_columns

    # Index : identical between the 4 ways of fitting ...
    if get_type(df1_transformed_b) in frame_types:
        for res in results_1[1:]:
            assert (res.index == df1_transformed_a.index).all()
        for res in results_2[1:]:
            assert (res.index == df2_transformed_a.index).all()

        # ... and equal to the index of the input when it has one
        if fit_type in frame_types:
            assert (df1_transformed_a.index == df1_conv.index).all()
            assert (df2_transformed_a.index == df2_conv.index).all()
def test__NumImputer():
    """Check _NumImputer / NumImputer on DataFrame, numpy array and sparse inputs.

    First part: data with nulls (asserted below at rows 0 and 5 of column 1),
    the imputers must remove every null and, when ``add_is_null`` is set, append
    an '<col>_isnull' indicator column.
    Second part: data without nulls, the output must keep the input columns.
    """
    xx, xxd, xxs = get_sample_data(add_na=True)
    xxd.index = np.array([0, 1, 2, 3, 4, 10, 11, 12, 12, 14])  # non trivial index, with a duplicate

    rows_without_null = np.array([1, 2, 3, 4, 6, 7, 8, 9])

    # DataFrame entry
    for inp in (_NumImputer(), NumImputer(), _NumImputer(add_is_null=False), NumImputer(add_is_null=False)):
        xx_out = inp.fit_transform(xxd)
        assert (xx_out.index == xxd.index).all()
        assert pd.isnull(xxd.loc[0, "col1"])  # Verify that it is still null
        assert xx_out.isnull().sum().sum() == 0
        assert xx_out["col1"][0] == xxd.loc[~xxd["col1"].isnull(), "col1"].mean()

        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xxd)

        if inp.add_is_null:
            assert inp.get_feature_names() == [
                "col0", "col1", "col2", "col3", "col4", "col5", "col6", "col1_isnull"
            ]
            assert xx_out.shape[1] == 1 + xxd.shape[1]
            assert xx_out["col1_isnull"].iloc[0] == 1
            assert xx_out["col1_isnull"].iloc[5] == 1
            assert (xx_out["col1_isnull"].iloc[rows_without_null] == 0).all()
        else:
            assert xx_out.shape[1] == xxd.shape[1]
            assert inp.get_feature_names() == [
                "col0", "col1", "col2", "col3", "col4", "col5", "col6"
            ]

    # A null in a column that had none during fit must be refused
    # when 'allow_unseen_null' is False
    inp = _NumImputer(add_is_null=False, allow_unseen_null=False)
    inp.fit(xxd)
    xxd2 = xxd.copy()
    xxd2.iloc[0, 3] = np.nan
    try:
        inp.transform(xxd2)
    except ValueError:
        pass
    else:
        raise AssertionError("Model should have fail its transformation")

    input_features = ["COL_%d" % i for i in range(xx.shape[1])]

    # Numpy array
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xx)
        assert pd.isnull(xx[0, 1])
        assert pd.isnull(xx_out).sum() == 0
        assert xx_out.shape[1] == 1 + xx.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xx)
        assert inp.get_feature_names() == [
            "0", "1", "2", "3", "4", "5", "6", "1_isnull"
        ]
        assert inp.get_feature_names(input_features) == input_features + ["COL_1_isnull"]
        assert xx_out[0, 7] == 1
        assert xx_out[5, 7] == 1
        assert (xx_out[rows_without_null, 7] == 0).all()

    # Sparse Array
    for inp in (_NumImputer(), NumImputer()):
        for f in (sps.coo_matrix, sps.csc_matrix, sps.csr_matrix):
            xxsf = f(xxs.copy())
            xx_out = inp.fit_transform(xxsf)
            assert pd.isnull(xxs[0, 1])
            assert pd.isnull(xx_out.todense()).sum() == 0
            assert get_type(xx_out) == get_type(xxs)
            assert xx_out.shape[1] == 1 + xxs.shape[1]
            assert xx_out.shape[0] == xx.shape[0]
            assert inp.get_feature_names() == [
                "0", "1", "2", "3", "4", "5", "6", "1_isnull"
            ]
            assert inp.get_feature_names(input_features) == input_features + ["COL_1_isnull"]
            # Both rows holding a null must be flagged, as in the DataFrame
            # and numpy sections (row 5 used to be skipped: row 0 was tested twice)
            assert xx_out.todense()[0, 7] == 1
            assert xx_out.todense()[5, 7] == 1
            assert (xx_out.todense()[rows_without_null, 7] == 0).all()

    xx, xxd, xxs = get_sample_data(add_na=False)
    xxd.index = np.array([0, 1, 2, 3, 4, 10, 11, 12, 12, 14])

    # DataFrame entry
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xxd)
        assert (xx_out.index == xxd.index).all()
        assert xx_out.isnull().sum().sum() == 0
        assert xx_out.shape[1] == xxd.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xxd)
        assert inp.get_feature_names() == [
            "col0", "col1", "col2", "col3", "col4", "col5", "col6"
        ]

    # Numpy array
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xx)
        assert pd.isnull(xx_out).sum() == 0
        assert xx_out.shape[1] == xx.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xx)
        assert inp.get_feature_names() == ["0", "1", "2", "3", "4", "5", "6"]
        assert inp.get_feature_names(input_features=input_features) == input_features

    # Sparse Array
    for inp in (_NumImputer(), NumImputer()):
        for f in (sps.coo_matrix, sps.csc_matrix, sps.csr_matrix):
            xxs_f = f(xxs.copy())
            xx_out = inp.fit_transform(xxs_f)
            assert pd.isnull(xx_out.todense()).sum() == 0
            assert get_type(xx_out) == get_type(xxs)
            assert xx_out.shape[1] == xxs.shape[1]
            assert xx_out.shape[0] == xx.shape[0]
            assert inp.get_feature_names() == [
                "0", "1", "2", "3", "4", "5", "6"
            ]
            assert inp.get_feature_names(input_features=input_features) == input_features
def test_PCAWrapper():
    """Check PCAWrapper outputs (shape, type, names) for each 'keep_other_columns' mode.

    Tested on a DataFrame first, then on the underlying numpy array where the
    columns are designated by their integer position.
    """
    df = get_sample_df(100, seed=123)

    # Add 10 random numerical columns, the PCA is computed on those
    cols = ["num_col_%d" % j for j in range(10)]
    for col_name in cols:
        df[col_name] = np.random.randn(df.shape[0])

    pca_names = ["PCA__%d" % j for j in range(5)]
    untouched = ["float_col", "int_col", "text_col"]

    # 0) n_components > n_features
    nb_expected = len(cols) - 1
    pca = PCAWrapper(n_components=15, columns_to_use=cols)
    res0 = pca.fit_transform(df)
    assert res0.shape == (100, nb_expected)
    assert get_type(res0) == DataTypes.DataFrame
    assert list(res0.columns) == ["PCA__%d" % j for j in range(nb_expected)]
    assert not res0.isnull().any().any()
    assert pca.get_feature_names() == list(res0.columns)

    # 1) regular case : drop other columns
    pca = PCAWrapper(n_components=5, columns_to_use=cols)
    res1 = pca.fit_transform(df)
    assert res1.shape == (100, 5)
    assert get_type(res1) == DataTypes.DataFrame
    assert list(res1.columns) == pca_names
    assert not res1.isnull().any().any()
    assert pca.get_feature_names() == list(res1.columns)

    # 2) we keep the original columns as well
    pca = PCAWrapper(n_components=5, columns_to_use=cols, keep_other_columns="keep")
    res2 = pca.fit_transform(df)
    original_and_pca = list(df.columns) + pca_names
    assert res2.shape == (100, 5 + df.shape[1])
    assert get_type(res2) == DataTypes.DataFrame
    assert list(res2.columns) == original_and_pca
    assert pca.get_feature_names() == original_and_pca
    assert not res2.isnull().any().any()
    assert (res2.loc[:, list(df.columns)] == df).all().all()

    # 3) Keep only the un-touch column
    pca = PCAWrapper(n_components=5, columns_to_use=["num_col_"], keep_other_columns="delta", regex_match=True)
    res3 = pca.fit_transform(df)
    assert res3.shape == (100, 3 + 5)
    assert list(res3.columns) == untouched + pca_names
    assert pca.get_feature_names() == untouched + pca_names
    assert (res3.loc[:, untouched] == df.loc[:, untouched]).all().all()

    # Delta with numpy ###
    xx = df.values
    columns_to_use = list(range(3, 13))
    input_features = ["COL_%d" % i for i in range(xx.shape[1])]

    pca = PCAWrapper(n_components=5, columns_to_use=columns_to_use, keep_other_columns="delta")
    res4 = pca.fit_transform(xx)
    assert list(res4.columns) == [0, 1, 2] + pca_names
    assert pca.get_feature_names() == [0, 1, 2] + pca_names
    assert pca.get_feature_names(input_features) == ["COL_0", "COL_1", "COL_2"] + pca_names

    # Keep
    pca = PCAWrapper(n_components=5, columns_to_use=columns_to_use, keep_other_columns="keep")
    res2 = pca.fit_transform(xx)
    all_positions = list(range(13))
    assert list(res2.columns) == all_positions + pca_names
    assert pca.get_feature_names() == all_positions + pca_names
    assert pca.get_feature_names(input_features) == input_features + pca_names
def verif_model(df1, df2, y1, klass, model_kwargs, all_types, is_classifier):
    """ helper function that check (using asserts) a bunch a thing on a model klass

    Parameters
    ----------
    df1 : array like
        data on which model will be trained

    df2 : array like
        data on which model will be tested

    y1 : array like
        target associated with df1, its `ndim` is compared to the one of the predictions

    klass : type
        type of the model to test

    model_kwargs : dict
        kwargs to be passed to klass to create a model

    all_types : list of type
        list of input type to test the models on

    is_classifier : boolean
        if True the model is a Classifier otherwise a Regressor
    """
    if not isinstance(all_types, (list, tuple)):
        all_types = (all_types,)

    model0 = klass(**model_kwargs)  # Create an object ...
    model1 = clone(model0)  # then try to clone it

    model2 = klass()  # Create an empty object and then set its params
    model2.set_params(**model_kwargs)

    # Verify type are identical
    assert type(model0) is type(model1)
    assert type(model0) is type(model2)

    assert hasattr(klass, "fit")
    assert hasattr(klass, "predict")
    if is_classifier:
        assert hasattr(klass, "predict_proba")

    # Verify get_params are identical
    params_0 = model0.get_params()
    params_1 = model1.get_params()
    params_2 = model2.get_params()

    rec_assert_equal(params_0, params_1)
    rec_assert_equal(params_0, params_2)

    rec_assert_equal({k: v for k, v in params_0.items() if k in model_kwargs}, model_kwargs)
    rec_assert_equal({k: v for k, v in params_1.items() if k in model_kwargs}, model_kwargs)
    rec_assert_equal({k: v for k, v in params_2.items() if k in model_kwargs}, model_kwargs)

    extended_all_types = extend_all_type(all_types)

    if is_classifier:
        yclasses = list(set(np.unique(y1)))
        nb_classes = len(yclasses)

    for fit_type, additional_conversion_fun in extended_all_types:

        # Convert inputs into several type ..
        df1_conv = convert_generic(df1, output_type=fit_type)
        df2_conv = convert_generic(df2, output_type=fit_type)

        if additional_conversion_fun is not None:
            df1_conv = additional_conversion_fun(df1_conv)
            df2_conv = additional_conversion_fun(df2_conv)

        model_a = klass(**model_kwargs)
        model_a.fit(df1_conv, y=y1)

        y1_hat_a = model_a.predict(df1_conv)
        y2_hat_a = model_a.predict(df2_conv)

        if is_classifier:
            y1_hatproba_a = model_a.predict_proba(df1_conv)
            y2_hatproba_a = model_a.predict_proba(df2_conv)

        # Verif that get_params didn't change after fit
        # Rmk : might not be enforced on all transformers
        params_3 = model_a.get_params()
        rec_assert_equal(params_0, params_3)

        assert y1_hat_a is not None  # verify that something was created
        assert y2_hat_a is not None  # verify that something was created

        # Clone again ... and verify that the clone isn't fitted, even if model_a is fitted
        model_cloned = clone(model_a)
        assert_raise_not_fitted(model_cloned, df2_conv)

        # Same thing but using clone
        model_b = clone(model_a)
        model_b.fit(df1_conv, y=y1)

        y1_hat_b = model_b.predict(df1_conv)
        y2_hat_b = model_b.predict(df2_conv)
        if is_classifier:
            y1_hatproba_b = model_b.predict_proba(df1_conv)
            y2_hatproba_b = model_b.predict_proba(df2_conv)

        # Same thing but with set_params
        model_c = klass()
        model_c.set_params(**model_kwargs)
        model_c.fit(df1_conv, y=y1)

        y1_hat_c = model_c.predict(df1_conv)
        y2_hat_c = model_c.predict(df2_conv)
        if is_classifier:
            y1_hatproba_c = model_c.predict_proba(df1_conv)
            y2_hatproba_c = model_c.predict_proba(df2_conv)

        # check error when call with too few columns
        assert_raise_value_error(model_a, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(model_b, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(model_c, gen_slice(df1_conv, slice(1, None)))

        # Verif shape : one prediction per row
        assert y1_hat_a.shape[0] == df1_conv.shape[0]
        assert y1_hat_b.shape[0] == df1_conv.shape[0]
        assert y1_hat_c.shape[0] == df1_conv.shape[0]

        assert y2_hat_a.shape[0] == df2_conv.shape[0]
        assert y2_hat_b.shape[0] == df2_conv.shape[0]
        assert y2_hat_c.shape[0] == df2_conv.shape[0]

        # Predictions have the same number of dimensions as the target
        assert y1_hat_a.ndim == y1.ndim
        assert y1_hat_b.ndim == y1.ndim
        assert y1_hat_c.ndim == y1.ndim

        assert y2_hat_a.ndim == y1.ndim
        assert y2_hat_b.ndim == y1.ndim
        assert y2_hat_c.ndim == y1.ndim

        if is_classifier:
            all_probas = (
                y1_hatproba_a,
                y1_hatproba_b,
                y1_hatproba_c,
                y2_hatproba_a,
                y2_hatproba_b,
                y2_hatproba_c,
            )

            for proba in all_probas:
                assert proba.ndim == 2
                # one column of probabilities per class
                # (this comparison used to be a bare expression, without 'assert')
                assert proba.shape[1] == nb_classes

            for model in (model_a, model_b, model_c):
                assert hasattr(model, "classes_")
                # compared as sets : the order of iteration of a set isn't guaranteed
                assert set(model.classes_) == set(yclasses)

            for f in (check_all_numerical, check_between_01, check_no_null):
                for proba in all_probas:
                    f(proba)

        # Verif type
        assert get_type(y1_hat_b) == get_type(y1_hat_a)
        assert get_type(y1_hat_c) == get_type(y1_hat_a)
        assert get_type(y2_hat_a) == get_type(y1_hat_a)
        assert get_type(y2_hat_b) == get_type(y1_hat_a)
        assert get_type(y2_hat_c) == get_type(y1_hat_a)
def _fit_transform(self, X, y, is_fit, is_transform, fit_params=None):
    """ internal method that handle the fit and the transform

    The same code path serves 'fit' (is_fit only), 'transform' (is_transform only)
    and 'fit_transform' (both flags).

    Parameters
    ----------
    X : array like
        input data, the columns to use are extracted from it with a ColumnsSelector

    y : array like or None
        target, forwarded to the underlying model(s)

    is_fit : boolean
        if True the selector and the underlying model(s) are (re)created and fitted

    is_transform : boolean
        if True the transformed data is computed and returned

    fit_params : dict or None
        additional keyword arguments forwarded to the fit / fit_transform of the model(s)

    Returns
    -------
    the transformed data if is_transform is True, otherwise self

    Raises
    ------
    ValueError
        if X has 0 rows, or (when not fitting) if the type, the number of columns
        or the names of the columns differ from what was seen during fit
    NotImplementedError
        if self.selector is None
    """

    if fit_params is None:
        fit_params = {}

    if is_fit:
        # Build the selector in charge of extracting the columns the model works on
        if isinstance(self.columns_to_use, str) and self.columns_to_use == "auto":
            columns = self._get_default_columns_to_use(X, y)
            self.selector = ColumnsSelector(columns_to_use=columns)
        else:
            self.selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

    if hasattr(X, "shape"):
        if X.shape[0] == 0:
            raise ValueError("the X object has 0 rows")

    Xindex = dsh._get_index(X)  # if X has an index retrieve it
    # if self.columns_to_use is not None:
    if is_fit:
        Xsubset = self.selector.fit_transform(X)
    else:
        Xsubset = self.selector.transform(X)
    # TODO (maybe): here allow a preprocessing pipeline
    # if self.has_preprocessing:
    #    if is_fit:
    #        self.preprocessing = self._get_preprocessing()
    #        Xsubset = self.preprocessing.fit_transform(Xsubset)
    #    else:
    #        Xsubset = self.preprocessing.transform(Xsubset)

    # Store columns and shape BEFORE any modification
    if self.selector is not None:
        Xsubset_columns = self.selector.get_feature_names()
    else:
        raise NotImplementedError("should not go there anymore")
        # Xsubset_columns = getattr(Xsubset, "columns", None)

    Xsubset_shape = getattr(Xsubset, "shape", None)
    # TODO : here, use in one way or another a ... (unfinished note), see
    # https://github.com/scikit-learn/scikit-learn/issues/6425

    if is_fit:
        # Remember what the input looks like ...
        self._expected_type = dsh.get_type(Xsubset)
        self._expected_nbcols = dsh._nbcols(Xsubset)
        self._expected_columns = dsh._get_columns(Xsubset)

    else:
        # ... to refuse, later on, an input that looks different
        Xtype = dsh.get_type(Xsubset)
        if Xtype != self._expected_type:
            raise ValueError(
                "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
            )

        nbcols = dsh._nbcols(Xsubset)
        if nbcols != self._expected_nbcols:
            raise ValueError(
                "I don't have the correct nb of colmns as input, expected : %d, got : %d"
                % (self._expected_nbcols, nbcols)
            )

        columns = dsh._get_columns(Xsubset)
        expected_columns = getattr(self, "_expected_columns", None)  # to allow pickle compatibility

        # the names are compared only when both sides have some
        if expected_columns is not None and columns is not None and columns != self._expected_columns:
            raise ValueError("I don't have the correct names of columns")

    # Convert the input into the first accepted type when its own type isn't accepted
    if self.accepted_input_types is not None and self._expected_type not in self.accepted_input_types:
        Xsubset = dsh.convert_generic(
            Xsubset, mapped_type=self._expected_type, output_type=self.accepted_input_types[0]
        )

    if is_fit:
        self._verif_params()
        # '_empty_data' is True when the selected data has a 2nd dimension of size 0
        self._empty_data = False
        s = getattr(Xsubset, "shape", None)
        if s is not None and len(s) > 1 and s[1] == 0:
            self._empty_data = True

    if self.all_columns_at_once or self._empty_data:

        if is_fit:
            self._model = self._get_model(Xsubset, y)

        ##############################################
        ### Apply the model on ALL columns at ONCE ###
        ##############################################

        if self.work_on_one_column_only:
            Xsubset = dsh.make1dimension(Xsubset)  # will generate an error if 2 dimensions
        else:
            Xsubset = dsh.make2dimensions(Xsubset)

        # Call to underlying model
        Xres = None
        if is_fit and is_transform:
            ##############################
            ###  fit_transform method  ###
            ##############################
            # test if the the data to transform actually has some columns
            if not self._empty_data:
                # normal case
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                # It means there is no columns to transform
                Xres = Xsubset  # don't do anything

        elif is_fit and not is_transform:
            ####################
            ###  fit method  ###
            ####################
            if self.must_transform_to_get_features_name:
                # the result is computed only because it is needed to find the feature names
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                self._model.fit(Xsubset, y, **fit_params)
        else:
            ####################
            ###  transform   ###
            ####################
            if not self._empty_data:
                Xres = self._model.transform(Xsubset)
            else:
                Xres = Xsubset

        if is_fit:
            self._columns_informations = {
                "output_columns": getattr(Xres, "columns", None),  # names of transformed columns if exist
                "output_shape": getattr(Xres, "shape", None),  # shape of transformed result if exist
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = self.try_to_find_feature_names_all_at_once(
                output_columns=self._columns_informations["output_columns"],
                output_shape=self._columns_informations["output_shape"],
                input_columns=self._columns_informations["input_columns"],
                input_shape=self._columns_informations["input_shape"],
            )
            # self.kept_features_names = None # for now

        if is_transform:
            # Put the result in the desired type and give it the index of the input
            Xres = dsh.convert_generic(Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    else:
        ########################################
        ### Apply the model COLUMN BY COLUMN ###
        ########################################
        if is_fit:
            self._models = []

        # Results are collected only if they are returned or needed to find the feature names
        if is_transform or self.must_transform_to_get_features_name:
            all_Xres = []
        else:
            all_Xres = None

        Xsubset = dsh.make2dimensions(Xsubset)

        for j in range(self._expected_nbcols):
            # Extract the j-th column
            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame, DataTypes.Serie):
                Xsubset_j = Xsubset.iloc[:, j]
            else:
                Xsubset_j = Xsubset[:, j]

            if is_fit:
                # NOTE(review): the model is built from the full 'Xsubset', not from 'Xsubset_j' -- confirm it is intended
                sub_model = self._get_model(Xsubset, y)
                self._models.append(sub_model)
            else:
                sub_model = self._models[j]

            if not self.work_on_one_column_only:
                Xsubset_j = dsh.make2dimensions(Xsubset_j)

            if is_fit and is_transform:
                # fit_transform method
                Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                all_Xres.append(Xres_j)

            elif is_fit and not is_transform:
                # fit method
                if self.must_transform_to_get_features_name:
                    Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                    all_Xres.append(Xres_j)
                else:
                    sub_model.fit(Xsubset_j, y, **fit_params)

            elif is_transform:
                # transform method
                Xres_j = sub_model.transform(Xsubset_j)
                all_Xres.append(Xres_j)

        if is_fit:
            self._columns_informations = {
                "all_output_columns": None
                if all_Xres is None
                else [getattr(Xres, "columns", None) for Xres in all_Xres],
                "all_output_shape": None if all_Xres is None else [getattr(Xres, "shape", None) for Xres in all_Xres],
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = list(
                self.try_to_find_feature_names_separate(
                    all_output_columns=self._columns_informations["all_output_columns"],
                    all_output_shape=self._columns_informations["all_output_shape"],
                    input_columns=self._columns_informations["input_columns"],
                    input_shape=self._columns_informations["input_shape"],
                )
            )
            # self.kept_features_names = None # for now

        if is_transform:
            # Stack the results of all the columns, in the desired type, with the index of the input
            Xres = dsh.generic_hstack(all_Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    if is_transform:
        if self._feature_names_for_transform is not None:
            ### NOTE (original author, translated from French) : this doesn't work in transform !!!
            Xres = dsh._set_columns(Xres, self._feature_names_for_transform)

    if is_transform:
        return Xres
    else:
        return self
def fit(self, X, y=None, **fit_params):
    """Fit a random block-wise PCA rotation of the (standardized) features.

    A random number of groups of features is drawn; for each group a PCA is
    fitted on the corresponding standardized columns (optionally on a random
    subsample of the rows) and its components are written in the matching
    block of a (n_features, n_features) matrix. The columns of that matrix
    which are entirely null are dropped and the result is stored in
    ``self.components_``.

    All the random draws use the generator built from ``self.random_state``.

    Parameters
    ----------
    X : array like, 2 dimensions
        training data

    y : ignored

    **fit_params : ignored

    Returns
    -------
    self
    """
    # * save type
    self._input_type = get_type(X)
    self._nb_cols = X.shape[1]
    NF = X.shape[1]

    # * scale features (no centering for sparse inputs)
    self._scaler = StandardScaler(
        with_mean=self._input_type not in (DataTypes.SparseArray, DataTypes.SparseDataFrame)
    )
    Xz = self._scaler.fit_transform(X)

    # * random generator
    random_state = check_random_state(self.random_state)

    # Number of splits : a float below 1 is a fraction of the number of features
    if self.max_nb_groups < 1:
        high = int(NF * self.max_nb_groups)
    else:
        high = min(int(self.max_nb_groups), NF - 1)
    # the upper bound of 'randint' is exclusive : it must be at least 2 to be able
    # to draw 1 group, otherwise 'randint' raises (few features or small fraction)
    high = max(high, 2)

    self._nb_of_groups = random_state.randint(low=1, high=high)

    # all splits
    if self.max_group_size < 1:
        high_f = max(int(NF * self.max_group_size), 5)
    else:
        high_f = min(int(self.max_group_size), NF)
    high_f = max(high_f, 2)  # same reason as above

    # FK[k, j] == 1 <=> feature j belongs to group k
    FK = np.zeros((self._nb_of_groups, NF))
    for k in range(self._nb_of_groups):
        num_features = random_state.randint(1, high_f)
        # use the seeded generator (and not the global numpy one) to stay reproducible
        rp = random_state.permutation(NF)
        FK[k, rp[0:num_features]] = 1

    components_ = np.zeros((NF, NF), dtype=Xz.dtype)
    n_samples = Xz.shape[0]

    for k in range(self._nb_of_groups):
        pos = np.nonzero(FK[k, :])[0]
        Xzk = Xz[:, pos]

        # TODO : subsample of class
        pca = PCA(n_components=len(pos), whiten=False, copy=True, random_state=self.random_state)

        if self.bootstrap:
            while True:
                # Each row is kept with probability 0.63 : probability that an index
                # is in a bootstrap sample (limit N -> inf).
                # The draw has to be uniform on [0, 1) ('rand') : with a normal
                # draw ('randn') the probability of being kept would be ~0.74
                ii_to_keep = random_state.rand(n_samples) <= 0.63
                index_to_keep = np.where(ii_to_keep)[0]
                if len(index_to_keep) > 0:
                    # To prevent the (very unlikely) case where nothing is selected
                    break

            Xzk_bootstrap = Xzk[index_to_keep, :]
        else:
            Xzk_bootstrap = Xzk

        pca.fit(Xzk_bootstrap)

        rot = pca.components_.T
        assert rot.shape[0] == len(pos)

        if rot.shape[1] < len(pos):
            # fewer components than features in the group : pad with null columns
            rot = np.hstack((rot, np.zeros((rot.shape[0], len(pos) - rot.shape[1]), dtype=rot.dtype)))

        assert rot.shape[0] == rot.shape[1]
        assert rot.shape[0] == len(pos)

        # write the rotation of the group in its (pos x pos) block
        components_[pos.reshape(len(pos), 1), pos.reshape(1, len(pos))] = rot

    # drop the output columns that are entirely null
    features_to_keep = np.any(components_ != 0, axis=0)

    self.components_ = components_[:, features_to_keep].astype(Xz.dtype)
    self._feature_names = ["RPCA_%d" % i for i in range(self.components_.shape[1])]

    return self