def test__nbcols():
    """Check ``_nbcols`` on a DataFrame, a Series and their underlying arrays."""
    labels = ["aa", "bb", "cc"] * 3 + ["dd"]
    frame = pd.DataFrame({"a": np.arange(10), "b": labels})

    # (object, expected number of columns)
    cases = [
        (frame, 2),
        (frame.values, 2),
        (frame["a"], 1),
        (frame["a"].values, 1),
    ]
    for obj, expected in cases:
        assert _nbcols(obj) == expected
def fit_transform(self, X, y=None, **fit_params):
    """Record debugging information about ``X`` and return it (optionally renamed).

    Parameters
    ----------
    X : object
        Input data. When it exposes a ``columns`` attribute and
        ``self.column_prefix`` is set, a copy is returned whose columns are
        prefixed with ``"<column_prefix>_"``; otherwise ``X`` itself is returned.
    y : object, optional
        Ignored by this method.
    **fit_params
        Stored unchanged on ``self.fit_params`` (kept only to help tests).

    Returns
    -------
    object
        ``X`` itself, or a renamed copy of it.

    Side effects
    ------------
    Sets ``self.fit_params`` and ``self._features``; when ``self.debug`` is
    true, also sets ``self._expected_type``, ``self._expected_nbcols`` and,
    for DataFrame-like inputs, ``self._expected_columns``.
    """
    if self.verbose:
        # Message aligned with the one printed by ``fit`` (typos fixed).
        print("within 'DebugPassThrough' fit_transform named %s" % self.name)
        if fit_params:
            print("fit_params given")
            print(fit_params)

    if self.debug:
        self._expected_type = dsh.get_type(X)
        self._expected_nbcols = dsh._nbcols(X)
        if self._expected_type in (dsh.DataTypes.DataFrame, dsh.DataTypes.SparseDataFrame):
            self._expected_columns = list(X.columns)

    self.fit_params = fit_params  # stored, just to help test

    Xres = X
    # Only rename when there are column labels to rename: inputs without a
    # ``columns`` attribute (e.g. numpy arrays) are passed through untouched,
    # which mirrors how ``fit`` treats them (``_features`` ends up ``None``).
    if self.column_prefix is not None and hasattr(X, "columns"):
        Xres = X.copy()  # never rename the caller's object in place
        Xres.columns = [self.column_prefix + "_" + c for c in Xres.columns]

    self._features = getattr(Xres, "columns", None)
    if self._features is not None:
        self._features = list(self._features)

    return Xres
def fit(self, X, y=None, **fit_params):
    """Record debugging information about ``X`` and the resulting feature names.

    ``fit_params`` are stored unchanged on ``self.fit_params``. ``self._features``
    receives the column labels of ``X`` (prefixed with ``"<column_prefix>_"``
    when a prefix is configured) or ``None`` when ``X`` has no ``columns``
    attribute. Returns ``self``.
    """
    if self.verbose:
        print("within 'DebugPassThrough' fit named %s" % self.name)
        if fit_params:
            print("fit_params given")
            print(fit_params)

    if self.debug:
        self._expected_type = dsh.get_type(X)
        self._expected_nbcols = dsh._nbcols(X)
        frame_types = (dsh.DataTypes.DataFrame, dsh.DataTypes.SparseDataFrame)
        if self._expected_type in frame_types:
            self._expected_columns = list(X.columns)

    # Kept around only so that tests can inspect what was received.
    self.fit_params = fit_params

    prefix = self.column_prefix
    if prefix is not None:
        if hasattr(X, "columns"):
            self._features = [prefix + "_" + label for label in X.columns]
        else:
            self._features = None
        return self

    labels = getattr(X, "columns", None)
    self._features = None if labels is None else list(labels)
    return self
def transform(self, X):
    """Return the columns of ``X`` that were selected at fit time.

    ``X`` must have the type and the number of columns seen during ``fit``;
    a ``ValueError`` is raised otherwise, or when a selected column is not
    present in ``X``. DataFrame-like inputs are indexed with ``iloc`` (integer
    selection) or ``loc`` (label selection); any other input only supports
    integer selection.
    """
    self._check_is_fitted()

    observed_type = dsh.get_type(X)
    observed_nbcols = dsh._nbcols(X)

    if self._expected_type != observed_type:
        raise ValueError(
            "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, observed_type)
        )
    if self._expected_nbcols != observed_nbcols:
        raise ValueError(
            "I don't have the correct number of columns, expected : %d, got : %d"
            % (self._expected_nbcols, observed_nbcols)
        )

    is_frame = self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame)

    if not self._columns_to_use_is_integer:
        # Label-based selection only makes sense for DataFrame-like inputs.
        if not is_frame:
            raise ValueError("columns_to_use must be integers when type if array or sparseArray")
        known_labels = set(X.columns)
        for label in self._final_columns_to_use:
            if label not in known_labels:
                raise ValueError("Column %s isn't in the column of the DataFrame" % label)
        return X.loc[:, self._final_columns_to_use]

    # Position-based selection: same validation for frames and arrays.
    valid_positions = set(range(X.shape[1]))
    for position in self._final_columns_to_use:
        if position not in valid_positions:
            raise ValueError("Column %d isn't in the column of the DataFrame" % position)

    if is_frame:
        return X.iloc[:, self._final_columns_to_use]
    return X[:, self._final_columns_to_use]
def transform(self, X):
    """Return the columns of ``X`` that were selected at fit time.

    The type of ``X`` must match the type seen during ``fit``; the number of
    columns is only enforced when ``self.raise_if_shape_differs`` is true.
    When the selector was fitted to keep everything, ``X`` is returned as is
    (no copy). ``ValueError`` is raised on a type/shape mismatch, on a missing
    column, or on a label-based selection applied to a non DataFrame input.
    """
    self._check_is_fitted()

    observed_type = dsh.get_type(X)
    observed_nbcols = dsh._nbcols(X)

    if self._expected_type != observed_type:
        raise ValueError(
            "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, observed_type)
        )

    # TODO : remove that check in some cases
    if self.raise_if_shape_differs and self._expected_nbcols != observed_nbcols:
        raise ValueError(
            "I don't have the correct number of columns, expected : %d, got : %d"
            % (self._expected_nbcols, observed_nbcols)
        )

    if self._return_data_as_inputed:
        # Nothing to select: hand back the very same object, so no copy is made.
        return X

    is_frame = self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame)

    if not self._columns_to_use_is_integer:
        if not is_frame:
            raise ValueError("columns_to_use must be integers when type if array or sparseArray")
        known_labels = set(X.columns)
        for label in self._final_columns_to_use:
            if label not in known_labels:
                raise ValueError("Column %s isn't in the column of the DataFrame" % label)
        return X.loc[:, self._final_columns_to_use]

    valid_positions = set(range(X.shape[1]))
    for position in self._final_columns_to_use:
        if position not in valid_positions:
            raise ValueError("Column %d isn't in the column of the DataFrame" % position)

    if is_frame:
        return X.iloc[:, self._final_columns_to_use]

    if isinstance(X, sps.coo_matrix):
        # COO matrices are not subscriptable: go through CSC and convert back.
        return X.tocsc()[:, self._final_columns_to_use].tocoo()
    return X[:, self._final_columns_to_use]
def _verif(self):
    """Validate ``self.all_datas`` and cache its overall dimensions.

    ``self.all_datas`` must be a list or a dict, otherwise ``TypeError`` is
    raised. Every contained object must report the same number of rows
    (``ValueError`` otherwise). On success sets ``self._is_dict``,
    ``self._nbrows`` (the common number of rows) and ``self._nbcols`` (the sum
    of the number of columns of every object).
    """
    datas = self.all_datas
    if not isinstance(datas, (list, dict)):
        raise TypeError("I don't know how to handle that type of Data : %s" % type(self.all_datas))

    keyed = hasattr(datas, "items")
    if keyed:
        pieces = [piece for _, piece in datas.items()]
    else:
        pieces = list(datas)

    row_counts = [_nbrows(piece) for piece in pieces]
    col_counts = [_nbcols(piece) for piece in pieces]
    self._is_dict = keyed

    distinct_row_counts = set(row_counts)
    if len(distinct_row_counts) > 1:
        raise ValueError("All objects don't have the same length")

    self._nbrows = row_counts[0]
    self._nbcols = sum(col_counts)
def _fit_transform(self, X, y, is_fit, is_transform, fit_params=None):
    """Internal method shared by fit, transform and fit_transform.

    Parameters
    ----------
    X : object
        Input data; the columns to work on are extracted with ``self.selector``.
    y : object
        Target, forwarded to ``_get_model`` and to the underlying model(s).
    is_fit : bool
        When true, the selector and the underlying model(s) are (re)created
        and fitted, and the expected input type/columns are recorded.
    is_transform : bool
        When true, the transformed data is computed and returned.
    fit_params : dict, optional
        Extra keyword arguments forwarded to the ``fit`` / ``fit_transform``
        of the underlying model(s). ``None`` is treated as an empty dict.

    Returns
    -------
    The transformed data when ``is_transform`` is true, otherwise ``self``.

    Raises
    ------
    ValueError
        If ``X`` has a ``shape`` with 0 rows, or, when not fitting, if the
        selected data does not have the type, number of columns or column
        names recorded at fit time.
    NotImplementedError
        If ``self.selector`` is ``None``.
    """
    if fit_params is None:
        fit_params = {}

    if is_fit:
        # Build the column selector: "auto" delegates the choice of columns
        # to ``_get_default_columns_to_use``.
        if isinstance(self.columns_to_use, str) and self.columns_to_use == "auto":
            columns = self._get_default_columns_to_use(X, y)
            self.selector = ColumnsSelector(columns_to_use=columns)
        else:
            self.selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

    if hasattr(X, "shape"):
        if X.shape[0] == 0:
            raise ValueError("the X object has 0 rows")

    Xindex = dsh._get_index(X)  # if X has an index retrieve it

    if is_fit:
        Xsubset = self.selector.fit_transform(X)
    else:
        Xsubset = self.selector.transform(X)

    # TODO (maybe): here allow a preprocessing pipeline applied to Xsubset
    # (fit_transform when fitting, transform otherwise).

    # Store columns and shape BEFORE any modification
    if self.selector is not None:
        Xsubset_columns = self.selector.get_feature_names()
    else:
        raise NotImplementedError("should not go there anymore")

    Xsubset_shape = getattr(Xsubset, "shape", None)
    # TODO: use, one way or another, what is discussed in
    # https://github.com/scikit-learn/scikit-learn/issues/6425

    if is_fit:
        # Remember what the selected data looks like, to validate later calls.
        self._expected_type = dsh.get_type(Xsubset)
        self._expected_nbcols = dsh._nbcols(Xsubset)
        self._expected_columns = dsh._get_columns(Xsubset)

    else:
        # Validate the selected data against what was recorded at fit time.
        Xtype = dsh.get_type(Xsubset)
        if Xtype != self._expected_type:
            raise ValueError(
                "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
            )

        nbcols = dsh._nbcols(Xsubset)
        if nbcols != self._expected_nbcols:
            raise ValueError(
                "I don't have the correct nb of colmns as input, expected : %d, got : %d"
                % (self._expected_nbcols, nbcols)
            )

        columns = dsh._get_columns(Xsubset)
        # getattr with a default: objects pickled before ``_expected_columns``
        # existed do not carry the attribute (pickle compatibility).
        expected_columns = getattr(self, "_expected_columns", None)
        if expected_columns is not None and columns is not None and columns != self._expected_columns:
            raise ValueError("I don't have the correct names of columns")

    # Convert the data to the first accepted type when its type is not accepted.
    if self.accepted_input_types is not None and self._expected_type not in self.accepted_input_types:
        Xsubset = dsh.convert_generic(
            Xsubset, mapped_type=self._expected_type, output_type=self.accepted_input_types[0]
        )

    if is_fit:
        self._verif_params()
        # ``_empty_data`` flags a selection with 0 columns (2-d shape whose
        # second dimension is 0); it is only computed when fitting.
        self._empty_data = False
        s = getattr(Xsubset, "shape", None)
        if s is not None and len(s) > 1 and s[1] == 0:
            self._empty_data = True

    if self.all_columns_at_once or self._empty_data:

        if is_fit:
            self._model = self._get_model(Xsubset, y)

        ##############################################
        ### Apply the model on ALL columns at ONCE ###
        ##############################################

        if self.work_on_one_column_only:
            Xsubset = dsh.make1dimension(Xsubset)  # will generate an error if 2 dimensions
        else:
            Xsubset = dsh.make2dimensions(Xsubset)

        # Call to underlying model
        Xres = None
        if is_fit and is_transform:
            ##############################
            ###  fit_transform method  ###
            ##############################
            # test if the data to transform actually has some columns
            if not self._empty_data:
                # normal case
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                # It means there is no columns to transform
                Xres = Xsubset  # don't do anything

        elif is_fit and not is_transform:
            ####################
            ###  fit method  ###
            ####################
            # Some models only expose their output feature names after a
            # transform, hence the fit_transform even though only fit was asked.
            if self.must_transform_to_get_features_name:
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                self._model.fit(Xsubset, y, **fit_params)
        else:
            ####################
            ###  transform   ###
            ####################
            if not self._empty_data:
                Xres = self._model.transform(Xsubset)
            else:
                Xres = Xsubset

        if is_fit:
            self._columns_informations = {
                "output_columns": getattr(Xres, "columns", None),  # names of transformed columns if exist
                "output_shape": getattr(Xres, "shape", None),  # shape of transformed result if exist
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = self.try_to_find_feature_names_all_at_once(
                output_columns=self._columns_informations["output_columns"],
                output_shape=self._columns_informations["output_shape"],
                input_columns=self._columns_informations["input_columns"],
                input_shape=self._columns_informations["input_shape"],
            )

        if is_transform:
            # Convert to the requested output type and restore the input index.
            Xres = dsh.convert_generic(Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    else:
        ########################################
        ### Apply the model COLUMN BY COLUMN ###
        ########################################
        if is_fit:
            self._models = []

        # Per-column results are only collected when they are needed: either
        # to be returned, or to discover the output feature names.
        if is_transform or self.must_transform_to_get_features_name:
            all_Xres = []
        else:
            all_Xres = None

        Xsubset = dsh.make2dimensions(Xsubset)

        for j in range(self._expected_nbcols):

            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame, DataTypes.Serie):
                Xsubset_j = Xsubset.iloc[:, j]
            else:
                Xsubset_j = Xsubset[:, j]

            if is_fit:
                # NOTE(review): the model is created from the whole Xsubset,
                # not from Xsubset_j -- confirm this is intended.
                sub_model = self._get_model(Xsubset, y)
                self._models.append(sub_model)
            else:
                sub_model = self._models[j]

            if not self.work_on_one_column_only:
                Xsubset_j = dsh.make2dimensions(Xsubset_j)

            if is_fit and is_transform:
                # fit_transform method
                Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                all_Xres.append(Xres_j)

            elif is_fit and not is_transform:
                # fit method
                if self.must_transform_to_get_features_name:
                    Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                    all_Xres.append(Xres_j)
                else:
                    sub_model.fit(Xsubset_j, y, **fit_params)

            elif is_transform:
                # transform method
                Xres_j = sub_model.transform(Xsubset_j)
                all_Xres.append(Xres_j)

        if is_fit:

            self._columns_informations = {
                "all_output_columns": None
                if all_Xres is None
                else [getattr(Xres, "columns", None) for Xres in all_Xres],
                "all_output_shape": None if all_Xres is None else [getattr(Xres, "shape", None) for Xres in all_Xres],
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = list(
                self.try_to_find_feature_names_separate(
                    all_output_columns=self._columns_informations["all_output_columns"],
                    all_output_shape=self._columns_informations["all_output_shape"],
                    input_columns=self._columns_informations["input_columns"],
                    input_shape=self._columns_informations["input_shape"],
                )
            )

        if is_transform:
            # Stack the per-column results side by side and restore the index.
            Xres = dsh.generic_hstack(all_Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    if is_transform:
        if self._feature_names_for_transform is not None:
            # NOTE (original author, translated): "this does not work in
            # transform" -- the exact failing scenario is not documented here.
            Xres = dsh._set_columns(Xres, self._feature_names_for_transform)

    if is_transform:
        return Xres
    else:
        return self
def fit(self, X, y=None):
    """Resolve which columns of ``X`` must be kept by ``transform``.

    The selection is described by ``self.columns_to_use`` and
    ``self.columns_to_drop`` (both resolved through ``_get_list_of_columns``)
    and, for string selections, optionally interpreted as regular expressions
    when ``self.regex_match`` is true.

    Parameters
    ----------
    X : object
        Data to fit on. DataFrame-like inputs accept integer or string
        selections; other inputs only accept integer selections.
    y : object, optional
        Ignored.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``columns_to_use`` and ``columns_to_drop`` mix integers and strings,
        if ``regex_match`` is used with an integer selection, if a requested
        column is not in ``X``, or if a string selection is used on a non
        DataFrame input.

    Side effects
    ------------
    Sets ``_expected_type``, ``_expected_nbcols``, ``_return_data_as_inputed``,
    ``_columns_to_use_is_integer``, ``_final_columns_to_use``, ``_Xcolumns``
    and ``_already_fitted``.
    """
    self._expected_type = dsh.get_type(X)
    self._expected_nbcols = dsh._nbcols(X)

    ######################################
    ### Special case : keep everything ###
    ######################################
    self._return_data_as_inputed = False
    if isinstance(self.columns_to_use, str) and self.columns_to_use == "all" and self.columns_to_drop is None:
        self._already_fitted = True
        self._columns_to_use_is_integer = True
        # Column positions must be derived from the number of COLUMNS; the
        # previous ``X.shape[0]`` was the number of rows.
        self._final_columns_to_use = list(range(self._expected_nbcols))
        self._return_data_as_inputed = True

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))
        # NOTE(review): there is no early return here, so the general
        # resolution below still runs and overwrites the selection attributes
        # (``_return_data_as_inputed`` stays True) -- confirm this is intended.

    ### Columns to use ###
    list_columns_to_use = self._get_list_of_columns(columns=self.columns_to_use, X=X, regex_match=self.regex_match)
    list_columns_to_drop = self._get_list_of_columns(
        columns=self.columns_to_drop, X=X, regex_match=self.regex_match
    )

    #################################
    ### Special case : no columns ###
    #################################
    if list_columns_to_use is not None and len(list_columns_to_use) == 0:
        # This means that there is nothing to do : no columns will be kept
        self._already_fitted = True
        self._columns_to_use_is_integer = True
        self._final_columns_to_use = []

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))

        return self

    ### What is the type of columns_to_use and columns_to_drop :
    # The kind of selection (integer vs string) is inferred from the type name
    # of the first element, which also matches numpy integer types.
    if list_columns_to_use is not None:
        is_int = "int" in str(type(list_columns_to_use[0]))
    else:
        is_int = None

    if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
        is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
    else:
        is_int_to_drop = is_int

    ### Verify type:
    if is_int is not None and is_int_to_drop is not None:
        if is_int != is_int_to_drop:
            raise ValueError(
                "Please be consistent between 'columns_to_use' and 'columns_to_drop', both can be integer or str, but they should have the same type"
            )

    # When nothing allows to infer the kind of selection, default to integers.
    if is_int is None and is_int_to_drop is None:
        is_int = True
        is_int_to_drop = True

    if is_int is None and is_int_to_drop is not None:
        is_int = is_int_to_drop

    if is_int_to_drop is None and is_int is not None:
        is_int_to_drop = is_int

    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        if is_int:
            ##############################################
            ### Case 1 : DataFrame + Integer selection ###
            ##############################################
            if self.regex_match:
                #######################
                ## Case 1a : + Regex ##
                #######################
                raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

            cols_set = set(range(self._expected_nbcols))
            if list_columns_to_use is not None:
                # Check all column are available
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = list_columns_to_use
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)
            else:
                # NOTE(review): a ``None`` drop list empties the selection;
                # kept as in the original -- confirm what
                # ``_get_list_of_columns`` returns when nothing is dropped.
                final_columns_to_use = []

        else:
            #############################################
            ### Case 2 : DataFrame + String selection ###
            #############################################
            if self.regex_match:
                #######################
                ## Case 2a : + Regex ##
                #######################
                if list_columns_to_use is not None:
                    # A column is kept as soon as one pattern matches it.
                    cols_that_match = []
                    for col in list(X.columns):
                        for r in list_columns_to_use:
                            if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                cols_that_match.append(col)
                                break

                if list_columns_to_drop is not None:
                    cols_that_match_drop = []
                    for col in list(X.columns):
                        for r in list_columns_to_drop:
                            if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                cols_that_match_drop.append(col)
                                break

                if list_columns_to_use is not None:
                    final_columns_to_use = cols_that_match
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

            else:
                ########################
                ## Case 2b : no Regex ##
                ########################
                cols_set = set(X.columns)

                if list_columns_to_use is not None:
                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = list_columns_to_use
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)
                else:
                    # NOTE(review): same remark as in Case 1 -- confirm.
                    final_columns_to_use = []

    else:
        if is_int or is_int is None:
            ##########################################
            ### Case 3 : Array + Integer selection ###
            ##########################################
            if self.regex_match:
                ########################
                ## Case 3a : + Regex ##
                ########################
                raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

            ########################
            ## Case 3b : no Regex ##
            ########################
            cols_set = set(range(self._expected_nbcols))
            if list_columns_to_use is not None:
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)
            else:
                # NOTE(review): same remark as in Case 1 -- confirm.
                final_columns_to_use = []

        else:
            #########################################
            ### Case 4 : Array + String selection ###
            #########################################
            raise ValueError("columns_to_use must be integers when type is array or sparseArray")

    self._columns_to_use_is_integer = is_int
    self._final_columns_to_use = final_columns_to_use

    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        self._Xcolumns = list(X.columns)
    else:
        self._Xcolumns = list(range(self._expected_nbcols))

    ## TODO : here make a simplification into a slice when it is possible
    self._already_fitted = True

    return self
def fit(self, X, y=None):
    """Resolve which columns of ``X`` must be kept by ``transform``.

    ``self.columns_to_use`` (``None`` meaning every column) and
    ``self.columns_to_drop`` (``None`` meaning nothing to drop) are converted
    to lists with ``convert_to_list``; string selections are interpreted as
    regular expressions when ``self.regex_match`` is true.

    Parameters
    ----------
    X : object
        Data to fit on. DataFrame-like inputs accept integer or string
        selections; other inputs only accept integer selections.
    y : object, optional
        Ignored.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``columns_to_use`` is empty, if ``columns_to_use`` and
        ``columns_to_drop`` mix integers and strings, if ``regex_match`` is
        used with an integer selection, if a requested column is not in ``X``,
        or if a string selection is used on a non DataFrame input.

    Side effects
    ------------
    Sets ``_expected_type``, ``_expected_nbcols``,
    ``_columns_to_use_is_integer``, ``_final_columns_to_use``, ``_Xcolumns``
    and ``_already_fitted``.
    """
    self._expected_type = dsh.get_type(X)
    self._expected_nbcols = dsh._nbcols(X)

    ### Columns to use ###
    if self.columns_to_use is None:
        list_columns_to_use = None  # None means "every column", resolved below
    else:
        list_columns_to_use = self.convert_to_list(cols_list=self.columns_to_use)

    ### Columns to drop ###
    if self.columns_to_drop is None:
        list_columns_to_drop = None
    else:
        list_columns_to_drop = self.convert_to_list(cols_list=self.columns_to_drop)

    if list_columns_to_use is not None and len(list_columns_to_use) == 0:
        raise ValueError("columns_to_use is empty")

    ### What is the type of columns_to_use and columns_to_drop :
    # The kind of selection (integer vs string) is inferred from the type name
    # of the first element, which also matches numpy integer types.
    if list_columns_to_use is not None:
        is_int = "int" in str(type(list_columns_to_use[0]))
    else:
        is_int = None

    if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
        is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
    else:
        is_int_to_drop = is_int

    ### Verify type:
    if is_int is not None and is_int_to_drop is not None:
        if is_int != is_int_to_drop:
            raise ValueError(
                "Please be consistent between columns_to_use and columns_to_drop, both can be integer or str, but they should have the same type"
            )

    # When nothing allows to infer the kind of selection, default to integers.
    if is_int is None and is_int_to_drop is None:
        is_int = True
        is_int_to_drop = True

    if is_int is None and is_int_to_drop is not None:
        is_int = is_int_to_drop

    if is_int_to_drop is None and is_int is not None:
        is_int_to_drop = is_int

    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        if is_int:
            ##############################################
            ### Case 1 : DataFrame + Integer selection ###
            ##############################################
            if self.regex_match:
                #######################
                ## Case 1a : + Regex ##
                #######################
                raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

            cols_set = set(range(self._expected_nbcols))
            if list_columns_to_use is not None:
                # Check all column are available
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                # The requested order is kept as given (no intersection).
                final_columns_to_use = list_columns_to_use
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

        else:
            #############################################
            ### Case 2 : DataFrame + String selection ###
            #############################################
            if self.regex_match:
                #######################
                ## Case 2a : + Regex ##
                #######################
                if list_columns_to_use is not None:
                    # A column is kept as soon as one pattern matches it.
                    cols_that_match = []
                    for col in list(X.columns):
                        for r in list_columns_to_use:
                            if re.search(r, col) is not None:
                                cols_that_match.append(col)
                                break

                if list_columns_to_drop is not None:
                    cols_that_match_drop = []
                    for col in list(X.columns):
                        for r in list_columns_to_drop:
                            if re.search(r, col) is not None:
                                cols_that_match_drop.append(col)
                                break

                if list_columns_to_use is not None:
                    # Matches come from X.columns, so no intersection is needed.
                    final_columns_to_use = cols_that_match
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

            else:
                ########################
                ## Case 2b : no Regex ##
                ########################
                cols_set = set(X.columns)

                if list_columns_to_use is not None:
                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = list_columns_to_use
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

    else:
        if is_int:
            ##########################################
            ### Case 3 : Array + Integer selection ###
            ##########################################
            if self.regex_match:
                ########################
                ## Case 3a : + Regex ##
                ########################
                raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

            ########################
            ## Case 3b : no Regex ##
            ########################
            cols_set = set(range(self._expected_nbcols))
            if list_columns_to_use is not None:
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                # NOTE(review): unlike Case 1, the selection goes through
                # ``intersect`` here; whether it preserves the requested order
                # depends on that helper -- confirm.
                final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

        else:
            #########################################
            ### Case 4 : Array + String selection ###
            #########################################
            raise ValueError("columns_to_use must be integers when type is array or sparseArray")

    self._columns_to_use_is_integer = is_int
    self._final_columns_to_use = final_columns_to_use

    # Remember the input column labels (positions for non DataFrame inputs).
    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        self._Xcolumns = list(X.columns)
    else:
        self._Xcolumns = list(range(self._expected_nbcols))

    ## TODO : here make a simplification into a slice when it is possible
    self._already_fitted = True

    return self