def test_make1dimension():
    """Check make1dimension on single-column inputs and on two-column inputs.

    Every single-column representation (DataFrame, Series, 2-d array, 1-d array)
    must come back with shape ``(10,)``; two-column inputs must raise ValueError.
    """
    one_col = pd.DataFrame({"a": np.arange(10)})
    single_column_inputs = (
        one_col,
        one_col["a"],
        one_col.values,
        one_col["a"].values,
    )
    for candidate in single_column_inputs:
        assert make1dimension(candidate).shape == (10,)

    two_cols = pd.DataFrame({"a": np.arange(10), "b": ["aa", "bb", "cc"] * 3 + ["dd"]})
    # Can't convert to one dimension when there are 2 columns
    for candidate in (two_cols, two_cols.values):
        with pytest.raises(ValueError):
            make1dimension(candidate)
def _fit_transform(self, X, y, is_fit, is_transform, fit_params=None):
    """Internal method that handles the fit and/or the transform.

    The same code path serves the three public operations, selected by flags:
    fit only (``is_fit``), transform only (``is_transform``) or both at once.

    Steps performed:

    1. select the columns to work on (the selector is created and fitted when
       ``is_fit`` is True, re-used otherwise);
    2. when fitting, record the type / number of columns / column names of the
       selected data; when only transforming, check the data against them;
    3. convert the data to an accepted input type if needed;
    4. apply the underlying model either on all columns at once, or with one
       model per column;
    5. when transforming, convert the result to ``self.desired_output_type``,
       restore the index of ``X`` and set the output feature names.

    Parameters
    ----------
    X : object
        Input data. If it exposes a ``shape`` attribute it must have at least
        one row.
    y : object
        Target, forwarded to ``self._get_model`` and to the model's
        ``fit`` / ``fit_transform``.
    is_fit : bool
        Whether the selector and the underlying model(s) must be fitted.
    is_transform : bool
        Whether the transformed data must be computed and returned.
    fit_params : dict or None
        Extra keyword arguments forwarded to the model's ``fit`` /
        ``fit_transform``. ``None`` is treated as an empty dict.

    Returns
    -------
    The transformed data when ``is_transform`` is True, ``self`` otherwise.

    Raises
    ------
    ValueError
        If ``X`` has 0 rows, or (when not fitting) if the type, the number of
        columns or the column names of the selected data differ from what was
        recorded during fit.
    NotImplementedError
        If ``self.selector`` is None.
    """
    if fit_params is None:
        fit_params = {}

    if is_fit:
        if isinstance(self.columns_to_use, str) and self.columns_to_use == "auto":
            columns = self._get_default_columns_to_use(X, y)
            self.selector = ColumnsSelector(columns_to_use=columns)
        else:
            self.selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

    if hasattr(X, "shape"):
        if X.shape[0] == 0:
            raise ValueError("the X object has 0 rows")

    Xindex = dsh._get_index(X)  # if X has an index retrieve it

    if is_fit:
        Xsubset = self.selector.fit_transform(X)
    else:
        Xsubset = self.selector.transform(X)
    # TODO (maybe): allow a preprocessing pipeline here
    # (fit_transform it when fitting, transform otherwise)

    # Store columns and shape BEFORE any modification
    if self.selector is not None:
        Xsubset_columns = self.selector.get_feature_names()
    else:
        raise NotImplementedError("should not go there anymore")

    Xsubset_shape = getattr(Xsubset, "shape", None)
    # TODO: use, one way or another, what is discussed in
    # https://github.com/scikit-learn/scikit-learn/issues/6425

    if is_fit:
        # Remember what the fitted data looked like, to validate later inputs
        self._expected_type = dsh.get_type(Xsubset)
        self._expected_nbcols = dsh._nbcols(Xsubset)
        self._expected_columns = dsh._get_columns(Xsubset)
    else:
        Xtype = dsh.get_type(Xsubset)
        if Xtype != self._expected_type:
            raise ValueError(
                "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
            )

        nbcols = dsh._nbcols(Xsubset)
        if nbcols != self._expected_nbcols:
            raise ValueError(
                "I don't have the correct nb of colmns as input, expected : %d, got : %d"
                % (self._expected_nbcols, nbcols)
            )

        columns = dsh._get_columns(Xsubset)
        # getattr with a default: objects pickled before '_expected_columns'
        # existed must still be usable
        expected_columns = getattr(self, "_expected_columns", None)
        if expected_columns is not None and columns is not None and columns != expected_columns:
            raise ValueError("I don't have the correct names of columns")

    if self.accepted_input_types is not None and self._expected_type not in self.accepted_input_types:
        Xsubset = dsh.convert_generic(
            Xsubset, mapped_type=self._expected_type, output_type=self.accepted_input_types[0]
        )

    if is_fit:
        self._verif_params()

        # Data is considered empty when it has a second dimension of size 0
        self._empty_data = False
        subset_shape = getattr(Xsubset, "shape", None)
        if subset_shape is not None and len(subset_shape) > 1 and subset_shape[1] == 0:
            self._empty_data = True

    if self.all_columns_at_once or self._empty_data:
        ##############################################
        ### Apply the model on ALL columns at ONCE ###
        ##############################################
        if is_fit:
            self._model = self._get_model(Xsubset, y)

        if self.work_on_one_column_only:
            Xsubset = dsh.make1dimension(Xsubset)  # will generate an error if 2 dimensions
        else:
            Xsubset = dsh.make2dimensions(Xsubset)

        # Call to underlying model
        Xres = None
        if is_fit and is_transform:
            ##############################
            ###  fit_transform method  ###
            ##############################
            if not self._empty_data:
                # normal case
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                # there is no column to transform: don't do anything
                Xres = Xsubset

        elif is_fit and not is_transform:
            ####################
            ###  fit method  ###
            ####################
            if self._empty_data:
                # Same policy as fit_transform / transform: the model is never
                # called on data without columns, so 'fit' succeeds whenever
                # 'fit_transform' does.
                if self.must_transform_to_get_features_name:
                    Xres = Xsubset
            elif self.must_transform_to_get_features_name:
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                self._model.fit(Xsubset, y, **fit_params)

        else:
            ####################
            ###  transform   ###
            ####################
            if not self._empty_data:
                Xres = self._model.transform(Xsubset)
            else:
                Xres = Xsubset

        if is_fit:
            self._columns_informations = {
                "output_columns": getattr(Xres, "columns", None),  # names of transformed columns if exist
                "output_shape": getattr(Xres, "shape", None),  # shape of transformed result if exist
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = self.try_to_find_feature_names_all_at_once(
                output_columns=self._columns_informations["output_columns"],
                output_shape=self._columns_informations["output_shape"],
                input_columns=self._columns_informations["input_columns"],
                input_shape=self._columns_informations["input_shape"],
            )

        if is_transform:
            Xres = dsh.convert_generic(Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    else:
        ########################################
        ### Apply the model COLUMN BY COLUMN ###
        ########################################
        if is_fit:
            self._models = []

        # Per-column results are only collected when they are needed: to build
        # the output, or to deduce the feature names
        if is_transform or self.must_transform_to_get_features_name:
            all_Xres = []
        else:
            all_Xres = None

        Xsubset = dsh.make2dimensions(Xsubset)

        for j in range(self._expected_nbcols):
            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame, DataTypes.Serie):
                Xsubset_j = Xsubset.iloc[:, j]
            else:
                Xsubset_j = Xsubset[:, j]

            if is_fit:
                # NOTE(review): the model factory receives the full Xsubset,
                # not the single column Xsubset_j -- confirm this is intended
                sub_model = self._get_model(Xsubset, y)
                self._models.append(sub_model)
            else:
                sub_model = self._models[j]

            if not self.work_on_one_column_only:
                Xsubset_j = dsh.make2dimensions(Xsubset_j)

            if is_fit and is_transform:
                # fit_transform method
                Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                all_Xres.append(Xres_j)

            elif is_fit and not is_transform:
                # fit method
                if self.must_transform_to_get_features_name:
                    Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                    all_Xres.append(Xres_j)
                else:
                    sub_model.fit(Xsubset_j, y, **fit_params)

            elif is_transform:
                # transform method
                Xres_j = sub_model.transform(Xsubset_j)
                all_Xres.append(Xres_j)

        if is_fit:
            self._columns_informations = {
                "all_output_columns": None
                if all_Xres is None
                else [getattr(Xres, "columns", None) for Xres in all_Xres],
                "all_output_shape": None
                if all_Xres is None
                else [getattr(Xres, "shape", None) for Xres in all_Xres],
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = list(
                self.try_to_find_feature_names_separate(
                    all_output_columns=self._columns_informations["all_output_columns"],
                    all_output_shape=self._columns_informations["all_output_shape"],
                    input_columns=self._columns_informations["input_columns"],
                    input_shape=self._columns_informations["input_shape"],
                )
            )

        if is_transform:
            Xres = dsh.generic_hstack(all_Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    if is_transform:
        if self._feature_names_for_transform is not None:
            # NOTE (translated from the original author's remark): "here, this
            # does not work in transform!" -- TODO confirm what still fails
            Xres = dsh._set_columns(Xres, self._feature_names_for_transform)

    if is_transform:
        return Xres
    else:
        return self