def predict(self, X):
    """Predict the target in its original (decoded) representation.

    The wrapped regressor predicts continuous values; these are turned into
    integer codes and mapped back through the fitted target encoder.

    Parameters
    ----------
    X : array-like
        the features, forwarded as-is to ``self.regressor.predict``

    Returns
    -------
    array
        the decoded predictions cast to the dtype recorded in
        ``self._target_dtype``; reduced to its first column when
        ``self._mono_target`` is True
    """
    raw_prediction = self.regressor.predict(X)

    # Shift by 0.5 and let the integer cast drop the fractional part
    integer_codes = (raw_prediction + 0.5).astype(np.int32)

    decoded = self._target_encoder.inverse_transform(make2dimensions(integer_codes))
    result = decoded[:, 0] if self._mono_target else decoded
    return result.astype(self._target_dtype)
def _prepare_target(self, y, klass, conversion_type):
    """Encode the target so that it can be given to the underlying model.

    Parameters
    ----------
    y : array
        the original target

    klass : type
        the encoder class to use for the target; it is instantiated with
        ``categories=...`` and ``dtype=np.int32``

    conversion_type : DataType
        the type the 2-dimensional target is converted to before encoding

    Set
    ---
    self._mono_target : bool
        whether the original problem has a single target (``y.ndim == 1``)

    self._target_dtype : dtype
        the dtype of the original target

    self._target_encoder : instance of ``klass``
        the encoder fitted on the target

    Returns
    -------
    y_encoded : array
        the encoded target

    Raises
    ------
    TypeError
        if the problem is multi-target and ``self.classes`` is neither
        "auto" nor a list
    """
    self._mono_target = y.ndim == 1
    self._target_dtype = y.dtype

    if isinstance(self.classes, str) and self.classes == "auto":
        categories = "auto"
    elif self._mono_target:
        # the encoder expects one list of categories per column
        categories = [self.classes]
    else:
        if not isinstance(self.classes, list):
            raise TypeError(
                "For multi-target classes should be a list, instead I got %s" % str(type(self.classes)))
        categories = self.classes

    self._target_encoder = klass(categories=categories, dtype=np.int32)

    yd2 = convert_generic(make2dimensions(y), output_type=conversion_type)

    if conversion_type == DataTypes.NumpyArray and yd2.dtype.kind == 'U':
        # Unicode arrays are handed to the encoder as object arrays.
        # The builtin ``object`` is used because the ``np.object`` alias
        # was removed from NumPy (it was only ever an alias of ``object``).
        yd2 = yd2.astype(object, copy=False)

    y_encoded = self._target_encoder.fit_transform(yd2)
    return y_encoded
def fit(self, X, y):
    """Fit the classifier on clusters of the target.

    The target is clustered, the classifier is fitted to predict the cluster
    of each row, and one matrix per cluster dimension is stored in
    ``self._all_y_mean_matrix``, built from the mappings returned by
    ``self._compute_y_mean`` and ordered like the classifier classes.

    Parameters
    ----------
    X : array-like
        the features, forwarded to ``self.classifier.fit``

    y : array
        the target to cluster

    Returns
    -------
    self

    Raises
    ------
    TypeError
        if ``self.classifier`` is not a classifier
    ValueError
        if the target has one dimension but the clusterer returns several
        columns
    """
    self._mono_target = y.ndim == 1

    y_clusterer = self.y_clusterer
    if y_clusterer is None:
        y_clusterer = self.get_default_y_cluster(y)
    # TODO : check that it is a clusterer

    if not is_classifier(self.classifier):
        raise TypeError("classifier should be a classifer")

    target_2d = make2dimensions(y)

    if hasattr(y_clusterer, "fit_predict"):
        clusters = y_clusterer.fit_predict(target_2d)
    else:
        clusters = y_clusterer.fit_transform(target_2d).astype('int32')

    if clusters.ndim == 1:
        clusters = clusters[:, np.newaxis]

    nb_cluster_columns = clusters.shape[1]
    if self._mono_target and nb_cluster_columns > 1:
        raise ValueError(
            "The cluster should return only 1 dimensional clusters")
    self._mono_cluster = nb_cluster_columns == 1

    # fit classifier on result of cluster
    self.classifier.fit(X, clusters)

    # with a single cluster column, wrap classes_ in a list so both cases
    # can be zipped with the mappings the same way
    if self._mono_cluster:
        classes_per_column = [self.classifier.classes_]
    else:
        classes_per_column = self.classifier.classes_

    mean_mappings = self._compute_y_mean(target_2d, clusters)

    self._all_y_mean_matrix = [
        np.concatenate([mapping[cl] for cl in column_classes], axis=0)
        for column_classes, mapping in zip(classes_per_column, mean_mappings)
    ]

    return self
def test_make2dimensions():
    """Check make2dimensions on DataFrame, Series and numpy inputs."""
    frame = pd.DataFrame({"a": np.arange(10), "b": ["aa", "bb", "cc"] * 3 + ["dd"]})

    # a DataFrame comes back as the very same object
    result = make2dimensions(frame)
    assert result is frame
    assert result.shape == (10, 2)

    # one-dimensional inputs gain a second axis, 2-d inputs keep their shape
    assert make2dimensions(frame["a"]).shape == (10, 1)
    assert make2dimensions(frame.values).shape == (10, 2)
    assert make2dimensions(frame["a"].values).shape == (10, 1)

    # three-dimensional input is rejected
    cube = np.zeros((10, 2, 2))
    with pytest.raises(ValueError):
        make2dimensions(cube)
def _fit_transform(self, X, y, is_fit, is_transform, fit_params=None):
    """Internal method that handles fit, transform and fit_transform.

    The input is first restricted to the selected columns, checked against
    what was seen at fit time (when not fitting), optionally converted to an
    accepted input type, and then passed to the underlying model(s): either
    one model on all columns at once, or one model per column.

    Parameters
    ----------
    X : array-like
        the input data; if it has a ``shape`` it must have at least one row

    y : array-like or None
        the target, forwarded to ``_get_default_columns_to_use``,
        ``_get_model`` and the models' ``fit`` / ``fit_transform``

    is_fit : bool
        if True the selector and the model(s) are (re)created and fitted

    is_transform : bool
        if True the transformed data is computed and returned

    fit_params : dict or None
        extra keyword arguments forwarded to ``fit`` / ``fit_transform`` of
        the underlying model(s); None is treated as an empty dict

    Returns
    -------
    the transformed data if ``is_transform`` is True, otherwise ``self``

    Raises
    ------
    ValueError
        if X has 0 rows, or (when not fitting) if the type, the number of
        columns or the names of the columns differ from those seen at fit
    NotImplementedError
        if ``self.selector`` is None
    """
    if fit_params is None:
        fit_params = {}

    if is_fit:
        # Build the column selector, either from the default columns or from
        # the columns explicitly requested
        if isinstance(self.columns_to_use, str) and self.columns_to_use == "auto":
            columns = self._get_default_columns_to_use(X, y)
            self.selector = ColumnsSelector(columns_to_use=columns)
        else:
            self.selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

    if hasattr(X, "shape"):
        if X.shape[0] == 0:
            raise ValueError("the X object has 0 rows")

    Xindex = dsh._get_index(X)  # if X has an index retrieve it

    # if self.columns_to_use is not None:
    if is_fit:
        Xsubset = self.selector.fit_transform(X)
    else:
        Xsubset = self.selector.transform(X)

    # TODO (maybe): here allow a preprocessing pipeline
    # if self.has_preprocessing:
    #     if is_fit:
    #         self.preprocessing = self._get_preprocessing()
    #         Xsubset = self.preprocessing.fit_transform(Xsubset)
    #     else:
    #         Xsubset = self.preprocessing.transform(Xsubset)

    # Store columns and shape BEFORE any modification
    if self.selector is not None:
        Xsubset_columns = self.selector.get_feature_names()
    else:
        raise NotImplementedError("should not go there anymore")
        # Xsubset_columns = getattr(Xsubset, "columns", None)

    Xsubset_shape = getattr(Xsubset, "shape", None)

    # TODO (translated from French; the original note is truncated): here,
    # use in one way or another what is discussed in
    # https://github.com/scikit-learn/scikit-learn/issues/6425

    if is_fit:
        # Remember what the input looks like, to validate later transforms
        self._expected_type = dsh.get_type(Xsubset)
        self._expected_nbcols = dsh._nbcols(Xsubset)
        self._expected_columns = dsh._get_columns(Xsubset)
    else:
        # Validate the input against what was recorded at fit time
        Xtype = dsh.get_type(Xsubset)
        if Xtype != self._expected_type:
            raise ValueError(
                "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
            )

        nbcols = dsh._nbcols(Xsubset)
        if nbcols != self._expected_nbcols:
            raise ValueError(
                "I don't have the correct nb of colmns as input, expected : %d, got : %d"
                % (self._expected_nbcols, nbcols)
            )

        columns = dsh._get_columns(Xsubset)
        expected_columns = getattr(self, "_expected_columns", None)  # to allow pickle compatibility

        # Column names are only compared when both sides actually have names
        if expected_columns is not None and columns is not None and columns != self._expected_columns:
            raise ValueError("I don't have the correct names of columns")

    # Convert to the first accepted input type if the current one is not accepted
    if self.accepted_input_types is not None and self._expected_type not in self.accepted_input_types:
        Xsubset = dsh.convert_generic(
            Xsubset, mapped_type=self._expected_type, output_type=self.accepted_input_types[0]
        )

    if is_fit:
        self._verif_params()

        # Flag the case where the selected data has a second dimension of size 0
        self._empty_data = False
        s = getattr(Xsubset, "shape", None)
        if s is not None and len(s) > 1 and s[1] == 0:
            self._empty_data = True

    if self.all_columns_at_once or self._empty_data:

        if is_fit:
            self._model = self._get_model(Xsubset, y)

        ##############################################
        ### Apply the model on ALL columns at ONCE ###
        ##############################################

        if self.work_on_one_column_only:
            Xsubset = dsh.make1dimension(Xsubset)  # will generate an error if 2 dimensions
        else:
            Xsubset = dsh.make2dimensions(Xsubset)

        # Call to underlying model
        Xres = None
        if is_fit and is_transform:
            ##############################
            ###  fit_transform method  ###
            ##############################
            # test if the data to transform actually has some columns
            if not self._empty_data:
                # normal case
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                # It means there are no columns to transform
                Xres = Xsubset  # don't do anything

        elif is_fit and not is_transform:
            ####################
            ###  fit method  ###
            ####################
            if self.must_transform_to_get_features_name:
                # the transformed result is needed below to find the feature names
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                self._model.fit(Xsubset, y, **fit_params)

        else:
            ####################
            ###  transform   ###
            ####################
            if not self._empty_data:
                Xres = self._model.transform(Xsubset)
            else:
                Xres = Xsubset

        if is_fit:
            self._columns_informations = {
                "output_columns": getattr(Xres, "columns", None),  # names of transformed columns if exist
                "output_shape": getattr(Xres, "shape", None),  # shape of transformed result if exist
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = self.try_to_find_feature_names_all_at_once(
                output_columns=self._columns_informations["output_columns"],
                output_shape=self._columns_informations["output_shape"],
                input_columns=self._columns_informations["input_columns"],
                input_shape=self._columns_informations["input_shape"],
            )

            # self.kept_features_names = None  # for now

        if is_transform:
            Xres = dsh.convert_generic(Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    else:
        ########################################
        ### Apply the model COLUMN BY COLUMN ###
        ########################################
        if is_fit:
            self._models = []

        # Per-column results are only collected when they are needed: to be
        # returned, or to find the feature names
        if is_transform or self.must_transform_to_get_features_name:
            all_Xres = []
        else:
            all_Xres = None

        Xsubset = dsh.make2dimensions(Xsubset)

        for j in range(self._expected_nbcols):

            # Extract column j, positionally
            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame, DataTypes.Serie):
                Xsubset_j = Xsubset.iloc[:, j]
            else:
                Xsubset_j = Xsubset[:, j]

            if is_fit:
                # NOTE(review): the model is created from the whole Xsubset,
                # not from Xsubset_j — confirm this is intended
                sub_model = self._get_model(Xsubset, y)
                self._models.append(sub_model)
            else:
                sub_model = self._models[j]

            if not self.work_on_one_column_only:
                Xsubset_j = dsh.make2dimensions(Xsubset_j)

            if is_fit and is_transform:
                # fit_transform method
                Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                all_Xres.append(Xres_j)

            elif is_fit and not is_transform:
                # fit method
                if self.must_transform_to_get_features_name:
                    Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                    all_Xres.append(Xres_j)
                else:
                    sub_model.fit(Xsubset_j, y, **fit_params)

            elif is_transform:
                # transform method
                Xres_j = sub_model.transform(Xsubset_j)
                all_Xres.append(Xres_j)

        if is_fit:
            self._columns_informations = {
                "all_output_columns": None
                if all_Xres is None
                else [getattr(Xres, "columns", None) for Xres in all_Xres],
                "all_output_shape": None
                if all_Xres is None
                else [getattr(Xres, "shape", None) for Xres in all_Xres],
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = list(
                self.try_to_find_feature_names_separate(
                    all_output_columns=self._columns_informations["all_output_columns"],
                    all_output_shape=self._columns_informations["all_output_shape"],
                    input_columns=self._columns_informations["input_columns"],
                    input_shape=self._columns_informations["input_shape"],
                )
            )

            # self.kept_features_names = None  # for now

        if is_transform:
            # Stack the per-column results side by side
            Xres = dsh.generic_hstack(all_Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    if is_transform:
        if self._feature_names_for_transform is not None:
            # NOTE (translated from French): "THIS does not work in transform !!!"
            Xres = dsh._set_columns(Xres, self._feature_names_for_transform)

    if is_transform:
        return Xres
    else:
        return self