def test_diff():
    list1 = [1, 2, 3]
    list2 = [3, 4, 5]
    assert diff(list1, list2) == [1, 2]
    assert diff(list2, list1) == [4, 5]
    assert diff(list1, []) == list1

    list1 = ["a", "b", "c"]
    list2 = ["d", "c", "e"]
    assert diff(list1, list2) == ["a", "b"]
    assert diff(list2, list1) == ["d", "e"]
    assert diff(list1, []) == list1

    assert isinstance(diff((1, 2, 3), (1, 2)), tuple)
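# A minimal sketch of the order-preserving `diff` helper that the assertions
# above imply: keep the elements of list1 that are absent from list2,
# preserving order and container type. This is an illustration with a
# hypothetical name, not the library's actual implementation.
def _diff_sketch(list1, list2):
    to_remove = set(list2)
    result = [x for x in list1 if x not in to_remove]
    if isinstance(list1, tuple):
        return tuple(result)  # mirror the input container type
    return result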
def test_FeaturesSelectorClassifier_get_feature_names():
    vect = CountVectorizer(analyzer="char", ngram_range=(1, 3))
    df = get_sample_df(100, seed=123)
    xx = vect.fit_transform(df["text_col"])
    y = 1 * (np.random.rand(xx.shape[0]) > 0.5)

    sel = FeaturesSelectorClassifier(n_components=10)
    sel.fit_transform(xx, y)

    ff0 = vect.get_feature_names()
    ff1 = sel.get_feature_names()
    assert len(diff(ff1, list(range(xx.shape[1])))) == 0

    ff2 = sel.get_feature_names(input_features=ff0)
    assert len(ff1) == len(ff2)
    for f1, f2 in zip(ff1, ff2):
        assert ff0[f1] == f2
def test_intersect():
    list1 = [1, 2, 3]
    list2 = [3, 4, 5]
    assert intersect(list1, list2) == [3]
    assert intersect(list2, list1) == [3]

    list1 = [1, 2, 3, 4]
    list2 = [4, 3, 5, 6]
    assert intersect(list1, list2) == [3, 4]
    assert intersect(list2, list1) == [4, 3]
    assert intersect(list1, []) == []

    list1 = ["a", "b", "c"]
    list2 = ["d", "c", "e"]
    assert intersect(list1, list2) == ["c"]
    assert intersect(list2, list1) == ["c"]
    assert intersect(list1, []) == []

    # was calling `diff` here, most likely a copy-paste slip in a test about intersect
    assert isinstance(intersect((1, 2, 3), (1, 2)), tuple)
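# Companion sketch for `intersect`, matching the behaviour the test checks:
# elements of list1 that also appear in list2, in list1's order, with the same
# container type. Again an illustration with a hypothetical name, not the real
# helper.
def _intersect_sketch(list1, list2):
    keep = set(list2)
    result = [x for x in list1 if x in keep]
    if isinstance(list1, tuple):
        return tuple(result)
    return result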
def fit(self, X, y=None):
    self._expected_type = dsh.get_type(X)
    self._expected_nbcols = dsh._nbcols(X)

    ######################################
    ### Special case : keep everything ###
    ######################################
    self._return_data_as_inputed = False
    if isinstance(self.columns_to_use, str) and self.columns_to_use == "all" and self.columns_to_drop is None:
        self._already_fitted = True
        self._columns_to_use_is_integer = True
        self._final_columns_to_use = list(range(X.shape[1]))  # keep every column
        self._return_data_as_inputed = True

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))

        return self  # nothing more to fit, mirroring the 'no columns' special case below

    ### Columns to use ###
    list_columns_to_use = self._get_list_of_columns(columns=self.columns_to_use, X=X, regex_match=self.regex_match)
    list_columns_to_drop = self._get_list_of_columns(
        columns=self.columns_to_drop, X=X, regex_match=self.regex_match
    )

    #################################
    ### Special case : no columns ###
    #################################
    if list_columns_to_use is not None and len(list_columns_to_use) == 0:
        # This means that there is nothing to do: no columns will be kept
        self._already_fitted = True
        self._columns_to_use_is_integer = True
        self._final_columns_to_use = []

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))

        return self

    ### What is the type of columns_to_use and columns_to_drop ? ###
    if list_columns_to_use is not None:
        is_int = "int" in str(type(list_columns_to_use[0]))
    else:
        is_int = None

    if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
        is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
    else:
        is_int_to_drop = is_int

    ### Verify type ###
    if is_int is not None and is_int_to_drop is not None:
        if is_int != is_int_to_drop:
            raise ValueError(
                "Please be consistent between 'columns_to_use' and 'columns_to_drop': "
                "both can be integer or str, but they should have the same type"
            )

    if is_int is None and is_int_to_drop is None:
        is_int = True
        is_int_to_drop = True

    if is_int is None and is_int_to_drop is not None:
        is_int = is_int_to_drop

    if is_int_to_drop is None and is_int is not None:
        is_int_to_drop = is_int

    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):

        if is_int:
            ##############################################
            ### Case 1 : DataFrame + Integer selection ###
            ##############################################
            if self.regex_match:
                #######################
                ## Case 1a : + Regex ##
                #######################
                raise ValueError("regex_match can only work with strings 'columns_to_use', not int")

            cols_set = set(range(self._expected_nbcols))

            if list_columns_to_use is not None:
                # Check all columns are available
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)
                final_columns_to_use = list_columns_to_use
                # final_columns_to_use = intersect( list_columns_to_use , list(range(self._expected_nbcols)) )
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)
                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)
            elif list_columns_to_use is None:
                # neither columns_to_use nor columns_to_drop was given: no columns are kept
                final_columns_to_use = []

        else:
            #############################################
            ### Case 2 : DataFrame + String selection ###
            #############################################
            if self.regex_match:
                #######################
                ## Case 2a : + Regex ##
                #######################
                if list_columns_to_use is not None:
                    cols_that_match = []
                    for col in list(X.columns):
                        for r in list_columns_to_use:
                            if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                cols_that_match.append(col)
                                break

                if list_columns_to_drop is not None:
                    cols_that_match_drop = []
                    for col in list(X.columns):
                        for r in list_columns_to_drop:
                            if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                cols_that_match_drop.append(col)
                                break

                if list_columns_to_use is not None:
                    final_columns_to_use = cols_that_match
                    # final_columns_to_use = intersect(cols_that_match, list(X.columns))  # technically the intersect is useless
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

            else:
                ########################
                ## Case 2b : no Regex ##
                ########################
                cols_set = set(X.columns)

                if list_columns_to_use is not None:
                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)
                    final_columns_to_use = list_columns_to_use
                    # intersect(list_columns_to_use, list(X.columns))
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)
                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)
                elif list_columns_to_use is None:
                    # neither columns_to_use nor columns_to_drop was given: no columns are kept
                    final_columns_to_use = []

    else:
        if is_int or is_int is None:
            ##########################################
            ### Case 3 : Array + Integer selection ###
            ##########################################
            if self.regex_match:
                #######################
                ## Case 3a : + Regex ##
                #######################
                raise ValueError("regex_match can only work with strings 'columns_to_use', not int")

            ########################
            ## Case 3b : no Regex ##
            ########################
            cols_set = set(range(self._expected_nbcols))

            if list_columns_to_use is not None:
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)
                final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)
                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)
            elif list_columns_to_use is None:
                # neither columns_to_use nor columns_to_drop was given: no columns are kept
                final_columns_to_use = []

        else:
            #########################################
            ### Case 4 : Array + String selection ###
            #########################################
            raise ValueError("columns_to_use must be integers when the type is array or sparseArray")

    self._columns_to_use_is_integer = is_int
    self._final_columns_to_use = final_columns_to_use

    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        self._Xcolumns = list(X.columns)
    else:
        self._Xcolumns = list(range(self._expected_nbcols))

    ## TODO : here make a simplification into a slice when it is possible
    self._already_fitted = True

    return self
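# Usage sketch for the fit above (the DataFrame and the ColumnsSelector name
# are assumptions based on the surrounding code, not a documented API):
#
#   df = pd.DataFrame({"num_a": [1, 2], "num_b": [3, 4], "txt_c": ["x", "y"]})
#   selector = ColumnsSelector(columns_to_use=["^num_"], regex_match=True)
#   selector.fit(df)
#   # selector._final_columns_to_use -> ["num_a", "num_b"] (both match the regex)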
def fit_metric_model(self):
    logger.info("start computing metric model...")

    ### Load the results
    df_results = self.result_reader.load_all_results(aggregate=True)

    if len(df_results) <= self.min_nb_of_models:
        return self

    # Nothing new since the last fit: keep the current metric model.
    # (This must be checked before self._nb_models_done is updated below,
    # otherwise the comparison would always succeed.)
    if (self._nb_models_done is not None
            and len(df_results) == self._nb_models_done
            and self.params_training_columns is not None):
        return self

    ### Load the params
    df_params = self.result_reader.load_all_params()

    df_merged_result = pd.merge(df_params, df_results, how="inner", on="job_id")

    training_cols = diff(list(df_params.columns), ["job_id"])

    # X dataframe for parameters
    dfX_params = df_merged_result.loc[:, training_cols]

    ### Retrieve the target metric
    if self.avg_metrics:
        scorers = self.job_config.scoring
    else:
        scorers = [self.job_config.main_scorer]  # I'll use only the main_scorer

    N = dfX_params.shape[0]
    all_y_params = []
    for scorer in scorers:
        y_params = df_merged_result["test_%s" % scorer]  # retrieve the raw metric
        # Replace NaN by the scorer's observed minimum score;
        # if y_params contains only NaN this won't work.
        y_params = y_params.fillna(y_params.min()).values

        if self.metric_transformation is None:
            pass

        elif self.metric_transformation == "rank":
            ### Transform into a non-parametric rank ...
            y_params = kde_transfo_quantile(y_params)
            # => behaves like a uniform distribution

        elif self.metric_transformation == "normal":
            ### Transform into a non-parametric normal ...
            y_params = norm.ppf(kde_transfo_quantile(y_params))
            # => behaves like a normal distribution

        elif self.metric_transformation == "default":
            ### Transform using the default (log-like) transformation
            try:
                f = get_metric_default_transformation(scorer)
            except ValueError:
                logger.info(
                    "I don't know how to transform this metric %s, I'll use the default normal transformation"
                    % str(scorer)
                )
                f = None

            if f is None:
                y_params = norm.ppf(kde_transfo_quantile(y_params))
            else:
                y_params = f(y_params)

            if self.avg_metrics:
                # If I'm averaging I'd rather have something centered
                y_params = (y_params - np.mean(y_params)) / np.std(y_params)

        else:
            raise ValueError("I don't know this metric_transformation %s" % self.metric_transformation)

        all_y_params.append(y_params.reshape((N, 1)))

    if len(all_y_params) > 1:
        y_params = np.concatenate(all_y_params, axis=1).mean(axis=1)
    else:
        y_params = all_y_params[0].reshape((N,))

    # One could also use the default transformation here:
    # scorer = self.job_config.main_scorer
    # y_params = df_merged_result["test_%s" % scorer].values

    # Create the model
    transformer_model = GraphPipeline(
        models={"encoder": NumericalEncoder(), "imputer": NumImputer()},
        edges=[("encoder", "imputer")],
    )

    xx_params = transformer_model.fit_transform(dfX_params)

    random_forest = RandomForestRegressor(n_estimators=100, min_samples_leaf=5)
    random_forest.fit(xx_params, y_params)

    random_forest_variance = RandomForestVariance(random_forest)
    random_forest_variance.fit(xx_params, y_params)

    self.params_training_columns = training_cols
    self.transformer_model = transformer_model
    self.random_forest = random_forest
    self.random_forest_variance = random_forest_variance

    self._nb_models_done = len(df_results)

    logger.info("metric model fitted")

    return self
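# Sketch of the rank -> normal transformation used above, with
# scipy.stats.rankdata standing in for kde_transfo_quantile (an assumption:
# the real helper smooths the quantiles with a kernel density estimate).
# Scores are mapped to quantiles in (0, 1), then through the normal PPF, so
# the transformed metric behaves like a standard normal sample.
from scipy.stats import rankdata, norm

def _rank_to_normal_sketch(scores):
    quantiles = rankdata(scores) / (len(scores) + 1)  # ranks mapped into (0, 1)
    return norm.ppf(quantiles)  # uniform-ish quantiles -> N(0, 1)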
def create_graphical_representation(steps):
    """Create a graphical representation of the model from an OrderedDict of steps."""
    # Remark: a priori we should put the step numbers in the graph, plus the correct
    # labels, so that we can have several nodes with the same name (e.g. Scaler...)

    ### 1) Split composition steps vs the rest
    all_composition_steps = []
    all_others = []
    for (step_name, model_name), var_type in steps.items():
        if StepCategories.is_composition_step(step_name):
            all_composition_steps.append((step_name, model_name, var_type))
        else:
            all_others.append((step_name, model_name, var_type))

    ### 2) Create the graph for the non-composition steps
    new_steps = OrderedDict()

    G = nx.DiGraph()
    for step_name, model_name, var_type in all_others:

        unested_var_type = unnest_tuple(var_type)

        # Terminal nodes: I'll add the new step after one (or more) of those
        terminal_nodes = gh.get_terminal_nodes(G)

        ending_node_type = {unnest_tuple(steps[node]): node for node in terminal_nodes}

        node_name = (step_name, model_name)  # 2-uple
        if node_name in G.nodes:
            raise ValueError("This node already exists '(%s,%s)'" % node_name)

        # 1) Either I attach the new node to ONE terminal node
        # 2) Or I create a new branch (a new node attached to nothing)
        # 3) Or I attach it to SEVERAL terminal nodes
        elif unested_var_type in ending_node_type:
            ### 1) I already have a branch of this type
            last_node = ending_node_type[unested_var_type]
            G = gh.add_node_after(G, node_name, last_node)

        ### I don't have a branch ###
        else:
            all_candidates = [(t, n) for t, n in ending_node_type.items() if tuple_include(t, unested_var_type)]
            # I need to look where I want to plug it
            if len(all_candidates) == 0:
                ### 2) I must create a new branch: no node to attach to ###
                G = gh.add_node_after(G, node_name)
            else:
                ### 3) Attach to several nodes ###
                # Here we sometimes need to add a node UPSTREAM, if some types haven't been added yet
                types_added = unnest_tuple([t for t, n in all_candidates])
                types_not_added = diff(unested_var_type, types_added)

                if len(types_not_added) > 0:
                    name_of_cat = "Selector_%s" % unnest_tuple(types_not_added)
                    new_node = (name_of_cat, (name_of_cat, SpecialModels.ColumnsSelector))

                    G = gh.add_node_after(G, new_node)

                    # I also must dynamically add the node to the list of steps
                    new_steps[new_node] = types_not_added

                    all_candidates = all_candidates + [(types_not_added, new_node)]

                G = gh.add_node_after(G, node_name, *[n for t, n in all_candidates])

    ### 3) Include composition nodes on top
    for step_name, model_name, _ in reversed(all_composition_steps):
        starting_nodes = gh.get_starting_nodes(G)
        for n in starting_nodes:
            G.add_edge((step_name, model_name), n)

    ### 4) Verify the graph structure
    for (step_name, model_name), _ in steps.items():
        if (step_name, model_name) not in G:
            raise ValueError("'(%s , %s)' should be in the graph" % (step_name, model_name))

    # all graph nodes should be in the steps
    for node in G.nodes():
        if node not in steps and node not in new_steps:
            raise ValueError("'(%s,%s)' shouldn't be in the graph" % node)

    assert_model_graph_structure(G)

    return G, new_steps
def fit(self, X, y):
    if y is None:
        raise ValueError("I need a value for 'y'")

    self._random_gen = check_random_state(self.random_state)

    Xtype = get_type(X)
    if Xtype != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")
    Xcolumns = list(X.columns)

    if not isinstance(y, pd.Series):
        sy = pd.Series(y)
    else:
        sy = y

    # Columns to encode and to keep
    if self.columns_to_encode is None:
        self._columns_to_encode = self.guess_columns_to_encode(X)
    elif isinstance(self.columns_to_encode, str) and self.columns_to_encode == "--object--":
        self._columns_to_encode = list(X.columns[X.dtypes == "object"])
    else:
        self._columns_to_encode = list(self.columns_to_encode)

    X = get_rid_of_categories(X)

    # Verif:
    if not isinstance(self._columns_to_encode, list):
        raise TypeError("_columns_to_encode should be a list")

    for c in self._columns_to_encode:
        if c not in Xcolumns:
            raise ValueError("column %s isn't in the DataFrame" % c)

    if self.columns_to_keep is None:
        self._columns_to_keep = diff(Xcolumns, self._columns_to_encode)
    else:
        self._columns_to_keep = list(self.columns_to_keep)

    # Verif:
    if not isinstance(self._columns_to_keep, list):
        raise TypeError("_columns_to_keep should be a list")

    for c in self._columns_to_keep:
        if c not in Xcolumns:
            raise ValueError("column %s isn't in the DataFrame" % c)

    # Target information
    if self.is_regression:
        self.target_classes = None  # No target classes for Regressor
        self.global_std = np.std(sy)
    else:
        # For classification I need to store it
        self.global_std = None
        self.target_classes = list(np.unique(sy))

        if len(self.target_classes) == 2:
            self.target_classes = self.target_classes[1:]

    # Columns on which we want None to be a special modality
    self._na_to_null = dict()
    for col in self._columns_to_encode:
        ii_null = X[col].isnull()
        self._na_to_null[col] = ii_null.sum() >= self.max_na_percentage * len(X)

    self._target_aggregat, self._target_aggregat_global = self._fit_aggregat(X, sy, noise_level=None)

    # Features names
    self._feature_names = [c for c in self._columns_to_keep]  # copy
    for col in self._columns_to_encode:
        self._feature_names += self._get_output_column_name(col=col, target_classes=self.target_classes)
        # self._feature_names += ["%s__target_%s" % (col, str(t)) for t in self.target_classes]

    return self
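# Illustration of the commented naming scheme above (hypothetical inputs):
# for col == "city" and target_classes == ["A", "B"], it would yield
# ["city__target_A", "city__target_B"]; _get_output_column_name presumably
# produces names along those lines.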
def test_graphpipeline_blockselector():
    Xnum, y = make_classification(n_samples=100)

    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    graphpipeline.fit(X, y)
    yhat = graphpipeline.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    ### X = dict ###
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = list ###
    X = [dfX_text, Xnum]

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector(0), "BS_num": BlockSelector(1), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = DataManager ###
    X = BlockManager({"text": dfX_text, "num": Xnum})

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()
def fit(self, X, y=None):
    self._expected_type = dsh.get_type(X)
    self._expected_nbcols = dsh._nbcols(X)

    ### Columns to use ###
    if self.columns_to_use is None:
        list_columns_to_use = None  # [i for i in range(self._expected_nbcols)]
    else:
        list_columns_to_use = self.convert_to_list(cols_list=self.columns_to_use)

    ### Columns to drop ###
    if self.columns_to_drop is None:
        list_columns_to_drop = None
    else:
        list_columns_to_drop = self.convert_to_list(cols_list=self.columns_to_drop)

    if list_columns_to_use is not None and len(list_columns_to_use) == 0:
        raise ValueError("columns_to_use is empty")

    ### What is the type of columns_to_use and columns_to_drop ? ###
    if list_columns_to_use is not None:
        is_int = "int" in str(type(list_columns_to_use[0]))
    else:
        is_int = None

    if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
        is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
    else:
        is_int_to_drop = is_int

    ### Verify type ###
    if is_int is not None and is_int_to_drop is not None:
        if is_int != is_int_to_drop:
            raise ValueError(
                "Please be consistent between columns_to_use and columns_to_drop, "
                "both can be integer or str, but they should have the same type"
            )

    if is_int is None and is_int_to_drop is None:
        is_int = True
        is_int_to_drop = True

    if is_int is None and is_int_to_drop is not None:
        is_int = is_int_to_drop

    if is_int_to_drop is None and is_int is not None:
        is_int_to_drop = is_int

    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):

        if is_int:
            ##############################################
            ### Case 1 : DataFrame + Integer selection ###
            ##############################################
            if self.regex_match:
                #######################
                ## Case 1a : + Regex ##
                #######################
                raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

            cols_set = set(range(self._expected_nbcols))

            if list_columns_to_use is not None:
                # Check all columns are available
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)
                final_columns_to_use = list_columns_to_use
                # final_columns_to_use = intersect( list_columns_to_use , list(range(self._expected_nbcols)) )
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)
                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

        else:
            #############################################
            ### Case 2 : DataFrame + String selection ###
            #############################################
            if self.regex_match:
                #######################
                ## Case 2a : + Regex ##
                #######################
                if list_columns_to_use is not None:
                    cols_that_match = []
                    for col in list(X.columns):
                        for r in list_columns_to_use:
                            if re.search(r, col) is not None:
                                cols_that_match.append(col)
                                break

                if list_columns_to_drop is not None:
                    cols_that_match_drop = []
                    for col in list(X.columns):
                        for r in list_columns_to_drop:
                            if re.search(r, col) is not None:
                                cols_that_match_drop.append(col)
                                break

                if list_columns_to_use is not None:
                    final_columns_to_use = cols_that_match
                    # final_columns_to_use = intersect(cols_that_match, list(X.columns))  # technically the intersect is useless
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

            else:
                ########################
                ## Case 2b : no Regex ##
                ########################
                cols_set = set(X.columns)

                if list_columns_to_use is not None:
                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)
                    final_columns_to_use = list_columns_to_use
                    # intersect(list_columns_to_use, list(X.columns))
                else:
                    final_columns_to_use = list(X.columns)

                if list_columns_to_drop is not None:
                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %s isn't in the columns of the DataFrame" % l)
                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

    else:
        if is_int:
            ##########################################
            ### Case 3 : Array + Integer selection ###
            ##########################################
            if self.regex_match:
                #######################
                ## Case 3a : + Regex ##
                #######################
                raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

            ########################
            ## Case 3b : no Regex ##
            ########################
            cols_set = set(range(self._expected_nbcols))

            if list_columns_to_use is not None:
                for l in list_columns_to_use:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)
                final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
            else:
                final_columns_to_use = list(range(self._expected_nbcols))

            if list_columns_to_drop is not None:
                for l in list_columns_to_drop:
                    if l not in cols_set:
                        raise ValueError("Column %d isn't in the columns of the DataFrame" % l)
                final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

        else:
            #########################################
            ### Case 4 : Array + String selection ###
            #########################################
            raise ValueError("columns_to_use must be integers when type is array or sparseArray")

    self._columns_to_use_is_integer = is_int
    self._final_columns_to_use = final_columns_to_use

    if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        self._Xcolumns = list(X.columns)
    else:
        self._Xcolumns = list(range(self._expected_nbcols))

    ## TODO : here make a simplification into a slice when it is possible
    self._already_fitted = True

    return self
def fit(self, X, y=None):
    Xtype = get_type(X)
    if Xtype != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")
    Xcolumns = list(X.columns)

    # Columns to encode and to keep
    if self.columns_to_encode is None:
        self._columns_to_encode = self.guess_columns_to_encode(X)
    elif isinstance(self.columns_to_encode, str) and self.columns_to_encode == "--object--":
        self._columns_to_encode = list(X.columns[X.dtypes == "object"])
    else:
        self._columns_to_encode = list(self.columns_to_encode)

    # Verif:
    if not isinstance(self._columns_to_encode, list):
        raise TypeError("_columns_to_encode should be a list")

    for c in self._columns_to_encode:
        if c not in Xcolumns:
            raise ValueError("column %s isn't in the DataFrame" % c)

    if self.columns_to_keep is None:
        self._columns_to_keep = diff(Xcolumns, self._columns_to_encode)
    else:
        self._columns_to_keep = list(self.columns_to_keep)

    # Verif:
    if not isinstance(self._columns_to_keep, list):
        raise TypeError("_columns_to_keep should be a list")

    for c in self._columns_to_keep:
        if c not in Xcolumns:
            raise ValueError("column %s isn't in the DataFrame" % c)

    self.variable_modality_mapping = {col: self.modalities_filter(X[col]) for col in self._columns_to_encode}

    # Remark: if we don't want an encoding where the modalities are numbered in
    # increasing order, we can randomize the numbers here

    if self.encoding_type == "num":
        self._feature_names = self._columns_to_keep + self._columns_to_encode

        self.columns_mapping = {c: [c] for c in self._feature_names}

    elif self.encoding_type == "dummy":

        self.columns_mapping = {c: [c] for c in self._columns_to_keep}

        index_column = {}
        self._variable_shift = {}
        cum_max = 0
        for col in self._columns_to_encode:

            self.columns_mapping[col] = []

            for i, (mod, ind) in enumerate(self.variable_modality_mapping[col].items()):
                index_column[ind + cum_max] = col + "__" + str(mod)

                self.columns_mapping[col].append(col + "__" + str(mod))

            self._variable_shift[col] = cum_max
            cum_max += i + 1  # i is the last enumerate index, so this shifts by the number of modalities

        self._dummy_size = cum_max
        self._dummy_feature_names = [index_column[i] for i in range(cum_max)]
        self._feature_names = self._columns_to_keep + self._dummy_feature_names

    else:
        raise NotImplementedError("I don't know that type of encoding %s" % self.encoding_type)

    return self
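# Worked example of the dummy-encoding bookkeeping above (hypothetical data):
# with variable_modality_mapping = {"color": {"red": 0, "blue": 1},
#                                   "size": {"S": 0, "M": 1, "L": 2}},
# "color" gets shift 0 and "size" gets shift 2, so the dummy columns come out as
#   color__red (0), color__blue (1), size__S (2), size__M (3), size__L (4)
# and _dummy_size == 5.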