Exemple #1
0
    def fit(self, X, y=None):
        """Learn the modalities of every column of X and the output column layout.

        Every column of X is encoded. For each of them ``self.modalities_filter``
        is called and its result (a mapping ``modality -> index``) is stored in
        ``self.variable_modality_mapping``.

        Depending on ``self.encoding_type``:

        * ``"num"``: one output column per input column, with the same name.
        * ``"dummy"``: one output column per (column, modality) pair, named
          ``"<column>__<modality>"``. ``self._variable_shift[col]`` holds the
          position of the first dummy column of ``col`` and
          ``self._dummy_size`` the total number of dummy columns.

        Parameters
        ----------
        X : DataFrame
            Data to learn the modalities from.
        y : ignored
            Not used by this method.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If X is not recognised as a DataFrame by ``get_type``.
        ValueError
            If a column to encode is not a column of X.
        NotImplementedError
            If ``self.encoding_type`` is neither ``"num"`` nor ``"dummy"``.
        """
        Xtype = get_type(X)
        if Xtype != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")

        Xcolumns = list(X.columns)

        self._columns_to_encode = Xcolumns  # Force to encode everything now

        X = get_rid_of_categories(X)

        # Sanity checks on the columns to encode
        if not isinstance(self._columns_to_encode, list):
            raise TypeError("_columns_to_encode should be a list")

        for c in self._columns_to_encode:
            if c not in Xcolumns:
                raise ValueError("column %s isn't in the DataFrame" % c)

        self.variable_modality_mapping = {col: self.modalities_filter(X[col]) for col in self._columns_to_encode}

        # Note: if we do not want an encoding in which the variables are in
        # increasing order, the numbers could be randomized here.

        if self.encoding_type == "num":
            self._feature_names = self._columns_to_encode

            self.columns_mapping = {c: [c] for c in self._feature_names}

        elif self.encoding_type == "dummy":

            self.columns_mapping = {}
            self._variable_shift = {}

            index_column = {}  # global dummy position -> dummy column name
            cum_max = 0  # number of dummy columns allocated so far
            for col in self._columns_to_encode:
                modalities = self.variable_modality_mapping[col]

                self.columns_mapping[col] = []
                for mod, ind in modalities.items():
                    dummy_name = col + "__" + str(mod)

                    # The lookup over range(cum_max) below relies on the
                    # indices of a column being exactly 0 .. len(modalities) - 1.
                    index_column[ind + cum_max] = dummy_name
                    self.columns_mapping[col].append(dummy_name)

                self._variable_shift[col] = cum_max

                # Use the size of the mapping rather than the last loop index:
                # when a column has no modality the loop body never runs, and a
                # loop index would be undefined (first column) or stale (later
                # columns), corrupting every following offset.
                cum_max += len(modalities)

            self._dummy_size = cum_max
            self._dummy_feature_names = [index_column[i] for i in range(cum_max)]
            self._feature_names = self._dummy_feature_names

        else:
            raise NotImplementedError("I don't know that type of encoding %s" % self.encoding_type)

        return self
Exemple #2
0
    def transform(self, X):
        """Encode X with the aggregates stored on the instance.

        X is passed through ``get_rid_of_categories`` and then handed to
        ``self._transform_aggregat`` together with ``self._target_aggregat``
        and ``self._target_aggregat_global``.

        Raises
        ------
        TypeError
            If X is not recognised as a DataFrame by ``get_type``.
        """
        if get_type(X) != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")

        cleaned = get_rid_of_categories(X)
        encoded = self._transform_aggregat(
            cleaned,
            self._target_aggregat,
            self._target_aggregat_global,
        )

        # Internal consistency check: one output column per declared feature name
        assert encoded.shape[1] == len(self.get_feature_names())

        return encoded
Exemple #3
0
    def transform(self, X):
        """Encode X by delegating to ``self._transform_to_encode``.

        X is first passed through ``get_rid_of_categories``.

        Raises
        ------
        TypeError
            If X is not recognised as a DataFrame by ``get_type``.
        """
        if get_type(X) != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")

        cleaned = get_rid_of_categories(X)
        return self._transform_to_encode(cleaned)
Exemple #4
0
def test_get_rid_of_categories():
    """Check get_rid_of_categories on frames with and without category columns."""
    original = get_sample_df()

    # Without any category column the very same object is returned: nothing happened
    assert get_rid_of_categories(original) is original

    # A text column cast to category must come back as a plain object column
    text_as_cat = original.copy()
    text_as_cat["text_col"] = text_as_cat["text_col"].astype("category")
    assert (text_as_cat.dtypes == "category").any()  # there is a category column

    converted = get_rid_of_categories(text_as_cat)
    assert not (converted.dtypes == "category").any()  # no more category
    assert converted["text_col"].dtype == "object"
    assert (converted["text_col"] == text_as_cat["text_col"]).all()

    # An integer column cast to category must recover the original dtypes
    int_as_cat = original.copy()
    int_as_cat["int_col"] = int_as_cat["int_col"].astype("category")

    converted = get_rid_of_categories(int_as_cat)
    assert not (converted.dtypes == "category").any()  # no more category
    assert (converted.dtypes == original.dtypes).all()
Exemple #5
0
    def fit_transform(self, X, y):
        """Fit the encoder on (X, y) and return the encoding of X.

        The instance is always fitted on the full data through ``self.fit``.
        The returned encoding depends on ``self.cv``:

        * ``None``: aggregates are fitted on the full data with
          ``noise_level=self.noise_level`` and applied to that same data.
        * otherwise: a splitter is built with ``create_cv`` and, for each
          (train, test) split, aggregates fitted on the train rows are applied
          to the test rows. The pieces are concatenated and reordered to
          follow ``X.index``.

        Parameters
        ----------
        X : DataFrame
            Data to encode.
        y : Series or array-like
            Target; converted to a ``pd.Series`` when it is not one already.

        Returns
        -------
        The encoded data, as produced by ``self._transform_aggregat``.

        Raises
        ------
        ValueError
            If y is None.
        """
        if y is None:
            raise ValueError("I need a value for 'y'")

        if not isinstance(y, pd.Series):
            sy = pd.Series(y)
        else:
            sy = y

        self.fit(X, sy)

        X = get_rid_of_categories(X)

        if self.cv is None:  # No Cross Validation ...
            # Pass the Series-normalised target, exactly as ``fit`` and the
            # cross-validated branch below do; the raw ``y`` may be a list or
            # an array.
            target_aggregat, target_aggregat_global = self._fit_aggregat(
                X, sy, noise_level=self.noise_level)
            all_results = self._transform_aggregat(X, target_aggregat,
                                                   target_aggregat_global)

        else:
            cv = create_cv(self.cv,
                           y=sy,
                           classifier=not self.is_regression,
                           random_state=123)

            all_results = []
            for train, test in cv.split(X, y):
                # Aggregates learnt on the train rows only ...
                target_aggregat, target_aggregat_global = self._fit_aggregat(
                    X.iloc[train, :],
                    sy.iloc[train],
                    noise_level=self.noise_level)

                # ... and applied to the held-out rows
                sub_result = self._transform_aggregat(X.iloc[test, :],
                                                      target_aggregat,
                                                      target_aggregat_global)

                all_results.append(sub_result)

            all_results = pd.concat(all_results, axis=0)

            # Put the rows back in the order of X.
            # NOTE(review): this is a label-based lookup; presumably it needs
            # the labels of X.index to be unique - confirm.
            all_results = all_results.loc[X.index, :]

            # Internal consistency checks on the reassembled result
            assert len(all_results) == len(X)
            assert (all_results.index == X.index).all()
            assert all_results.shape[1] == len(self.get_feature_names())

        return all_results
Exemple #6
0
    def fit(self, X, y):
        """Fit the target aggregates on every column of X.

        Stores on the instance: the random generator, the columns to encode
        (all columns of X) and to keep (none), the target information
        (``global_std`` for a regression, ``target_classes`` for a
        classification), the per-column ``_na_to_null`` flags, the aggregates
        returned by ``self._fit_aggregat`` and the output feature names.

        Parameters
        ----------
        X : DataFrame
            Data whose columns are encoded.
        y : Series or array-like
            Target; converted to a ``pd.Series`` when it is not one already.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If y is None, or if a listed column is not a column of X.
        TypeError
            If X is not recognised as a DataFrame by ``get_type``.
        """
        if y is None:
            raise ValueError("I need a value for 'y'")

        self._random_gen = check_random_state(self.random_state)

        if get_type(X) != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")
        original_columns = list(X.columns)

        sy = y if isinstance(y, pd.Series) else pd.Series(y)

        # --- Columns to encode: all of them ---
        self._columns_to_encode = list(X.columns)

        X = get_rid_of_categories(X)

        if not isinstance(self._columns_to_encode, list):
            raise TypeError("_columns_to_encode should be a list")
        for name in self._columns_to_encode:
            if name not in original_columns:
                raise ValueError("column %s isn't in the DataFrame" % name)

        # --- Columns to keep untouched: none ---
        self._columns_to_keep = []

        if not isinstance(self._columns_to_keep, list):
            raise TypeError("_columns_to_keep should be a list")
        for name in self._columns_to_keep:
            if name not in original_columns:
                raise ValueError("column %s isn't in the DataFrame" % name)

        # --- Target information ---
        if self.is_regression:
            # A regressor has no target classes, only a global spread
            self.target_classes = None
            self.global_std = np.std(sy)
        else:
            # A classifier has to remember its classes
            self.global_std = None
            classes = list(np.unique(sy))
            # With exactly two classes, only the second one is kept
            self.target_classes = classes[1:] if len(classes) == 2 else classes

        # --- Columns for which a missing value is a modality of its own ---
        # Flag is True when the number of nulls reaches max_na_percentage * len(X)
        self._na_to_null = {
            col: X[col].isnull().sum() >= self.max_na_percentage * len(X)
            for col in self._columns_to_encode
        }

        fitted = self._fit_aggregat(X, sy, noise_level=None)
        self._target_aggregat, self._target_aggregat_global = fitted

        # --- Output feature names: kept columns first, then the encoded ones ---
        self._feature_names = list(self._columns_to_keep)
        for col in self._columns_to_encode:
            self._feature_names.extend(
                self._get_output_column_name(col=col, target_classes=self.target_classes)
            )

        return self