def label_encoder(self):
    """Integer-encode, in place, every column of ``output_df`` named in
    ``lbl_enc_feats``.

    A fresh ``LabelEncoder`` is fitted per feature, so the encodings of
    different columns are independent of one another.
    """
    for feature in self.lbl_enc_feats:
        encoder = LabelEncoder()
        # fit_transform is equivalent to fit() followed by transform()
        self.output_df[feature] = encoder.fit_transform(self.output_df[feature])
def fit(self, X, y=None):
    """
    Fit OneHotEncoder to X.

    Parameters
    ----------
    X : cuDF.DataFrame or cupy.ndarray, shape = (n_samples, n_features)
        The data to determine the categories of each feature.
    y : None
        Ignored. This parameter exists for compatibility only.

    Returns
    -------
    self
    """
    self._validate_keywords()
    X = self._check_input_fit(X)
    if type(self.categories) is str and self.categories == 'auto':
        # 'auto' mode: derive the category set of each column from X itself,
        # fitting one LabelEncoder per feature on that column's unique values.
        self._features = X.columns
        self._encoders = {
            feature: LabelEncoder(handle=self.handle,
                                  verbose=self.verbose,
                                  output_type=self.output_type,
                                  handle_unknown=self.handle_unknown).fit(
                self._unique(X[feature]))
            for feature in self._features
        }
    else:
        # Caller supplied explicit categories: validate them and require
        # exactly one category column per feature of X.
        self.categories = self._check_input_fit(self.categories, True)
        self._features = self.categories.columns
        if len(self._features) != X.shape[1]:
            raise ValueError("Shape mismatch: if categories is not 'auto',"
                             " it has to be of shape (n_features, _).")
        self._encoders = dict()
        for feature in self._features:
            le = LabelEncoder(handle=self.handle,
                              verbose=self.verbose,
                              output_type=self.output_type,
                              handle_unknown=self.handle_unknown)
            # Fit on the user-provided categories, not on X.
            self._encoders[feature] = le.fit(self.categories[feature])
            if self.handle_unknown == 'error':
                # Fail fast when X holds values outside the supplied
                # categories and unknowns are not tolerated.
                if self._has_unknown(X[feature],
                                     self._encoders[feature].classes_):
                    msg = ("Found unknown categories in column {0}"
                           " during fit".format(feature))
                    raise KeyError(msg)
    self.drop_idx_ = self._compute_drop_idx()
    self._fitted = True
    return self
def create_features(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    """Build label-encoded categorical-combination features.

    Concatenates train and test, generates r-way (r = 2, 3, 4) string
    combinations of the module-level ``cat_cols`` via xfeat, label-encodes
    each new column, then splits the result back into ``self.train`` and
    ``self.test`` (new columns only).
    """
    with timer("load data"):
        train = train_df.copy()
        len_train = len(train)
        org_cols = train.columns.tolist()
        test = test_df.copy()

    with timer("concat train and test"):
        # Keep the original row order recoverable via the 'index' column.
        total = cudf.concat([train, test], ignore_index=True).reset_index()
        del train, test
        gc.collect()

    with timer("combi cats"):
        cat_source = total[cat_cols].astype(str).fillna("none")
        combo_frames = []
        for r in [2, 3, 4]:
            combiner = xfeat.ConcatCombination(drop_origin=True, r=r)
            combo_frames.append(combiner.fit_transform(cat_source))
        new_cat_df = cudf.concat(combo_frames, axis="columns")
        for col in new_cat_df.columns:
            le = LabelEncoder()
            encoded = le.fit_transform(new_cat_df[col])
            new_cat_df[col] = encoded.astype("category")
        total = cudf.concat([total, new_cat_df], axis="columns")

    with timer("end"):
        # Restore the original ordering before splitting back out.
        total = total.sort_values("index")
        excluded = org_cols + ["index"]
        new_cols = [col for col in total.columns if col not in excluded]
        self.train = total[new_cols].iloc[:len_train].reset_index(drop=True)
        self.test = total[new_cols].iloc[len_train:].reset_index(drop=True)
def _train_test_split_with_object(X, y, shuffle=True, random_state=None,
                                  stratify=None, **kwargs):
    """Split X/y while working around cuml.train_test_split's rejection of
    object-dtype labels: y is label-encoded before the split and decoded
    afterwards, so callers receive labels in their original dtype.
    """
    label_enc = LabelEncoder()
    y_encoded = label_enc.fit_transform(y)
    # If the caller stratifies on y itself, reuse the encoded labels;
    # otherwise encode any object-dtype stratify column independently.
    if stratify is y:
        stratify = y_encoded
    elif stratify is not None and str(stratify.dtype) == 'object':
        stratify = LabelEncoder().fit_transform(stratify)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, shuffle=shuffle, random_state=random_state,
        stratify=stratify, **kwargs)
    # Decode back to the original labels, preserving the split's index.
    y_train_dec = label_enc.inverse_transform(y_train)
    y_test_dec = label_enc.inverse_transform(y_test)
    y_train_dec.index = y_train.index
    y_test_dec.index = y_test.index
    return X_train, X_test, y_train_dec, y_test_dec
def create_features(
    self,
    train_df: cudf.DataFrame,
    test_df: cudf.DataFrame,
):
    """Clean and encode the raw video-game-sales columns.

    Concatenates train and test, maps Rating to an ordinal int, label-encodes
    the high-cardinality categoricals, coerces User_Score/Year_of_Release,
    adds log_User_Count, then splits back into ``self.train`` (features +
    sales targets) and ``self.test`` (features only).
    """
    with timer("load data"):
        train = train_df.copy()
        len_train = len(train)
        test = test_df.copy()

    with timer("concat train and test"):
        total = cudf.concat([train, test], ignore_index=True)

    with timer("label encoding"):
        with timer("rating"):
            # Ordinal mapping; K-A/E and M/AO are treated as equivalent tiers.
            rating_dict = {
                "RP": 0,
                "EC": 1,
                "K-A": 2,
                "E": 2,
                "E10+": 3,
                "T": 4,
                "M": 5,
                "AO": 5,
            }
            total["Rating"] = (
                total["Rating"].replace(rating_dict).astype(int))

        with timer("other cat cols"):
            cat_cols = [
                "Name",
                "Platform",
                "Genre",
                "Publisher",
                "Developer",
            ]
            for col in cat_cols:
                encoder = LabelEncoder(handle_unknown="ignore")
                encoder.fit(total[col])
                encoded = encoder.transform(total[col])
                total[col] = encoded.astype("category")

    with timer("User_Score"):
        # "tbd" is a placeholder, not a score — treat it as missing.
        total["User_Score"] = (
            total["User_Score"]
            .replace(to_replace="tbd", value=np.nan)
            .astype(float))

    with timer("Year_of_Release"):
        total["Year_of_Release"] = total["Year_of_Release"].replace(
            to_replace=2020.0, value=2017.0)

    with timer("log_User_Count"):
        total["log_User_Count"] = np.log1p(total["User_Count"].to_pandas())

    with timer("end"):
        basic_cols = [
            "Name",
            "Platform",
            "Year_of_Release",
            "Genre",
            "Publisher",
            "Critic_Score",
            "Critic_Count",
            "User_Score",
            "User_Count",
            "log_User_Count",
            "Developer",
            "Rating",
        ]
        target_cols = [
            "NA_Sales",
            "EU_Sales",
            "JP_Sales",
            "Other_Sales",
            "Global_Sales",
        ]
        feature_and_targets = basic_cols + target_cols
        self.train = (
            total[feature_and_targets].iloc[:len_train].reset_index(drop=True))
        self.test = (
            total[basic_cols].iloc[len_train:].reset_index(drop=True))