def TargetEncode(data, target):
    """Target-encode every categorical (object-dtype) column of ``data`` in place.

    Fits a single ``TargetEncoder`` on all object columns against ``data[target]``,
    persists the fitted encoder to ``targetencodemodel.sav`` (pickle), then replaces
    the categorical columns with their encoded values.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing both features and the target column. Mutated in place.
    target : str
        Name of the target column used to fit the encoder.

    Returns
    -------
    pd.DataFrame
        The same frame with categorical columns target-encoded.
    """
    # Select all categorical (object-dtype) columns.
    cols = list(data.select_dtypes(include=['object']).columns)
    print('Data to be encoded: ')
    print(len(cols))
    print('\n'.join(cols))
    print('\n')
    print('\n')

    # Fit one encoder covering every categorical column at once.
    model = TargetEncoder().fit(X=data[cols], y=data[target])

    # Persist the fitted encoder so the identical mapping can be reused later.
    filename = "targetencodemodel.sav"
    # Context manager guarantees the handle is closed even if pickling fails
    # (the original opened/closed the file manually).
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    print("Model saved in ", filename)
    print("\n")
    print("\n")

    # Replace the raw categories with their encoded values.
    data[cols] = model.transform(X=data[cols])
    return data
def target_encoding(train, target, test=None, feat_to_encode=None, smooth=0.2, random_state=9527):
    """Out-of-fold target encoding for the train set, full-fit encoding for test.

    Parameters
    ----------
    train : pd.DataFrame
        Training frame containing the target column; sorted by index in place,
        and the target column is popped out of it.
    target : str
        Name of the target column inside ``train``.
    test : pd.DataFrame, optional
        If given, encoded with an encoder fitted on the whole train set.
    feat_to_encode : list, optional
        Columns to encode; defaults to all remaining train columns.
    smooth : float
        Smoothing strength passed to ``TargetEncoder``.
    random_state : int
        Seed for the 5-fold stratified split.

    Returns
    -------
    tuple
        (encoded train, encoded test or None, feature names, target series)
    """
    print('Target encoding...')
    train.sort_index(inplace=True)
    target = train.pop(target)
    if feat_to_encode is None:
        feat_to_encode = train.columns.tolist()
    smoothing = smooth
    # Encode each fold with an encoder fitted only on the other folds to avoid
    # leaking a row's own target into its encoding.
    oof_parts = []
    for tr_idx, oof_idx in StratifiedKFold(n_splits=5, random_state=random_state, shuffle=True).split(train, target):
        ce_target_encoder = TargetEncoder(cols=feat_to_encode, smoothing=smoothing)
        ce_target_encoder.fit(train.iloc[tr_idx, :], target.iloc[tr_idx])
        oof_parts.append(ce_target_encoder.transform(train.iloc[oof_idx, :]))
    # BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat over the collected fold frames is the equivalent (and faster) form.
    oof = pd.concat(oof_parts, ignore_index=False)
    # Refit on the full train data; this encoder is used for the test set only.
    ce_target_encoder = TargetEncoder(cols=feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(train, target)
    train = oof.sort_index()
    if test is not None:
        test = ce_target_encoder.transform(test)
    features = list(train)
    print('Target encoding done!')
    return train, test, features, target
def data_prepare(self):
    """Load train/test features, target-encode categoricals, and flag zero-variance columns.

    Side effects: populates __train, __test, __train_label, __train_feature,
    __test_feature, __categorical_columns, __encoder; appends constant-feature
    names to __remove_feature.
    """
    self.__train = pd.read_csv(
        os.path.join(self.__input_path, "train_feature_df.csv"))
    self.__test = pd.read_csv(
        os.path.join(self.__input_path, "test_feature_df.csv"))
    self.__train_label = self.__train["TARGET"]
    self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"], axis=1)
    # Align the test columns to the train feature set.
    self.__test_feature = self.__test[
        self.__train_feature.columns.tolist()]
    # This approach cannot raise an error; an alternative would be to drop
    # features whose pairwise similarity equals 1.
    # drop duplicate column
    # self.__train_feature = self.__train_feature.T.drop_duplicates().T
    # self.__test_feature = self.__test_feature[self.__train_feature.columns.tolist()]
    # encoder
    self.__categorical_columns = (self.__train_feature.select_dtypes(
        include="object").columns.tolist())
    # Fill categorical NaNs with a sentinel so the encoder sees a real level.
    self.__train_feature[self.__categorical_columns] = (
        self.__train_feature[self.__categorical_columns].fillna("missing"))
    self.__encoder = TargetEncoder()
    self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                       self.__train_label)
    self.__train_feature[self.__categorical_columns] = (
        self.__encoder.transform(
            self.__train_feature[self.__categorical_columns]))
    # Record constant (zero-std) columns for later removal.
    for col in self.__train_feature.columns.tolist():
        if self.__train_feature[col].std() == 0.:
            print(col)
            self.__remove_feature.append(col)
def data_prepare(self):
    """Load selected features plus first-layer stacking outputs and build the train matrix.

    Side effects: populates __train, the three __train_feature_stacking_* frames,
    __train_feature_gp, __train_label, __train_feature, __encoder,
    __categorical_columns.
    """
    self.__train = pd.read_csv(
        os.path.join(self.__input_path, "train_select_feature_df.csv"))
    self.__train_feature_stacking_tree = pd.read_csv(
        os.path.join(self.__input_path, "first_layer_tree_train.csv"))
    self.__train_feature_stacking_linear = pd.read_csv(
        os.path.join(self.__input_path, "first_layer_linear_train.csv"))
    self.__train_feature_stacking_network = pd.read_csv(
        os.path.join(self.__input_path, "first_layer_network_train.csv"))
    # NOTE(review): loaded but not included in the final concat below — confirm
    # whether the genetic features are consumed elsewhere or this is an omission.
    self.__train_feature_gp = pd.read_csv(
        os.path.join(self.__input_path, "genetic_train_feature.csv"))
    self.__train_label = self.__train["TARGET"]
    # Drop the target and every SK_ID* identifier column.
    self.__train_feature = self.__train.drop(["TARGET"] + [
        col for col in self.__train.columns.tolist()
        if re.search(r"SK_ID", col)
    ], axis=1)
    self.__encoder = TargetEncoder()
    self.__categorical_columns = self.__train_feature.select_dtypes(
        "object").columns.tolist()
    self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                       self.__train_label)
    self.__train_feature[self.__categorical_columns] = (
        self.__encoder.transform(
            self.__train_feature[self.__categorical_columns]))
    # Stack the first-layer model outputs next to the base features.
    self.__train_feature = pd.concat([
        self.__train_feature, self.__train_feature_stacking_tree,
        self.__train_feature_stacking_linear,
        self.__train_feature_stacking_network
    ], axis=1)
def data_prepare(self): self.__sample_submission = pd.read_csv( os.path.join(self.__input_path, "sample_submission.csv")) # selected feature self.__train = pd.read_csv( os.path.join(self.__input_path, "train_select_feature_df.csv")) self.__test = pd.read_csv( os.path.join(self.__input_path, "test_select_feature_df.csv")) self.__train_label = self.__train["TARGET"] self.__train_feature = self.__train.drop(["TARGET"] + [ col for col in self.__train.columns.tolist() if re.search(r"SK_ID", col) ], axis=1) self.__test_feature = self.__test[ self.__train_feature.columns.tolist()] self.__categorical_columns = self.__train_feature.select_dtypes( "object").columns.tolist() self.__encoder = TargetEncoder() self.__encoder.fit( self.__train_feature.loc[:, self.__categorical_columns], self.__train_label) self.__train_feature.loc[:, self.__categorical_columns] = ( self.__encoder.transform( self.__train_feature.loc[:, self.__categorical_columns])) self.__test_feature.loc[:, self.__categorical_columns] = ( self.__encoder.transform( self.__test_feature.loc[:, self.__categorical_columns])) del self.__train, self.__test, self.__categorical_columns, self.__encoder gc.collect()
def prepare_df(df, columns, target):
    '''
    Prepares a pd.DataFrame by turning missing scikit-learn preprocessors
    into "None" strings and performs target encoding at the input columns.

    Parameters:
    -----------
    df: pd.DataFrame
        Contains a pd.DataFrame with the generated meta-data.
    columns: list
        Contains a list with the columns that contain scikit-learn
        estimators and scikit-learn preprocessors.
    target: str
        Contains a string that represents the name of the column that is
        the target of the dataset.

    Returns:
    --------
    pd.DataFrame
        Contains adjusted pd.DataFrame.
    '''
    # Work on a private copy: fresh index, duplicates removed.
    df = deepcopy(df).reset_index(drop=True).drop_duplicates()
    y = df[target]

    # Normalize missing preprocessors in the component columns to "None" strings.
    for component_col in ['component_1', 'component_2', 'component_3']:
        df[component_col] = df[component_col].apply(nan_to_none)

    # For every estimator/preprocessor column emit both categorical codes and
    # a target-encoded version alongside the original.
    for col in columns:
        df[col] = df[col].astype('category')
        df['{}_codes'.format(col)] = df[col].cat.codes
        df['{}_encoded'.format(col)] = TargetEncoder(cols=[col]).fit_transform(df[col], y)

    return df
def data_prepare(self):
    """Concatenate before/after feature frames, align test, and target-encode categoricals.

    Side effects: populates __train, __test, __train_label, __train_feature,
    __test_feature, __categorical_columns.
    """
    self.__train_feature_before = pd.read_csv(
        os.path.join(self.__input_path, "train_feature_before_df.csv"))
    self.__train_feature_after = pd.read_csv(
        os.path.join(self.__input_path, "train_feature_after_df.csv"))
    # NOTE(review): row-wise concat without ignore_index leaves duplicated
    # index labels — confirm downstream code is positional-only.
    self.__train = pd.concat(
        [self.__train_feature_before, self.__train_feature_after])
    self.__test = pd.read_csv(
        os.path.join(self.__input_path, "test_feature_df.csv"))
    self.__train_label = self.__train["TARGET"].copy()
    # Drop the target and every SK_ID* identifier column.
    self.__train_feature = (self.__train.drop(["TARGET"] + [
        col for col in self.__train.columns.tolist()
        if re.search(r"SK_ID", col)
    ], axis=1)).copy()
    self.__test_feature = self.__test[
        self.__train_feature.columns.tolist()].copy()
    self.__categorical_columns = self.__train_feature.select_dtypes(
        include="object").columns.tolist()
    # Fit on train labels; only the train features are transformed here.
    encoder = TargetEncoder()
    encoder.fit(self.__train_feature[self.__categorical_columns],
                self.__train_label)
    self.__train_feature[self.__categorical_columns] = encoder.transform(
        self.__train_feature[self.__categorical_columns])
class DFMeanEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-in / DataFrame-out wrapper around ``TargetEncoder``.

    Encodes the requested columns and re-joins them with the untouched
    remainder of the frame, preserving the DataFrame interface.
    """

    def __init__(self, columns=None, **kwargs):
        # columns: subset to encode; None means "all columns seen at fit time".
        self.columns = columns
        self.model = TargetEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        """Fit the underlying encoder on the selected columns of X against y."""
        if self.columns is None:
            self.columns = X.columns
        # Keep X's column order while restricting to the requested subset.
        self.transform_cols = [c for c in X.columns if c in self.columns]
        self.model.fit(X[self.transform_cols], y)
        return self

    def transform(self, X):
        """Return X with the fitted columns replaced by their encoded values."""
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )
        untouched = X.drop(columns=self.transform_cols)
        encoded = self.model.transform(X[self.transform_cols])
        return pd.concat([untouched, encoded], axis=1)

    def fit_transform(self, X, y):
        """Convenience: fit on (X, y) then transform X."""
        return self.fit(X, y).transform(X)
def __init__(self):
    # Columns whose missing values are imputed with the most frequent level.
    self.cat_cols = [
        'home_ownership',
        'purpose',
        'addr_state',
        'initial_list_status',
    ]
    # Mode imputation for the categorical columns above.
    self.mode_imputer = SimpleImputer(strategy="most_frequent")
    # Unknown/missing categories map to NaN rather than a learned prior.
    self.target_encoder = TargetEncoder(handle_missing='return_nan',
                                        handle_unknown='return_nan')
def __init__(self, sparksess=None, logdir='/encoder', handle_unknown='-99999', save_encoder=False):
    """Set up the four category encoders (ordinal / one-hot / count / target).

    Parameters
    ----------
    sparksess : SparkSession, optional
        Session used for distributed work, if any.
    logdir : str
        Directory where encoders are persisted.
    handle_unknown : str
        Sentinel passed to every encoder for unseen categories.
    save_encoder : bool
        Whether fitted encoders should be saved to ``logdir``.
    """
    self.spark = sparksess
    self.logdir = logdir
    # BUG FIX: the original line was the bare expression `self.save_encoder`,
    # which never stored the parameter (and would raise AttributeError on any
    # later read). Assign it properly.
    self.save_encoder = save_encoder

    # Feature lists start empty; callers populate them before fitting.
    self.ordinal_encoder_features = []
    self.onehot_encoder_features = []
    self.count_encoder_features = []
    self.target_encoder_features = []

    self.ordinal_encoder = OrdinalEncoder(
        cols=self.ordinal_encoder_features,
        return_df=True,
        handle_unknown=handle_unknown)
    self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                        return_df=True,
                                        handle_unknown=handle_unknown)
    self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                      return_df=True,
                                      handle_unknown=handle_unknown)
    self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                        return_df=True,
                                        handle_unknown=handle_unknown)
def data_prepare(self):
    """Full tabular prep: load, drop sparse columns, impute, target-encode, min-max scale.

    Side effects: populates __train/__test frames, label, feature frames,
    __categorical_index, __numeric_index, __filler, __encoder, __scaler.
    """
    self.__train = pd.read_csv(
        os.path.join(self.__input_path, "train_select_feature_df.csv"))
    self.__test = pd.read_csv(
        os.path.join(self.__input_path, "test_select_feature_df.csv"))
    self.__train_label = self.__train["TARGET"]
    # Drop the target and every SK_ID* identifier column.
    self.__train_feature = self.__train.drop(["TARGET"] + [
        col for col in self.__train.columns.tolist()
        if re.search(r"SK_ID", col)
    ], axis=1)
    self.__test_feature = self.__test[
        self.__train_feature.columns.tolist()]
    # drop column na: keep only columns whose NaN ratio is below 20%.
    self.__train_feature = self.__train_feature[list(
        (self.__train_feature.isna().sum() /
         self.__train_feature.isna().count()
         )[(self.__train_feature.isna().sum() /
            self.__train_feature.isna().count()) < 0.2].index)]
    self.__test_feature = self.__test_feature[
        self.__train_feature.columns.tolist()]
    # These hold column labels, not positional indices.
    self.__categorical_index = self.__train_feature.select_dtypes(
        include="object").columns.tolist()
    self.__numeric_index = self.__train_feature.select_dtypes(
        exclude="object").columns.tolist()
    # filler: median-impute numeric NaNs (Imputer drops all-NaN columns).
    # NOTE(review): sklearn's Imputer was removed in 0.22 — presumably this
    # project pins an older sklearn; SimpleImputer is the modern equivalent.
    self.__filler = Imputer(strategy="median")
    self.__filler.fit(self.__train_feature[self.__numeric_index])
    self.__train_feature[self.__numeric_index] = self.__filler.transform(
        self.__train_feature[self.__numeric_index])
    self.__test_feature[self.__numeric_index] = self.__filler.transform(
        self.__test_feature[self.__numeric_index])
    # encoder: fit on train, transform train and test with the same mapping.
    self.__encoder = TargetEncoder()
    self.__encoder.fit(self.__train_feature[self.__categorical_index],
                       self.__train_label)
    self.__train_feature[
        self.__categorical_index] = self.__encoder.transform(
            self.__train_feature[self.__categorical_index])
    self.__test_feature[
        self.__categorical_index] = self.__encoder.transform(
            self.__test_feature[self.__categorical_index])
    # scaler: pandas in, numpy out — rewrap as DataFrames to keep column names.
    self.__scaler = MinMaxScaler()
    self.__scaler.fit(self.__train_feature)
    self.__train_feature = pd.DataFrame(
        self.__scaler.transform(self.__train_feature),
        columns=self.__train_feature.columns)
    self.__test_feature = pd.DataFrame(self.__scaler.transform(
        self.__test_feature), columns=self.__test_feature.columns)
def data_prepare(self):
    """Target-encode all features, then project to 2-D with PCA for plotting.

    Side effects: populates __encoder and __pca; replaces __train_feature with
    a two-column frame ("col_1", "col_2").
    """
    self.__encoder = TargetEncoder()
    self.__encoder.fit(self.__train_feature, self.__train_label)
    self.__train_feature = self.__encoder.transform(self.__train_feature)
    # Fixed seed keeps the projection reproducible across runs.
    self.__pca = PCA(n_components=2, random_state=7)
    self.__train_feature = self.__pca.fit_transform(self.__train_feature)
    self.__train_feature = pd.DataFrame(self.__train_feature,
                                        columns=["col_1", "col_2"])
class CategoricalPreprocessing(BaseEstimator, TransformerMixin):
    """Sklearn-style transformer for loan-data categoricals.

    Encodes employment length ordinally, converts the two date columns to
    month counts, mode-imputes four categorical columns, and target-encodes
    zip codes.
    """

    def __init__(self):
        self.mode_imputer = SimpleImputer(strategy="most_frequent")
        # Columns handled by mode imputation.
        self.cat_cols = [
            'home_ownership', 'purpose', 'addr_state', 'initial_list_status'
        ]
        # Unknown/missing zip codes map to NaN rather than a learned prior.
        self.target_encoder = TargetEncoder(handle_missing='return_nan',
                                            handle_unknown='return_nan')

    def fit(self, X, y=None):
        """Fit the imputer on the categorical columns and the encoder on zip_code."""
        self.mode_imputer.fit(X[self.cat_cols])
        self.target_encoder.fit(X["zip_code"], y)
        return self

    def transform(self, X, y=None):
        """Return a transformed copy of X; the input frame is not mutated."""
        Xc = X.copy()
        # encode emp_length: ordinal mapping from the string buckets.
        lookup = {
            '< 1 year': 0,
            '1 year': 1,
            '2 years': 2,
            '3 years': 3,
            '4 years': 4,
            '5 years': 5,
            '6 years': 6,
            '7 years': 7,
            '8 years': 8,
            '9 years': 9,
            '10+ years': 10
        }
        Xc["emp_length"] = Xc["emp_length"].replace(lookup)
        # issue date -> months since 2000-01-01.
        Xc["issue_d"] = pd.to_datetime(Xc["issue_d"])
        tmp = Xc[
            "issue_d"].values  # keep a copy of the raw date for when we transform earliest credit line
        Xc["issue_d"] = (
            Xc["issue_d"] -
            datetime.datetime(2000, 1, 1)).astype('timedelta64[M]')
        # earliest credit line -> months between it and the issue date.
        Xc["earliest_cr_line"] = pd.to_datetime(Xc["earliest_cr_line"])
        Xc["earliest_cr_line"] = (
            tmp - Xc["earliest_cr_line"]).astype('timedelta64[M]')
        # imputation for home_ownership, purpose, addr_state, and initial_list_status
        Xc[self.cat_cols] = self.mode_imputer.transform(Xc[self.cat_cols])
        # encode zip code with the encoder fitted in fit().
        Xc["zip_code"] = self.target_encoder.transform(Xc["zip_code"])
        return Xc

    def fit_transform(self, X, y=None):
        """Convenience: fit on (X, y) then transform X."""
        return self.fit(X, y).transform(X)
def _create_feature(cls, conf) -> pd.DataFrame:
    """Build per-SK_ID_CURR target-encoded aggregates from credit-card balance data.

    Returns a frame keyed by SK_ID_CURR whose categorical columns are replaced
    by the mean of their target-encoded values, renamed ``<col>_target_encode``.
    """
    df = Base.get_df(conf)
    df = df.merge(CreditCardBalance.get_df(conf), on="SK_ID_CURR", how="left")
    # fit with train data (rows with a non-null TARGET) and transform both
    # train and test rows with the same fitted mapping.
    train_df = df[df['TARGET'].notnull()].copy()
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = TargetEncoder(cols=categorical_columns).fit(train_df, train_df['TARGET']).transform(df)
    # Collapse the row-level encodings to one mean per applicant.
    df = df.groupby(by=['SK_ID_CURR'], as_index=False).agg({col: 'mean' for col in categorical_columns})
    return df[categorical_columns + ['SK_ID_CURR']].rename(
        columns={col: f"{col}_target_encode" for col in categorical_columns}
    )
def data_prepare(self):
    """Load the selected training features and target-encode the categorical columns.

    Side effects: populates __train, __train_label, __train_feature, __encoder,
    __categorical_columns.
    """
    self.__train = pd.read_csv(os.path.join(self.__input_path, "train_select_feature_df.csv"))
    self.__train_label = self.__train["TARGET"]
    # Drop the target and every SK_ID* identifier column.
    self.__train_feature = self.__train.drop(
        ["TARGET"] + [col for col in self.__train.columns.tolist() if re.search(r"SK_ID", col)], axis=1)
    self.__encoder = TargetEncoder()
    self.__categorical_columns = self.__train_feature.select_dtypes("object").columns.tolist()
    self.__encoder.fit(self.__train_feature[self.__categorical_columns], self.__train_label)
    self.__train_feature[self.__categorical_columns] = (
        self.__encoder.transform(self.__train_feature[self.__categorical_columns])
    )
def categorical_encoding(df_X, y, cat_vars, id_train, method=None):
    """Target-encode the categorical columns of ``df_X`` fitted on the train rows only.

    Parameters
    ----------
    df_X : pd.DataFrame
        Full feature frame (train + validation/test rows).
    y : array-like
        Target values aligned with ``df_X`` rows.
    cat_vars : list
        Names of the categorical columns to encode.
    id_train : array-like
        Positional indices of the training rows used to fit the encoder.
    method : optional
        If None, no encoding is performed and the raw values are returned.

    Returns
    -------
    tuple
        (values ndarray, column index) of the possibly-encoded frame.
    """
    if method is None:
        return df_X.values, df_X.columns
    # API FIX: the `impute_missing` argument was deprecated and then removed
    # from category_encoders; `handle_missing='return_nan'` is the documented
    # replacement for impute_missing=False (missing values stay NaN).
    target_enc = TargetEncoder(cols=cat_vars,
                               drop_invariant=False,
                               return_df=True,
                               handle_missing='return_nan',
                               handle_unknown='error')
    # Fit only on the training rows to avoid target leakage into validation.
    target_enc.fit(df_X.iloc[id_train], pd.Series(y).iloc[id_train])
    df_X = target_enc.transform(df_X)
    return df_X.values, df_X.columns
def transform(self, X):
    """Target-encode self.cols (optionally under alias names), then optionally ordinal-encode.

    NOTE(review): the encoders are fitted here inside transform (using X's own
    target column), and self.cols is mutated when aliases are set — both are
    intentional-looking but unusual; confirm callers rely on this.
    """
    if self.aliases:
        # Duplicate the source columns under their alias names and encode those.
        X[self.aliases] = X[self.cols]
        self.cols = self.aliases
    t_enc = TargetEncoder(cols=self.cols)
    X = t_enc.fit_transform(X, X[self.target_col])
    if not self.ordinal_transform:
        return X
    # Second pass: map the encoded values onto ordinal ranks.
    o_enc = OrdinalEncoder()
    X[self.cols] = o_enc.fit_transform(X[self.cols])
    return X
def data_prepare(self):
    """Load base + residual features, encode categoricals, signed-sentinel-fill numerics, shuffle.

    Side effects: populates the submission frame, train/test feature frames,
    label, categorical/numeric positional indices, and __encoder.
    """
    self.__sample_submission = pd.read_csv(os.path.join(self.__input_path_1, "sample_submission.csv"))
    self.__train = pd.read_csv(os.path.join(self.__input_path_1, "train_feature_df.csv"))
    self.__test = pd.read_csv(os.path.join(self.__input_path_1, "test_feature_df.csv"))
    self.__train_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_train_res.csv"))
    self.__test_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_test_res.csv"))
    self.__train_label = self.__train["TARGET"]
    self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"], axis=1)
    self.__test_feature = self.__test[self.__train_feature.columns]
    # The residual frames duplicate the EXT_SOURCE columns; drop them before concat.
    self.__train_res = self.__train_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
    self.__test_res = self.__test_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
    self.__train_feature = pd.concat([self.__train_feature, self.__train_res], axis=1)
    self.__test_feature = pd.concat([self.__test_feature, self.__test_res], axis=1)
    # Positional indices of the object-dtype (categorical) columns.
    self.__categorical_index = np.where(self.__train_feature.dtypes == "object")[0]
    self.__train_feature.iloc[:, self.__categorical_index] = (
        self.__train_feature.iloc[:, self.__categorical_index].fillna("missing")
    )
    self.__test_feature.iloc[:, self.__categorical_index] = (
        self.__test_feature.iloc[:, self.__categorical_index].fillna("missing")
    )
    # Fit on train only; transform train and test with the same mapping.
    self.__encoder = TargetEncoder()
    self.__encoder.fit(self.__train_feature.iloc[:, self.__categorical_index], self.__train_label)
    self.__train_feature.iloc[:, self.__categorical_index] = (
        self.__encoder.transform(self.__train_feature.iloc[:, self.__categorical_index])
    )
    self.__test_feature.iloc[:, self.__categorical_index] = (
        self.__encoder.transform(self.__test_feature.iloc[:, self.__categorical_index])
    )
    # There are NaNs in test dataset (feature number 77) but there were no NaNs in learn dataset"
    self.__numeric_index = np.where(self.__train_feature.dtypes != "object")[0]
    # Fill numeric NaNs with a sentinel on the far side of the column's median.
    self.__train_feature.iloc[:, self.__numeric_index] = (
        self.__train_feature.iloc[:, self.__numeric_index].apply(
            lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
        )
    )
    self.__test_feature.iloc[:, self.__numeric_index] = (
        self.__test_feature.iloc[:, self.__numeric_index].apply(
            lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
        )
    )
    # Shuffle before blending; not strictly necessary here because the later
    # StratifiedKFold shuffles anyway.
    self.__train_feature, self.__train_label = shuffle(self.__train_feature, self.__train_label)
class EntityEmbeddingTree(BaseEstimator, TransformerMixin):
    """Transformer that maps rows to random-forest leaf indices ("entity embedding").

    fit() target-encodes categoricals, tunes a RandomForest via the external
    optimize_rf helper, and fits it; transform() returns the per-tree leaf ids
    (as strings) for each row.
    """

    def __init__(self, *, numeric_columns, categorical_columns):
        self.__numeric_columns = numeric_columns
        self.__categorical_columns = categorical_columns
        self.__target_encoder, self.__one_hot_encoder = [
            None for _ in range(2)
        ]
        self.__max_target, self.__max_param = [None for _ in range(2)]
        self.__clf = None

    def fit(self, X, y):
        """Encode features, tune and fit the random forest on (X, y)."""
        # Deep copies so the caller's frames are never mutated.
        X = X.copy(deep=True)
        y = y.copy(deep=True)
        self.__target_encoder = TargetEncoder()
        # Sentinel fills: numerics get -9999.0, categoricals get "missing".
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.fit_transform(
            X[self.__categorical_columns], y)
        # optimize_rf is an external tuner returning (best score, best params).
        self.__max_target, self.__max_param = optimize_rf(X, y)
        # NOTE(review): min_samples_leaf is clamped into [0, 1.0]; sklearn
        # rejects 0 and treats floats as fractions — confirm optimize_rf's
        # search range keeps this valid.
        self.__clf = RandomForestClassifier(
            min_samples_leaf=max(
                min(self.__max_param["min_samples_leaf"], 1.0), 0),
            n_estimators=max(int(round(self.__max_param["n_estimators"])), 1))
        self.__clf.fit(X, y)
        gc.collect()
        return self

    def transform(self, X):
        """Return the forest's leaf indices for X as a string-dtype DataFrame."""
        X = X.copy(deep=True)
        # Must mirror the exact fill/encode steps used in fit().
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.transform(
            X[self.__categorical_columns])
        gc.collect()
        return pd.DataFrame(self.__clf.apply(X)).astype(str)

    def fit_transform(self, X, y=None, **fit_params):
        """Convenience: fit on (X, y) then transform X."""
        self.fit(X=X, y=y)
        return self.transform(X)
def remove_outliers(data, columns_config):
    """Drop rows flagged as outliers by an IsolationForest run on a preprocessed copy.

    columns_config supplies the column groups used by the preprocessing
    pipeline; the returned frame keeps only rows the forest labels inlier (1).
    """
    quantitative_columns = columns_config["quantitative_columns"]
    semi_quali_columns = columns_config["semi_quali_columns"]
    qualitative_columns = columns_config["qualitative_columns"]
    target_features_list = columns_config["target_features_list"]
    indicator_features_list = columns_config["indicator_features_list"]
    # NOTE(review): TargetEncoder is called with the column list as the first
    # positional argument — category_encoders' TargetEncoder takes `verbose`
    # first and `cols` by keyword. If this is category_encoders (not a
    # project-local transformer like the others here), it should be
    # TargetEncoder(cols=semi_quali_columns). Confirm which class is imported.
    pipeline = make_pipeline(ExcludeColumnsTransformer(["Id"]),
                             CreateSumTransformer(target_features_list),
                             CreateOneHotTransformer(indicator_features_list),
                             NewHouseTransformer(),
                             BoxCoxTransformer(quantitative_columns),
                             FillnaMeanTransformer(quantitative_columns),
                             TargetEncoder(semi_quali_columns),
                             SimpleOneHotEncoder(qualitative_columns),
                             NormalizeTransformer(quantitative_columns),
                             FillnaMeanMatrixTransformer())
    # Prepare Data Training
    X = data
    y = data[['SalePrice']]
    X = pipeline.fit_transform(X, y)
    # fit the model
    clf = IsolationForest(max_samples=100)
    clf.fit(X)
    # predict() returns 1 for inliers, -1 for outliers.
    outlier_index = clf.predict(X)
    clean_df = data[outlier_index == 1].reset_index(inplace=False, drop=True)
    return clean_df
def mean_encode(columns: Union[List[str], str],
                targets: Union[List[str], str],
                smoothing: float = 1.0,
                min_samples_leaf: int = 1) -> CategoryEncoder:
    """Performs mean target encoding in parallel

    An alias to stl.category_encode(TargetEncoder(smoothing, min_samples_leaf), columns, targets).

    Args:
        columns: list of encoded columns. Treats string as a list of length 1
        targets: list of target columns. Should be provided if encoder uses target.
            Treats string as a list of length 1
        smoothing: smoothing effect to balance categorical average vs prior.
            Higher value means stronger regularization.
            The value must be strictly bigger than 0.
        min_samples_leaf: minimum samples to take category average into account.

    Returns:
        A feature constructor performing mean encoding for each pair
        (column, target) and returning the concatenation.

    Examples:
        >>> stl.mean_encode(['Sex', 'Embarked'], ['Survived', 'Age'])
        >>> stl.mean_encode(['Sex', 'Embarked'], 'Survived', smoothing=1.5, min_samples_leaf=5)
    """
    enc = TargetEncoder(smoothing=smoothing, min_samples_leaf=min_samples_leaf)
    return category_encode(enc, columns=columns, targets=targets)
def gbm_model_crossval(learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda):
    """Objective for Bayesian optimization: mean 5-fold ROC-AUC of an encoded LightGBM pipeline.

    Parameters are the LightGBM hyper-parameters being tuned; n_estimators
    arrives as a float from the optimizer and is rounded to an int. Relies on
    the enclosing scope for the column lists and self.__train_* data.

    Returns
    -------
    float
        Mean cross-validated ROC-AUC.
    """
    estimator = Pipeline([
        # Ordinal-encode one column group, target-encode the other; drop the rest.
        ("ENCODER", ColumnTransformer(
            [("ORD_ENCODER", OrdinalEncoder(categories="auto"), ord_encoder_columns),
             ("TAR_ENCODER", TargetEncoder(cols=tar_encoder_columns), tar_encoder_columns)],
            remainder="drop")),
        # API FIX: np.int was removed in NumPy 1.24 — the builtin int is the
        # documented replacement and is behaviorally identical here.
        ("LGBMCLF", LGBMClassifier(max_depth=1,
                                   learning_rate=learning_rate,
                                   n_estimators=int(np.round(n_estimators)),
                                   subsample=subsample,
                                   colsample_bytree=colsample_bytree,
                                   reg_alpha=reg_alpha,
                                   reg_lambda=reg_lambda,
                                   random_state=7,
                                   n_jobs=-1))
    ])
    cval = cross_val_score(estimator,
                           self.__train_feature,
                           self.__train_label,
                           scoring="roc_auc",
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=7))
    return cval.mean()
def plot_1_6(X, y):
    """
    Evaluates 3 classifiers and plots the results in a bar chart.
    Also compares different category encoders
    """
    classifiers = [
        LogisticRegression(random_state=1),
        SVC(random_state=1),
        RandomForestClassifier(random_state=1)
    ]
    encoders = [
        OneHotEncoder(sparse=False, handle_unknown='ignore'),
        TargetEncoder()
    ]
    # same 3 cross-validation folds (shuffle and random_state=1)
    kf = KFold(n_splits=3, shuffle=True, random_state=1)
    # Mean ROC-AUC for every (classifier, encoder) pair, classifier-major order.
    results = [
        np.mean(cross_val_score(flexible_pipeline(categorical, model, enc),
                                X, y, cv=kf, n_jobs=-1, scoring='roc_auc'))
        for model in classifiers
        for enc in encoders
    ]
    # Rows are classifiers, columns are encoders.
    heatmap(['OneHot', 'Target'],
            ['Logistic', 'SVM', 'Random Forest'],
            [results[0:2], results[2:4], results[4:6]])
def data_prepare(self):
    """Impute/encode the training features, drop zero-variance columns, and write a mutual-information ranking CSV.

    Side effects: populates __train, __train_label, __train_feature, the column
    lists, __imputer, __encoder, __unsupervise_selector; writes
    train_feature_df_fs_mi.csv to __output_path.
    """
    self.__train = pd.read_csv(
        os.path.join(self.__input_path, "train_feature_df.csv"))
    self.__train_label = self.__train["TARGET"]
    self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"],
                                             axis=1)
    self.__numeric_columns = self.__train_feature.select_dtypes(
        exclude="object").columns.tolist()
    self.__categorical_columns = self.__train_feature.select_dtypes(
        include="object").columns.tolist()
    # Median-impute numeric NaNs.
    self.__imputer = Imputer(strategy="median")
    self.__imputer.fit(self.__train_feature[self.__numeric_columns])
    self.__train_feature[self.__numeric_columns] = (
        self.__imputer.transform(
            self.__train_feature[self.__numeric_columns]))
    # Sentinel-fill categorical NaNs so the encoder sees a real level.
    self.__train_feature[self.__categorical_columns] = (
        self.__train_feature[self.__categorical_columns].fillna("missing"))
    self.__encoder = TargetEncoder()
    self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                       self.__train_label)
    self.__train_feature[self.__categorical_columns] = (
        self.__encoder.transform(
            self.__train_feature[self.__categorical_columns]))
    # Unsupervised feature filter: drop zero-variance columns, keeping names.
    self.__unsupervise_selector = VarianceThreshold()
    self.__unsupervise_selector.fit(self.__train_feature)
    self.__train_feature = (pd.DataFrame(
        self.__unsupervise_selector.transform(self.__train_feature),
        columns=[
            i for i, j in zip(self.__train_feature.columns,
                              self.__unsupervise_selector.get_support())
            if j == 1
        ]))
    # Supervised feature filter: rank features by mutual information with the
    # target and persist the ranking for later selection.
    pd.concat([
        pd.Series(self.__train_feature.columns).to_frame("feature"),
        pd.Series(
            mutual_info_classif(self.__train_feature,
                                self.__train_label)).to_frame("mi")
    ], axis=1).to_csv(os.path.join(self.__output_path,
                                   "train_feature_df_fs_mi.csv"),
                      index=False)
def data_prepare(self):
    """Load the top-200 features by saved importance, then encode and impute train/test.

    Side effects: populates the importance frame, top-column list, train/test
    frames, label, feature frames, __encoder, and __filler.
    """
    self.__feature_importance = pd.read_csv(
        os.path.join(self.__input_path,
                     "feature_importance_feature_data_V5.csv"))
    # Average importance across runs, then sort descending.
    self.__feature_importance = (self.__feature_importance.groupby([
        "feature"
    ])["importance"].mean().to_frame("importance").reset_index(
        drop=False)).sort_values("importance",
                                 ascending=False).reset_index(drop=True)
    # Keep only the 200 most important feature names.
    self.__feature_top_column = list(self.__feature_importance.iloc[0:200,
                                                                    0])
    self.__train = pd.read_csv(
        os.path.join(self.__input_path, "train_select_feature_df.csv"),
        usecols=self.__feature_top_column + ["TARGET"])
    self.__test = pd.read_csv(os.path.join(self.__input_path,
                                           "test_select_feature_df.csv"),
                              usecols=self.__feature_top_column)
    self.__train_label = self.__train["TARGET"]
    self.__train_feature = self.__train.drop("TARGET", axis=1)
    self.__test_feature = self.__test[
        self.__train_feature.columns.tolist()]
    # encoder: fit on train, transform train and test with the same mapping.
    self.__categorical_columns = self.__train_feature.select_dtypes(
        include="object").columns.tolist()
    self.__encoder = TargetEncoder()
    self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                       self.__train_label)
    self.__train_feature[
        self.__categorical_columns] = self.__encoder.transform(
            self.__train_feature[self.__categorical_columns])
    self.__test_feature[
        self.__categorical_columns] = self.__encoder.transform(
            self.__test_feature[self.__categorical_columns])
    # filler: median-impute the numeric columns with train statistics.
    self.__numeric_columns = self.__train_feature.select_dtypes(
        exclude="object").columns.tolist()
    self.__filler = Imputer(strategy="median")
    self.__filler.fit(self.__train_feature[self.__numeric_columns])
    self.__train_feature[self.__numeric_columns] = self.__filler.transform(
        self.__train_feature[self.__numeric_columns])
    self.__test_feature[self.__numeric_columns] = self.__filler.transform(
        self.__test_feature[self.__numeric_columns])
def fit(self, data: pd.DataFrame):
    """Fit one TargetEncoder per target column over self.cols.

    Each fitted encoder is stored in self.encoders under the key
    ``enc_<target>``; missing categories encode to NaN.
    """
    log.info("TargetEncode fit: %s", self.targets)
    for target in self.targets:
        key = "enc_{}".format(target)
        encoder = TargetEncoder(cols=self.cols, handle_missing="return_nan")
        self.encoders[key] = encoder
        log.info("Target encoding fit for target: %s", target)
        encoder.fit(data[self.cols], data[target])
def getTestTrainSlipt(self):
    """Return (X_train, X_test, y_train, y_test), target-encoding the configured columns first.

    Uses self.testTrainSplit for a random split when set; otherwise returns the
    pre-supplied self.testX / self.testY hold-out.

    Raises
    ------
    Exception
        If neither testX nor testTrainSplit was provided.
    """
    # If both testX and testTrainSplit are not passed throw exception.
    if (self.testX is None) and (self.testTrainSplit is None):
        raise Exception("Please pass testX or testTrainSplit")
    if self.targetEncodeCols:
        for col in self.targetEncodeCols:
            encoder = TargetEncoder()
            # BUG FIX: TargetEncoder requires the target to fit — the original
            # called fit_transform(self.X[col]) without y. Fit on the training
            # data and target, then only *transform* the hold-out set with the
            # same fitted mapping (re-fitting on test both leaked information
            # and lacked a target).
            self.X[col] = encoder.fit_transform(self.X[col], self.Y)
            # BUG FIX: `if (self.testX)` raises ValueError on a DataFrame
            # (ambiguous truth value); test identity against None instead.
            if self.testX is not None:
                self.testX[col] = encoder.transform(self.testX[col])
    if self.testTrainSplit:
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.Y, test_size=self.testTrainSplit, random_state=7)
        return X_train, X_test, y_train, y_test
    else:
        return self.X, self.testX, self.Y, self.testY
def clean_train_data_target_encoded(data):
    """Target-encode the categorical columns of a training frame.

    The last column is treated as the target. Returns a tuple of the encoded
    frame (features + target) and the fitted encoder so the same mapping can
    be reused on unseen data.
    """
    data = data.reset_index(drop=True)
    # Split features (all but last column) from the target (last column).
    train_X = data.iloc[:, :-1]
    train_y = data.iloc[:, -1].reset_index(drop=True)
    train_X = process_features(train_X)
    # Heavy smoothing pulls rare categories toward the global mean.
    encoder = TargetEncoder(cols=["Hair Color", "Wears Glasses", "University Degree", "Gender", "Country", "Profession", "Housing Situation", "Satisfation with employer"],
                            smoothing=300)
    encoder.fit(train_X, train_y)
    encoded_X = encoder.transform(train_X, train_y).reset_index(drop=True)
    data2 = pd.concat([encoded_X, train_y.reset_index(drop=True)], axis=1)
    return (data2, encoder)
def frontend_preproc(df, y): ''' Function that produces the preprocessing of the DataFrame before applying the model on the front-end. :df: concat of df_input by the user and X features of the model :y: target ''' ### Feature Engineering ohe_cols = ['gearbox', 'fuel_type', 'warranty', 'dealer', 'doors'] # OHE ohe = OneHotEncoder(categories='auto') feature_arr = ohe.fit_transform(df[ohe_cols]).toarray() feature_labels = ohe.categories_ # Using a dictionary to produce all the new OHE columns feature_cols = [] for k, v in dict(zip(ohe_cols, feature_labels)).items(): for i in v: el = k + '_' + str(i) feature_cols.append(el) ohe_features = pd.DataFrame(feature_arr, columns=feature_cols) df = pd.concat([df, ohe_features], axis=1) df = df.drop(ohe_cols, axis=1) # Target Encoding cat_cols = df.select_dtypes(exclude=["number"]).columns cols_encoded = list(map(lambda c: c + '_encoded', cat_cols)) t_encoder = TargetEncoder() t_encoder.fit(df[1:][cat_cols], y) df[cols_encoded] = t_encoder.transform(df[cat_cols]) df = df.drop(cat_cols, axis=1) # Column Transformation: QuantileTransformer qt = QuantileTransformer(n_quantiles=500, output_distribution='normal', random_state=33) data = qt.fit_transform(df) df = pd.DataFrame(data, columns=df.columns) return df
class ScatterPlot(object):
    """Target-encode a training set, project it to 2-D with PCA, and save a scatter plot."""

    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        self.__train = None
        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__encoder = None
        self.__pca, self.__t_sne = [None for _ in range(2)]

    def data_read(self):
        """Load train.csv and split it into features (all-string) and target."""
        self.__train = pd.read_csv(os.path.join(self.__input_path, "train.csv"))
        self.__train = self.__train.drop(["id"], axis=1)
        self.__train_feature, self.__train_label = (self.__train.drop(
            ["target"], axis=1).copy(deep=True), self.__train["target"].copy(deep=True))
        # Treat every feature as categorical for the target encoder.
        self.__train_feature = self.__train_feature.astype(str)

    def data_prepare(self):
        """Target-encode the features and reduce them to two PCA components."""
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature, self.__train_label)
        self.__train_feature = self.__encoder.transform(self.__train_feature)
        # Fixed seed keeps the projection reproducible across runs.
        self.__pca = PCA(n_components=2, random_state=7)
        self.__train_feature = self.__pca.fit_transform(self.__train_feature)
        self.__train_feature = pd.DataFrame(self.__train_feature, columns=["col_1", "col_2"])
        # t-SNE alternative kept for reference:
        # self.__t_sne = TSNE(verbose=True, random_state=7)
        # self.__train_feature = self.__t_sne.fit_transform(self.__train_feature)
        # self.__train_feature = pd.DataFrame(self.__train_feature, columns=["col_1", "col_2"])

    def scatter_plot(self):
        """Plot the two components colored by the target and save PCA.png."""
        _, ax = plt.subplots(figsize=(16, 9))
        ax = sns.scatterplot(x="col_1",
                             y="col_2",
                             hue=self.__train_label,
                             data=self.__train_feature,
                             ax=ax)
        ax.get_figure().savefig(os.path.join(self.__output_path, "PCA.png"))