target_encoded_features = list(
    df_stat[(df_stat.unique_values > 2) & (df_stat.unique_values <= 200)]['index'])

# In[12]:

target_encoded_features

# In[19]:

import category_encoders as ce

# In[20]:

target_encoder = ce.TargetEncoder(cols=target_encoded_features)

# In[21]:

target_encoder.fit(df, y=df.loan_status)

# In[22]:

encoded_df = target_encoder.transform(df)

# In[23]:

encoded_df.head()

# In[24]:
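# For reference, a minimal hand-rolled sketch of the idea behind ce.TargetEncoder:
# each category maps to a blend of its own target mean and the global mean,
# weighted by category size. This is an illustrative approximation, not the
# library's exact sigmoid-based smoothing.
import pandas as pd

def smoothed_target_mean(col, target, smoothing=1.0):
    prior = target.mean()
    stats = target.groupby(col).agg(['mean', 'count'])
    # Large categories keep their own mean; rare ones shrink toward the prior.
    weight = stats['count'] / (stats['count'] + smoothing)
    return col.map(prior * (1 - weight) + stats['mean'] * weight)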
import category_encoders as ce
import sys

sys.path.append('../encoders/')
from ceng import CENGEncoder
from cesamo import CESAMOEncoder
from entity_embedding import EntityEmbeddingEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder

Encoders = {
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(),
    'AgingPP': AgingPPEncoder(),
    'SimplePP': SimplePPEncoder(),
    'CESAMOEncoder': CESAMOEncoder()
}
"""END: Import encoders"""

"""START: Import models"""
try:
    import sklearn.linear_model as lm
    import sklearn.svm as svm
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.neural_network import MLPRegressor
    from sklearn.gaussian_process.kernels import RBF
# We initialize each encoder explicitly here because that gives us the freedom
# to configure each one with any settings we want.
encoders = [
    # category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    # category_encoders.HelmertEncoder(),
    category_encoders.JamesSteinEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.MEstimateEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.OrdinalEncoder(),
    # category_encoders.PolynomialEncoder(),
    # category_encoders.SumEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.WOEEncoder()
]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
    non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values)
    for encoder in encoders:
        print("Encoding:", dataset_name, y.name, encoder.__class__.__name__)
        folds, fit_encoder_time, score_encoder_time = train_encoder(
            X, y, fold_count, encoder)
text_cols = dta.select_dtypes('object')
# Replace runs of non-word characters so category levels are consistent
text_cols = text_cols.apply(lambda col: col.str.replace('\\W+', '_'), axis=1)
dta = pd.concat([numeric_cols, text_cols], axis=1, ignore_index=False)

# Missingness indicator features, added before imputation
dta['msg_construction_year'] = dta.construction_year.isnull().astype(int)
dta['msg_population'] = dta.population.isnull().astype(int)

#%% separate data again
train = dta.loc[dta.id.isin(train.id)]
test = dta.loc[dta.id.isin(test.id)]

#%% categorical encoding and imputation
targ_enc = ce.TargetEncoder(cols=None)
targ_enc.fit(train, train_labels['status_group'])
train = targ_enc.transform(train)

imp = IterativeImputer(max_iter=10, max_value=2013)
imp.fit(train)
train = pd.DataFrame(imp.transform(train), columns=train.columns)
train['construction_year'] = train['construction_year'].round(0)
train['permit'] = train['permit'].round(0)

#%% rf
rf = RandomForestClassifier(n_estimators=250,
label_transformed = label_encoder.fit_transform(df_bank)
print('computation time of label:', time.time() - start_time)
print('Memory usage after encoding: ',
      round(label_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

# hash encoding with md5 hash function
start_time = time.time()
hash_encoder = ce.HashingEncoder(cols=cat_cols_bank, n_components=9)
hash_transformed = hash_encoder.fit_transform(df_bank)
print('computation time of hash:', time.time() - start_time)
print('Memory usage after encoding: ',
      round(hash_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

# target encoding
start_time = time.time()
target_encoder = ce.TargetEncoder(cols=cat_cols_bank, smoothing=1)
mean_target_transformed = target_encoder.fit_transform(df_bank[cat_cols_bank], df_bank['y'])
print('computation time of target:', time.time() - start_time)
print('Memory usage after encoding: ',
      round(mean_target_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

# WoE
start_time = time.time()
woe_encoder = ce.WOEEncoder(cols=cat_cols_bank)
woe_encoder_transformed = woe_encoder.fit_transform(df_bank[cat_cols_bank], df_bank['y'])
print('computation time of WOE :', time.time() - start_time)
train_y = train['target']
test_id = test['id']
train.drop(['target', 'id'], axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)

from sklearn.metrics import roc_auc_score

cat_feat_to_encode = train.columns.tolist()
smoothing = 0.20

import category_encoders as ce
oof = pd.DataFrame([])
from sklearn.model_selection import StratifiedKFold

# Out-of-fold target encoding: each fold is encoded by an encoder fit on the other folds
for tr_idx, oof_idx in StratifiedKFold(
        n_splits=5, random_state=2020, shuffle=True).split(train, train_y):
    ce_target_encoder = ce.TargetEncoder(cols=cat_feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(train.iloc[tr_idx, :], train_y.iloc[tr_idx])
    oof = oof.append(ce_target_encoder.transform(train.iloc[oof_idx, :]),
                     ignore_index=False)

# Refit on the full training set to encode the test set
ce_target_encoder = ce.TargetEncoder(cols=cat_feat_to_encode, smoothing=smoothing)
ce_target_encoder.fit(train, train_y)
train = oof.sort_index()
test = ce_target_encoder.transform(test)

from sklearn import linear_model
glm = linear_model.LogisticRegression(random_state=1, solver='lbfgs', max_iter=2020,
                                      fit_intercept=True, penalty='none',
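# The fold loop above is a common out-of-fold pattern. A hedged, reusable
# sketch (assuming a classification target Series and a fresh encoder per fold):
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import StratifiedKFold

def oof_target_encode(X, y, cols, smoothing=0.2, n_splits=5, seed=2020):
    # Each row is encoded by an encoder that never saw that row's target.
    oof = X.copy()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for tr_idx, val_idx in skf.split(X, y):
        enc = ce.TargetEncoder(cols=cols, smoothing=smoothing)
        enc.fit(X.iloc[tr_idx], y.iloc[tr_idx])
        oof.iloc[val_idx] = enc.transform(X.iloc[val_idx]).values
    # A final encoder fit on all rows is kept for transforming unseen data.
    full_enc = ce.TargetEncoder(cols=cols, smoothing=smoothing).fit(X, y)
    return oof, full_enc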
def target_encode(X, y, categorical_columns):
    """Fit a target encoder on (X, y); return the encoded X and the fitted encoder."""
    ce_target_encoder = ce.TargetEncoder(cols=categorical_columns, min_samples_leaf=10)
    ce_target_encoder.fit(X, y)
    X = ce_target_encoder.transform(X)
    return X, ce_target_encoder
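# Hypothetical usage of the helper above ('city' and 'channel' are made-up
# column names, not from the original data):
X_train_enc, fitted_enc = target_encode(X_train, y_train, ['city', 'channel'])
X_valid_enc = fitted_enc.transform(X_valid)  # reuse the fitted encoder on new rows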
        'color_72', 'color_73', 'color_74', 'color_75', 'color_76', 'color_77',
        'color_78', 'color_79', 'color_80', 'color_81', 'color_82', 'color_83',
        'color_84', 'color_85', 'color_86', 'color_87', 'color_88', 'color_89',
        'color_90', 'color_91', 'color_92', 'color_93', 'color_94', 'color_95',
        'color_96', 'color_97', 'color_98'
    ]
]
doPermutationTests(X, y, features, 'sum')

encoder = ce.LeaveOneOutEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'leaveoneout'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'leaveoneout')

encoder = ce.TargetEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'target'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'target')

encoder = ce.OrdinalEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'ordinal'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'ordinal')

encoder = ce.WOEEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'woe'))
features = ['diameter', 'color']
def test_check_preprocessing_1(self):
    """
    Test check preprocessing on multiple preprocessing
    """
    train = pd.DataFrame({'Onehot1': ['A', 'B', 'A', 'B'],
                          'Onehot2': ['C', 'D', 'C', 'D'],
                          'Binary1': ['E', 'F', 'E', 'F'],
                          'Binary2': ['G', 'H', 'G', 'H'],
                          'Ordinal1': ['I', 'J', 'I', 'J'],
                          'Ordinal2': ['K', 'L', 'K', 'L'],
                          'BaseN1': ['M', 'N', 'M', 'N'],
                          'BaseN2': ['O', 'P', 'O', 'P'],
                          'Target1': ['Q', 'R', 'Q', 'R'],
                          'Target2': ['S', 'T', 'S', 'T'],
                          'other': ['other', np.nan, 'other', 'other']})
    y = pd.DataFrame(data=[0, 1, 0, 0], columns=['y'])

    enc_onehot = ce.OneHotEncoder(cols=['Onehot1', 'Onehot2']).fit(train)
    train_onehot = enc_onehot.transform(train)
    enc_binary = ce.BinaryEncoder(cols=['Binary1', 'Binary2']).fit(train_onehot)
    train_binary = enc_binary.transform(train_onehot)
    enc_ordinal = ce.OrdinalEncoder(cols=['Ordinal1', 'Ordinal2']).fit(train_binary)
    train_ordinal = enc_ordinal.transform(train_binary)
    enc_basen = ce.BaseNEncoder(cols=['BaseN1', 'BaseN2']).fit(train_ordinal)
    train_basen = enc_basen.transform(train_ordinal)
    enc_target = ce.TargetEncoder(cols=['Target1', 'Target2']).fit(train_basen, y)

    input_dict1 = dict()
    input_dict1['col'] = 'Onehot2'
    input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan],
                                       index=['C', 'D', 'missing'])
    input_dict1['data_type'] = 'object'

    input_dict2 = dict()
    input_dict2['col'] = 'Binary2'
    input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan],
                                       index=['G', 'H', 'missing'])
    input_dict2['data_type'] = 'object'

    input_dict = dict()
    input_dict['col'] = 'state'
    input_dict['mapping'] = pd.Series(data=['US', 'FR-1', 'FR-2'],
                                      index=['US', 'FR', 'FR'])
    input_dict['data_type'] = 'object'

    input_dict3 = dict()
    input_dict3['col'] = 'Ordinal2'
    input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan],
                                       index=['K', 'L', 'missing'])
    input_dict3['data_type'] = 'object'
    list_dict = [input_dict2, input_dict3]

    y = pd.DataFrame(data=[0, 1], columns=['y'])
    train = pd.DataFrame({'city': ['chicago', 'paris'],
                          'state': ['US', 'FR'],
                          'other': ['A', 'B']})
    enc = ColumnTransformer(
        transformers=[('onehot', skp.OneHotEncoder(), ['city', 'state'])],
        remainder='drop')
    enc.fit(train, y)
    wrong_prepro = skp.OneHotEncoder().fit(train, y)

    check_preprocessing([enc_onehot, enc_binary, enc_ordinal, enc_basen,
                         enc_target, input_dict1, list_dict])
    for preprocessing in [enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target]:
        check_preprocessing(preprocessing)

    check_preprocessing(input_dict2)
    check_preprocessing(enc)
    check_preprocessing(None)

    with self.assertRaises(Exception):
        check_preprocessing(wrong_prepro)
# %% id="kcMLnvJxZIuD"
# One hot encoding for low cardinality features + Brand
col_to_encode = ['Location', 'Fuel_Type', 'Brand']
oh_encoder = ce.OneHotEncoder(cols=col_to_encode, use_cat_names=True)
oh_encoder.fit(X_train)

# Encoding train set
X_train = oh_encoder.transform(X_train)
# Encoding test set
X_test = oh_encoder.transform(X_test)

# %%
# Target encoding for high cardinality features
col_to_encode = X_train.select_dtypes("object").columns
encoder = ce.TargetEncoder(cols=col_to_encode)
encoder.fit(X_train, y_train)

# Encoding train set
X_train = encoder.transform(X_train)
# Encoding test set
X_test = encoder.transform(X_test)

# %% [markdown] id="bw10NXJkIuLs"
# ## Feature Selection

# %% id="7u-fc0svIuLt"
# Keep features whose correlation with Price exceeds +/- 0.20
corr_price = X_train.join(y_train).corr()['Price']
index = corr_price[(corr_price < -0.20) | (corr_price > 0.20)].index
df = job.sample(frac=1, random_state=12)

#%% different embedding
# one-hot encoding
one_hot_encoder = ce.OneHotEncoder(cols=['Job'])
df_one_hot_transformed = one_hot_encoder.fit_transform(df)
print(df_one_hot_transformed.iloc[0:7, ])

# label encode
label_encoder = ce.OrdinalEncoder(cols=['Job'])
df_label_transformed = label_encoder.fit_transform(df)
print(df_label_transformed.iloc[0:7, ])

# hash encoding with md5 hash function
hash_encoder = ce.HashingEncoder(cols=['Job'], n_components=7)
hash_transformed = hash_encoder.fit_transform(df)
print(hash_transformed.iloc[0:7, ])

# target encoding
target_encoder = ce.TargetEncoder(cols='Job', smoothing=1)
mean_target_transformed = target_encoder.fit_transform(df['Job'], df['Target'])
print(mean_target_transformed.iloc[0:7, ])

# WoE
woe_encoder = ce.WOEEncoder(cols='Job')
woe_encoder_transformed = woe_encoder.fit_transform(df['Job'], df['Target'])
print(woe_encoder_transformed.iloc[0:7, ])

y = df[df['Job'] == 'student']
y = df_car[target_column].values.ravel()
X = df_car.drop(columns_to_drop, axis=1)

# Create Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

########################################################################################################################
# Make Pipeline
########################################################################################################################

# Instantiate Transformers
scaler = RobustScaler()
encoder = ce.TargetEncoder(cols=cat_columns)

# Add Transformers to Pipeline
num_transformer = make_pipeline(scaler)
cat_transformer = make_pipeline(encoder)

# Create Preprocessing Pipeline
preprocessor = ColumnTransformer(
    transformers=[("num", num_transformer, num_columns),
                  ("cat", cat_transformer, cat_columns)])

# Model
model_lr = LogisticRegression(class_weight='balanced',
                              solver='lbfgs',
                              random_state=123,
                              max_iter=10_000)
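# A sketch of how the pieces above would typically be chained and fit,
# assuming the standard scikit-learn API; chaining the preprocessor with the
# model ensures the target encoder only sees training targets during fitting.
pipeline_lr = make_pipeline(preprocessor, model_lr)
pipeline_lr.fit(X_train, y_train)
print("Test accuracy:", pipeline_lr.score(X_test, y_test))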
def transform_data(X, y=None, test=False):
    """
    Prepare the final dataset with all features.

    Arguments
    ---------
    X : dataframe with preprocessed features and target variable
    y : target series (only used when test is False)
    test : boolean; if False, X is the training set;
           if True, X is the test set
    """
    config = load_yaml("./config.yaml")
    columns = list(X.columns)
    log_cols = config["transform"]["log_cols"]
    log1p_cols = config["transform"]["log1p_cols"]
    boxcox1p_cols = config["transform"]["boxcox1p_cols"]
    onehot_cols = config["transform"]["onehot_cols"]
    targetencode_cols = config["transform"]["targetencode_cols"]
    log_target = config["transform"]["log_target"]

    # generate time features (only relevant for time series)
    # TODO: make datetime column identifiable from config file
    if "timestamp" in columns:
        # adjust the desirable format accordingly
        X.timestamp = pd.to_datetime(X.timestamp, format="%Y-%m-%d %H:%M:%S")
        X["hour"] = X.timestamp.dt.hour
        X["weekday"] = X.timestamp.dt.weekday
        if not test:
            X.sort_values("timestamp", inplace=True)
            X.reset_index(drop=True, inplace=True)

    # TODO: make cols identified from config file
    if log_cols:
        for col in log_cols:
            # this will replace the columns with their log values
            X[col] = np.log(X[col])
    if log1p_cols:
        for col in log1p_cols:
            # this will replace the columns with their log1p values
            X[col] = np.log1p(X[col])
    if boxcox1p_cols:
        for col in boxcox1p_cols:
            if col in columns:
                print("taking the log of " + str(col))
                # this will replace the columns with their boxcox1p values
                X[col] = boxcox1p(X[col], 0.15)

    # robust scaler (assign the result back so the scaling actually takes effect)
    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    if not test:
        global robust_scaler
        robust_scaler = RobustScaler()
        X[numeric_cols] = robust_scaler.fit_transform(X[numeric_cols])
    else:
        X[numeric_cols] = robust_scaler.transform(X[numeric_cols])

    # transforming target
    if log_target and not test:
        y = np.log1p(y)

    # target encoding
    if targetencode_cols:
        if not test:
            global target_encoder
            target_encoder = ce.TargetEncoder(cols=targetencode_cols)
            X = target_encoder.fit_transform(X, y)
        else:
            X = target_encoder.transform(X)

    if test:
        return X
    else:
        return X, y
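# A sketch of how transform_data would be called on both splits, given the
# globals it sets at fit time (X_train, y_train and X_test are assumed names):
X_train_t, y_train_t = transform_data(X_train, y_train, test=False)
X_test_t = transform_data(X_test, test=True)  # reuses the fitted scaler/encoder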
def test_inverse_transform_26(self):
    """
    Test multiple dict encoding
    """
    train = pd.DataFrame({
        'Onehot1': ['A', 'B', 'A', 'B'],
        'Onehot2': ['C', 'D', 'C', 'D'],
        'Binary1': ['E', 'F', 'E', 'F'],
        'Binary2': ['G', 'H', 'G', 'H'],
        'Ordinal1': ['I', 'J', 'I', 'J'],
        'Ordinal2': ['K', 'L', 'K', 'L'],
        'BaseN1': ['M', 'N', 'M', 'N'],
        'BaseN2': ['O', 'P', 'O', 'P'],
        'Target1': ['Q', 'R', 'Q', 'R'],
        'Target2': ['S', 'T', 'S', 'T'],
        'other': ['other', np.nan, 'other', 'other']
    })

    test = pd.DataFrame(
        {
            'Onehot1': ['A', 'B', 'A'],
            'Onehot2': ['C', 'D', 'ZZ'],
            'Binary1': ['E', 'F', 'F'],
            'Binary2': ['G', 'H', 'ZZ'],
            'Ordinal1': ['I', 'J', 'J'],
            'Ordinal2': ['K', 'L', 'ZZ'],
            'BaseN1': ['M', 'N', 'N'],
            'BaseN2': ['O', 'P', 'ZZ'],
            'Target1': ['Q', 'R', 'R'],
            'Target2': ['S', 'T', 'ZZ'],
            'other': ['other', '123', np.nan]
        },
        index=['index1', 'index2', 'index3'])

    expected = pd.DataFrame(
        {
            'Onehot1': ['A', 'B', 'A'],
            'Onehot2': ['C', 'D', 'missing'],
            'Binary1': ['E', 'F', 'F'],
            'Binary2': ['G', 'H', 'missing'],
            'Ordinal1': ['I', 'J', 'J'],
            'Ordinal2': ['K', 'L', 'missing'],
            'BaseN1': ['M', 'N', 'N'],
            'BaseN2': ['O', 'P', np.nan],
            'Target1': ['Q', 'R', 'R'],
            'Target2': ['S', 'T', 'NaN'],
            'other': ['other', '123', np.nan]
        },
        index=['index1', 'index2', 'index3'])

    y = pd.DataFrame(data=[0, 1, 0, 0], columns=['y'])

    enc_onehot = ce.OneHotEncoder(cols=['Onehot1', 'Onehot2']).fit(train)
    train_onehot = enc_onehot.transform(train)
    enc_binary = ce.BinaryEncoder(cols=['Binary1', 'Binary2']).fit(train_onehot)
    train_binary = enc_binary.transform(train_onehot)
    enc_ordinal = ce.OrdinalEncoder(cols=['Ordinal1', 'Ordinal2']).fit(train_binary)
    train_ordinal = enc_ordinal.transform(train_binary)
    enc_basen = ce.BaseNEncoder(cols=['BaseN1', 'BaseN2']).fit(train_ordinal)
    train_basen = enc_basen.transform(train_ordinal)
    enc_target = ce.TargetEncoder(cols=['Target1', 'Target2']).fit(train_basen, y)

    input_dict1 = dict()
    input_dict1['col'] = 'Onehot2'
    input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan],
                                       index=['C', 'D', 'missing'])
    input_dict1['data_type'] = 'object'

    input_dict2 = dict()
    input_dict2['col'] = 'Binary2'
    input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan],
                                       index=['G', 'H', 'missing'])
    input_dict2['data_type'] = 'object'

    input_dict3 = dict()
    input_dict3['col'] = 'Ordinal2'
    input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan],
                                       index=['K', 'L', 'missing'])
    input_dict3['data_type'] = 'object'
    list_dict = [input_dict2, input_dict3]

    result1 = enc_onehot.transform(test)
    result2 = enc_binary.transform(result1)
    result3 = enc_ordinal.transform(result2)
    result4 = enc_basen.transform(result3)
    result5 = enc_target.transform(result4)

    original = inverse_transform(result5, [
        enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target,
        input_dict1, list_dict
    ])
    pd.testing.assert_frame_equal(expected, original)
Encoders = {
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'WOE': ce.WOEEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'AgingPP': AgingPPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'SimplePP': SimplePPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'CESAMOEncoder': CESAMOEncoder()
}

if target_flag == 0:
def target_encode(df, target):
    """Target-encode every column of df against the given target."""
    encoder = category_encoders.TargetEncoder(cols=list(df))
    encoder.fit(df, target)
    df_targ_enc = encoder.transform(df)
    return df_targ_enc
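# Hypothetical usage ('brand' and 'region' are illustrative column names):
df_encoded = target_encode(df[['brand', 'region']], df['price'])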
X_test = df[(Fecha_inicial_test <= df['Fc.Corte'])
            & (df['Fc.Corte'] < Fecha_final_test)]
y_train = df[list_targets[0]][X_train.index]
y_test = df[list_targets[0]][X_test.index]

# The final hold-out data comes from the year 2020
X_prueba_campo = df[(Fecha_inicial_prueba <= df['Fc.Corte'])
                    & (df['Fc.Corte'] < Fecha_final_prueba)]
y_prueba_campo = df[list_targets[0]][X_prueba_campo.index]

X = df[list_predictors]
X_muestras = Data_Muestras[list_predictors]
y_muestras = Data_Muestras[list_targets[0]]

# Split off the categorical variables
train = X_train[predictores_categoricos]
test = X_test[predictores_categoricos]

# Apply the target encoder to them
encoder_mean = ce.TargetEncoder(cols=predictores_categoricos, handle_unknown='ignore')
OH_cols_train = encoder_mean.fit_transform(train[predictores_categoricos], y_train)
OH_cols_test = encoder_mean.transform(test[predictores_categoricos], y_test)

# Drop the categorical columns from the data so we can replace them with the encoded ones
num_X_train = X_train[predictores_numericos]
num_X_test = X_test[predictores_numericos]

# Concatenate
X_train_escalable = pd.concat([num_X_train, OH_cols_train], axis=1)
X_test_escalable = pd.concat([num_X_test, OH_cols_test], axis=1)

# Scale the numeric data
scaler = StandardScaler()
Num_X_Scaler_train = pd.DataFrame(scaler.fit_transform(X_train_escalable))
Num_X_Scaler_test = pd.DataFrame(scaler.transform(X_test_escalable))

# Scaling drops the indices, so we restore them
Num_X_Scaler_train.index = X_train.index
fig.update_yaxes(title_text='Percent')
fig.update_layout(height=500,
                  width=700,
                  title_text='Exited Percentage by Gender',
                  yaxis={'ticksuffix': '%'})
fig.show()

# %% [markdown] id="rI5u37Z0mPYj"
# Female customers have a higher churn rate (25%) than male customers (16.5%).

# %% [markdown] id="7S_v5Qi1nD32"
# ### Heatmap Correlation

# %% id="iyvBwv5Goy2Y"
encoder = ce.TargetEncoder()
df_temp = encoder.fit_transform(df.drop(columns='Exited'), df['Exited'])
df_corr = df_temp.join(df['Exited']).corr()

fig = ff.create_annotated_heatmap(z=df_corr.values,
                                  x=list(df_corr.columns),
                                  y=list(df_corr.index),
                                  annotation_text=df_corr.round(2).values,
                                  showscale=True,
                                  colorscale='Viridis')
fig.update_layout(height=600, width=800, title_text='Feature Correlation')
fig.update_xaxes(side='bottom')
fig.show()

# %% [markdown] id="rOzxBfS7nLJp"
# The highest correlation with the target is the Surname feature (0.36), followed by the Age feature at 0.29.
plot.tick_params(labelsize=7)
plt.show()

merge_data.drop(['Cabin'], axis=1, inplace=True)

# # get family name
# merge_data['Name'] = merge_data['Name'].apply(lambda x: x.split(', ')[1])
merge_data['familyname'] = merge_data['Name'].apply(lambda x: x.split(' ')[1])
familyname_ratio = merge_data.groupby('familyname').mean()[[
    'Survived'
]].sort_values(by='Survived', ascending=False)
sns.barplot(familyname_ratio.index, familyname_ratio.Survived)
plt.show()
# --> shows a large difference between family names

# drop Name and familyname; add a survival ratio per family name via target encoding instead
target_encoder = ce.TargetEncoder()
# the encoder must be fit before transforming, so use fit_transform here
merge_data['survival_ratio'] = target_encoder.fit_transform(
    merge_data.familyname, merge_data.Survived).values
merge_data.drop(['Name', 'familyname'], axis=1, inplace=True)

# # Age: categorical, Embarked: categorical
# # Age: [,16] [17,40] [41,60] [61,]
# merge_data['Age'] = pd.cut(merge_data.Age, bins=[0, 16, 40, 60, 120],
#                            labels=['kid', 'youth', 'middle-aged', 'elderly'])

# # Age labeling---------/
merge_data = labeling(merge_data, ['Pclass'])
merge_data = labeling(merge_data, ['Sex'])
# merge_data = labeling(merge_data, ['Ticket'])
merge_data = labeling(merge_data, ['Ticket_alpha'])
# merge_data = labeling(merge_data, ['Cabin'])
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
data.drop('id', axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)

target = data['target']
train = data.drop('target', axis=1)
train = train.applymap(str)
test = test.applymap(str)

encoding_cols = train.columns

# Out-of-fold target encoding of the training set
encoded = pd.DataFrame([])
for tr_in, fold_in in StratifiedKFold(n_splits=12, shuffle=True).split(train, target):
    encoder = ce.TargetEncoder(cols=encoding_cols, smoothing=0.2)
    encoder.fit(train.iloc[tr_in, :], target.iloc[tr_in])
    encoded = encoded.append(encoder.transform(train.iloc[fold_in, :]),
                             ignore_index=False)

# Refit on the full training set to encode the test set
encoder = ce.TargetEncoder(cols=encoding_cols, smoothing=0.2)
encoder.fit(train, target)
test = encoder.transform(test)
train = encoded.sort_index()

best_params = {'bagging_temperature': 0.8,
               'depth': 5,
               'iterations': 1000,
               'l2_leaf_reg': 30,
               'learning_rate': 0.05,
               'random_strength': 0.8}
n_splits = 12
df = df.iloc[:, :-1]
df = df.dropna()

# encode the customer response column as a binary target (Won -> 1, else 0)
df['Customer Response'] = (df['Customer Response'] == 'Won').astype(int)

# Categorical Encoding --------------------------------------------------
import category_encoders as ce

cat_features = ['Contract Status']
target_enc = ce.TargetEncoder(cols=cat_features)

features = [
    'Contract Status', 'Product Family', 'FTS Rate', 'Forecast',
    'Market Share', 'Number of Competitors', 'WAC Price'
]
y = df['Customer Response'].astype('int64')
X = df[features]

from sklearn.model_selection import train_test_split

# split data into training and validation data, for both features and target
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Fit the encoder using the categorical features and target
target_enc.fit(train_X[cat_features], train_y)
train_columns = np.concatenate([train.columns.values, train_scaled.columns.values], axis=0)
train = pd.DataFrame(np.hstack([train, train_scaled]), columns=train_columns)

y = train.loc[:, ['Income in EUR']].astype('int')
X = train.copy()
X.drop(['Instance', 'Income in EUR'], axis=1, inplace=True)

""" Encoding """
cat_cols = ['Country', 'Year of Record', 'Gender', 'University Degree',
            'PopulationCatg', 'Profession']
encoder = ce.TargetEncoder(cols=cat_cols)
encoder.fit(X[cat_cols], y)
X_cleaned = encoder.transform(X[cat_cols], y)
X.drop(cat_cols, axis=1, inplace=True)

X_columns = np.concatenate([X.columns.values, X_cleaned.columns.values], axis=0)
X = pd.DataFrame(np.hstack([X, X_cleaned]), columns=X_columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=1)

# -----------------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------------
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
def feat_eng(rossman_featured, train_data=True, te_store=None, te_week=None, te_day=None):
    """
    Perform feature engineering on the input pandas dataframe.

    - From the Date column, three new features (year, day of the month and
      week of the year) are created and, where necessary, retyped.
    - One-hot encoding and target encoding are applied to some categorical
      features.
    - (currently disabled) a feature dividing Sales by Customers and Store.
    - Some features are dropped.

    Attributes
    ----------
    rossman_featured : pandas.core.frame.DataFrame
    """
    # create new features from the Date column
    rossman_featured['Year'] = rossman_featured['Date'].dt.year  # year
    # rossman_featured['Month'] = rossman_featured['Date'].dt.month  # month
    rossman_featured['Day'] = rossman_featured['Date'].dt.day  # day of the month
    rossman_featured['WeekofYear'] = rossman_featured['Date'].dt.isocalendar().week  # week of the year

    # convert week of the year to integer
    rossman_featured['WeekofYear'] = rossman_featured['WeekofYear'].astype(int)

    # apply one-hot encoding to some features
    one_hot_cols = ['StateHoliday', 'SchoolHoliday', 'StoreType',
                    'Assortment', 'PromoInterval', 'DayOfWeek']
    rossman_featured = pd.get_dummies(data=rossman_featured,
                                      columns=one_hot_cols,
                                      prefix=one_hot_cols,
                                      prefix_sep='_')

    if train_data == True:
        # apply target encoding to the feature Store
        te_store = ce.TargetEncoder(cols=['Store'])
        rossman_featured['Store_target'] = te_store.fit_transform(
            rossman_featured['Store'], rossman_featured['Sales'])

        # apply target encoding to the feature WeekofYear
        te_week = ce.TargetEncoder(cols=['WeekofYear'])
        rossman_featured['WeekofYear_target'] = te_week.fit_transform(
            rossman_featured['WeekofYear'], rossman_featured['Sales'])

        # apply target encoding to the feature day of the month
        te_day = ce.TargetEncoder(cols=['Day'])
        rossman_featured['Day_target'] = te_day.fit_transform(
            rossman_featured['Day'], rossman_featured['Sales'])
    else:
        # reuse the encoders fitted on the training data
        rossman_featured['Store_target'] = te_store.transform(
            rossman_featured['Store'])
        rossman_featured['WeekofYear_target'] = te_week.transform(
            rossman_featured['WeekofYear'])
        rossman_featured['Day_target'] = te_day.transform(
            rossman_featured['Day'])

    # # create new feature dividing sales per customers and store
    # rossman_featured['Sales_Cust_Store'] = rossman_featured['Sales'] / (
    #     rossman_featured['Customers'] * rossman_featured['Store'])

    # remove chosen features
    rossman_featured = rossman_featured.drop([
        'Date', 'Store', 'Year', 'WeekofYear', 'Day', 'Customers',
        'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
        'Promo2SinceWeek', 'Promo2SinceYear'
    ], axis=1)

    return rossman_featured, te_store, te_week, te_day
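# A sketch of the intended call pattern: fit the three encoders on the train
# frame, then pass them back in for the test frame (train_df and test_df are
# assumed names, not from the original):
train_fe, te_store, te_week, te_day = feat_eng(train_df, train_data=True)
test_fe, _, _, _ = feat_eng(test_df, train_data=False,
                            te_store=te_store, te_week=te_week, te_day=te_day)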
y = df.pop('salary')
# df.head()
# df.shape

#%% Preprocessor functions
ohe = ce.OneHotEncoder(drop_invariant=True,
                       return_df=True,
                       use_cat_names=True,
                       handle_missing='return_nan')  # Remember replace(np.nan, 0)
tge = ce.TargetEncoder(
    drop_invariant=True,
    return_df=True,
    handle_missing='value',
    # min_samples_leaf=3,
    # smoothing=0.4,
)

num_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
cat_cols = [
    'gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex',
    'specialisation'
]
new_cat_cols = [
    'gender_M', 'gender_F', 'ssc_b_Others', 'ssc_b_Central', 'hsc_b_Others',
    'hsc_b_Central', 'hsc_s_Commerce', 'hsc_s_Science', 'hsc_s_Arts',
    'degree_t_Sci&Tech', 'degree_t_Comm&Mgmt', 'degree_t_Others', 'workex_No',
    'workex_Yes', 'specialisation_Mkt&HR', 'specialisation_Mkt&Fin'
]
valid_df = df[df.kfold == FOLD].reset_index(drop=True)

ytrain = train_df.target
yvalid = valid_df.target

train_df = train_df.drop(["id", "target", "kfold"], axis=1)
valid_df = valid_df.drop(["id", "target", "kfold"], axis=1)
valid_df = valid_df[train_df.columns]

cols_to_enc = train_df.columns.tolist()
target_encoders = {}
for c in train_df.columns:
    if c in cols_to_enc:
        print(c)
        tge = ce.TargetEncoder(cols=[c])
        # fit on train + valid together, then transform each split
        tge.fit(pd.concat([train_df.loc[:, c], valid_df.loc[:, c]], axis=0),
                pd.concat([ytrain, yvalid], axis=0))
        train_df.loc[:, c] = tge.transform(train_df.loc[:, c])
        valid_df.loc[:, c] = tge.transform(valid_df.loc[:, c])
        target_encoders[c] = tge

# data is ready to train
clf = dispatcher.MODELS[MODEL]
clf.fit(train_df, ytrain)
preds = clf.predict_proba(valid_df)[:, 1]
print(metrics.roc_auc_score(yvalid, preds))

joblib.dump(target_encoders, f"models/{MODEL}_{FOLD}_target_encoder.pkl")
joblib.dump(clf, f"models/{MODEL}_{FOLD}.pkl")
joblib.dump(train_df.columns, f"models/{MODEL}_{FOLD}_columns.pkl")
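# A sketch of reloading the persisted artifacts at prediction time; the paths
# mirror the joblib.dump calls above, and df_test is an assumed raw frame:
import joblib

target_encoders = joblib.load(f"models/{MODEL}_{FOLD}_target_encoder.pkl")
clf = joblib.load(f"models/{MODEL}_{FOLD}.pkl")
columns = joblib.load(f"models/{MODEL}_{FOLD}_columns.pkl")

df_test = df_test[columns]
for c, enc in target_encoders.items():
    df_test.loc[:, c] = enc.transform(df_test.loc[:, c])
test_preds = clf.predict_proba(df_test)[:, 1]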
def run_i_experiments():
    print("Loading Data")
    df = load_data()

    # columns:
    continuous = ['age', 'bmi']
    categorical = ['sex', 'children', 'smoker', 'region']
    X = df[continuous + categorical]
    y = df[['charges']]
    u_0 = np.mean(y)[0]
    v = np.std(y)[0]

    models = [
        Ridge(),
        RandomForestRegressor(n_estimators=100),
        GradientBoostingRegressor(),
        MLPRegressor()
    ]
    # models = [RandomForestRegressor()]

    results = [[
        'model', 'Encoder', 'R2', 'STD', 'Training Time', 'Sparsity',
        'Dimensions'
    ]]
    for model in models:
        print("")
        print("----------------------")
        print("Testing Algorithm: ")
        print(type(model))
        print("----------------------")

        # TargetEncoder
        print("TargetEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=ce.TargetEncoder(return_df=False))
        results.append([
            type(model), 'TargetEncoder', r2, std, time, sparsity, dimensions
        ])

        # OrdinalEncoder
        print("OrdinalEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=ce.OrdinalEncoder(return_df=False))
        results.append([
            type(model), 'OrdinalEncoder', r2, std, time, sparsity, dimensions
        ])

        # BinaryEncoder
        print("BinaryEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=ce.BinaryEncoder(return_df=False))
        results.append([
            type(model), 'BinaryEncoder', r2, std, time, sparsity, dimensions
        ])

        # HashingEncoder
        print("HashingEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=ce.HashingEncoder(return_df=False))
        results.append([
            type(model), 'HashingEncoder', r2, std, time, sparsity, dimensions
        ])

        # OneHotEncoder
        print("OneHotEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=OneHotEncoder(handle_unknown='ignore', sparse=False))
        results.append([
            type(model), 'OneHotEncoder', r2, std, time, sparsity, dimensions
        ])

        print("GIG Encoder (mean) Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=GIGEncoder(u_0=u_0, v=v))
        results.append([
            type(model), 'GIGEncoder (m)', r2, std, time, sparsity, dimensions
        ])

        print("GIG Encoder (mean and variance) Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=GIGEncoder(u_0=u_0, v=v), moments='mv')
        results.append([
            type(model), 'GIGEncoder (mv)', r2, std, time, sparsity, dimensions
        ])

    file = 'insurance_experiments.csv'
    with open(file, "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(results)
    try:
        upload_file(file)
    except:
        print("File Not Uploaded")
# from sklearn.ensemble import RandomForestClassifier

#%% read data
train = pd.read_csv("training_set_features.csv")
test = pd.read_csv("test_set_features.csv")
train_labels = pd.read_csv("training_set_labels.csv")

drop_vars = [
    'respondent_id', 'child_under_6_months', 'behavioral_wash_hands',
    'behavioral_face_mask', 'behavioral_antiviral_meds'
]

#%% encoding
h1n1_targ_enc = ce.TargetEncoder()
h1n1_targ_enc.fit(train.drop(drop_vars, axis=1), train_labels['h1n1_vaccine'])
h1n1_train = h1n1_targ_enc.transform(train.drop(drop_vars, axis=1))

seasonal_targ_enc = ce.TargetEncoder()
seasonal_targ_enc.fit(train.drop(drop_vars, axis=1),
                      train_labels['seasonal_vaccine'])
# transform with the encoder fitted on the seasonal target
seasonal_train = seasonal_targ_enc.transform(train.drop(drop_vars, axis=1))

#%% imputation
h1n1_imp = IterativeImputer(max_iter=20, min_value=0)
h1n1_imp.fit(h1n1_train)
h1n1_train = pd.DataFrame(h1n1_imp.transform(h1n1_train),
                          columns=h1n1_train.columns)
def norm_data(X_train, X_test, y_train, y_test, real=None, categ=None, all=True):
    '''Preprocess features.'''
    # ------------- Split data into real and categorical parts -----------------
    X_train_categ = np.hstack((X_train[:, :2], X_train[:, 81:82]))
    X_test_categ = np.hstack((X_test[:, :2], X_test[:, 81:82]))
    X_train_real = np.hstack((X_train[:, 2:81], X_train[:, 82:]))
    X_test_real = np.hstack((X_test[:, 2:81], X_test[:, 82:]))

    # ------- Check the flag indicating that all data should be used for encoding --------
    if all == True:
        X_all_categ = np.append(X_train_categ, X_test_categ, axis=0)
        # print(X.shape, X_train_categ.shape, X_test_categ.shape)
        y_all = np.append(y_train, y_test, axis=0)
        # print(y_all.shape, y_train.shape, y_test.shape)
    else:
        X_all_categ = X_train_categ
        y_all = y_train

    # ------- Normalize real features by mean and standard deviation --------
    if real == 'standart':
        ss = StandardScaler()
        X_train_real_res = ss.fit_transform(X_train_real)
        X_test_real_res = ss.transform(X_test_real)
    elif real == 'normal':
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train_real_res = min_max_scaler.fit_transform(X_train_real)
        X_test_real_res = min_max_scaler.transform(X_test_real)
    else:
        X_train_real_res = X_train_real
        X_test_real_res = X_test_real

    # ------- Encode the categorical features -----------
    if categ == 'target':
        encoder = ce.TargetEncoder(cols=[0, 1, 2], return_df=False)
        encoder.fit(X_all_categ, y_all)
        X_train_categ_res = encoder.transform(X_train_categ)
        X_test_categ_res = encoder.transform(X_test_categ)
    elif categ == 'onehot':
        encoder = ce.OneHotEncoder(cols=[0, 1, 2], return_df=False)
        encoder.fit(X_all_categ, y_all)
        X_train_categ_res = encoder.transform(X_train_categ)
        X_test_categ_res = encoder.transform(X_test_categ)
    elif categ == 'helmert':
        encoder = ce.HelmertEncoder(cols=[0, 1, 2], return_df=False)
        encoder.fit(X_all_categ, y_all)
        X_train_categ_res = encoder.transform(X_train_categ)
        X_test_categ_res = encoder.transform(X_test_categ)
    elif categ == 'hash':
        encoder = ce.HashingEncoder(cols=[0, 1, 2], return_df=False)
        encoder.fit(X_all_categ, y_all)
        X_train_categ_res = encoder.transform(X_train_categ)
        X_test_categ_res = encoder.transform(X_test_categ)
    else:
        X_train_categ_res = X_train_categ
        X_test_categ_res = X_test_categ

    # ------------ Join the parts back together ---------------
    X_train_ready = np.hstack((X_train_categ_res, X_train_real_res))
    X_test_ready = np.hstack((X_test_categ_res, X_test_real_res))
    return X_train_ready, X_test_ready
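# Hypothetical call: standardize the real features and target-encode the
# categorical ones ('standart' is the flag spelling norm_data checks for):
X_train_ready, X_test_ready = norm_data(X_train, X_test, y_train, y_test,
                                        real='standart', categ='target', all=False)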
    'pref_month_m6_5', 'pref_month_y1_1', 'pref_month_y1_2', 'pref_month_y1_3',
    'pref_month_y1_4', 'pref_month_y1_5', 'pref_month_y2_1', 'pref_month_y2_2',
    'pref_month_y2_3', 'pref_month_y2_4', 'pref_month_y2_5', 'pref_month_y3_1',
    'pref_month_y3_2', 'pref_month_y3_3', 'pref_month_y3_4', 'pref_month_y3_5',
    'recent_flt_day', 'pit_add_chnl_m3', 'pit_add_chnl_m6', 'pit_add_chnl_y1',
    'pit_add_chnl_y2', 'pit_add_chnl_y3', 'pref_orig_city_m3', 'pref_orig_city_m6',
    'pref_orig_city_y1', 'pref_orig_city_y2', 'pref_orig_city_y3',
    'pref_dest_city_m3', 'pref_dest_city_m6', 'pref_dest_city_y1',
    'pref_dest_city_y2', 'pref_dest_city_y3', 'seg_dep_time_month',
    'seg_dep_time_year', 'seg_dep_time_is_workday'
]

continue_list = list(set(feature_list) - set(discrete_list))
print('Feature list length: {0}, discrete features: {1}, continuous features: {2}'.format(
    len(feature_list), len(discrete_list), len(continue_list)))

# Target-encode the discrete features
encoder = ce.TargetEncoder(cols=discrete_list,
                           drop_invariant=False).fit(data_train_feature, data_target)
data_train_feature = encoder.transform(data_train_feature).to_numpy()

train_test_split = getTrainTest(data_train_feature, data_target)
for train_index, test_index in train_test_split:
    np.random.seed(0)
    obj = Data(data_train_feature, data_target, train_index)
    obj2 = Test_Data(data_train_feature, data_target, train_index, test_index)
    pso = PSO(iterations=100, obj=obj, beta=0.2, alpha=0.4)
    pso.run()
    print('Selected feature subset:', pso.best.getPBest())
    print('Feature subset size:', len(set(pso.best.getPBest())))
    print('Training accuracy (fitness):', pso.best.getCostPBest())
    print('Test accuracy:', obj2.getTestAccuracy(pso.best.getPBest()))
    print('Test F1 score:', obj2.getTestF1(pso.best.getPBest()))
def run_bs_experiments():
    print("Loading Data")
    df = load_data()

    # columns:
    continuous = ['temp', 'atemp', 'hum', 'windspeed']
    categorical = [
        'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
        'workingday', 'weathersit'
    ]
    X = df[continuous + categorical]
    y = df[['cnt']]

    models = [
        Ridge(),
        RandomForestRegressor(n_estimators=100),
        GradientBoostingRegressor(),
        MLPRegressor()
    ]
    # models = [RandomForestRegressor()]

    results = [[
        'model', 'Encoder', 'R2', 'STD', 'Training Time', 'Sparsity',
        'Dimensions'
    ]]
    for model in models:
        print("")
        print("----------------------")
        print("Testing Algorithm: ")
        print(type(model))
        print("----------------------")

        # TargetEncoder
        print("TargetEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=ce.TargetEncoder(return_df=False))
        results.append([
            type(model), 'TargetEncoder', r2, std, time, sparsity, dimensions
        ])

        # OrdinalEncoder
        print("OrdinalEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=ce.OrdinalEncoder(return_df=False))
        results.append([
            type(model), 'OrdinalEncoder', r2, std, time, sparsity, dimensions
        ])

        # BinaryEncoder
        print("BinaryEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=ce.BinaryEncoder(return_df=False))
        results.append([
            type(model), 'BinaryEncoder', r2, std, time, sparsity, dimensions
        ])

        # HashingEncoder
        print("HashingEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=ce.HashingEncoder(return_df=False))
        results.append([
            type(model), 'HashingEncoder', r2, std, time, sparsity, dimensions
        ])

        # OneHotEncoder
        print("OneHotEncoder Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical,
            encoder=OneHotEncoder(handle_unknown='ignore', sparse=False))
        results.append([
            type(model), 'OneHotEncoder', r2, std, time, sparsity, dimensions
        ])

        print("GIG Encoder (mean) Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical, encoder=GIGEncoder())
        results.append([
            type(model), 'GIGEncoder (m)', r2, std, time, sparsity, dimensions
        ])

        print("GIG Encoder (mean and variance) Results:")
        r2, std, time, sparsity, dimensions = cv_regression(
            model, X, y, continuous, categorical, encoder=GIGEncoder(),
            moments='mv')
        results.append([
            type(model), 'GIGEncoder (mv)', r2, std, time, sparsity, dimensions
        ])

    file = 'bike_sharing_experiments.csv'
    with open(file, "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(results)
    try:
        upload_file(file)
    except:
        print("File Not Uploaded")