def test_binary(self):
    """Smoke-test BinaryEncoder over several configurations.

    :return:
    """
    target_cols = ['C1', 'D', 'E', 'F']
    train = self.create_dataset(n_rows=1000)
    holdout = self.create_dataset(n_rows=100)

    # Each of these configurations must produce an all-numeric frame.
    for extra_kwargs in ({'cols': target_cols}, {}, {'drop_invariant': True}):
        encoder = encoders.BinaryEncoder(verbose=1, **extra_kwargs)
        encoder.fit(train, None)
        self.verify_numeric(encoder.transform(holdout))

    # With return_df=False the transform must hand back a raw ndarray.
    encoder = encoders.BinaryEncoder(verbose=1, return_df=False)
    encoder.fit(train, None)
    self.assertTrue(isinstance(encoder.transform(holdout), np.ndarray))
def encoder(data):
    """Encode the categorical columns of *data* and return the widened frame.

    High-cardinality columns are binary-encoded in place; the remaining
    nominal columns are one-hot encoded and the dummy frames concatenated
    onto *data*.
    """
    ###### BINARY ENCODER ######
    for col in ('premis_var', 'incdt_time', 'incdt_date'):
        data = category_encoders.BinaryEncoder(cols=[col]).fit_transform(data)

    ###### DATA STANDARDISATION (currently disabled) ######
    # sc = StandardScaler()
    # data['incdt_time'] = sc.fit_transform(data['incdt_time'].values.reshape(-1,1))
    # data['incdt_date'] = sc.fit_transform(data['incdt_date'].values.reshape(-1,1))

    ###### ONE HOT ENCODER ######
    OH_encoder = OneHotEncoder()

    def _one_hot(col, names):
        # Re-fit per column; *names* must match the column's category count.
        encoded = OH_encoder.fit_transform(data[col].values.reshape(-1, 1))
        return DataFrame(encoded.toarray(), columns=names)

    hc1 = _one_hot('BORO_NM', ['BORO1', 'BORO2', 'BORO3', 'BORO4', 'BORO5'])
    hc2 = _one_hot('VIC_AGE_GROUP',
                   ['VIC_AGE1', 'VIC_AGE2', 'VIC_AGE3', 'VIC_AGE4', 'VIC_AGE5'])
    hc3 = _one_hot('LOC_OF_OCCUR_DESC',
                   ['LOC_DESC1', 'LOC_DESC2', 'LOC_DESC3', 'LOC_DESC4'])
    hc4 = _one_hot('VIC_RACE', ['VIC_RACE1', 'VIC_RACE2', 'VIC_RACE3', 'VIC_RACE4'])
    hc5 = _one_hot('VIC_SEX', ['VICM_SEX1', 'VICM_SEX2'])
    hc6 = _one_hot('SUSP_RACE', ['SUSP_RACE1', 'SUSP_RACE2', 'SUSP_RACE3', 'SUSP_RACE4'])
    # BUG FIX: hc7 previously re-encoded data['VIC_SEX'] (copy-paste of hc5)
    # while labelling the columns SUSP_SEX*. Encode the suspect-sex column
    # itself. TODO(review): confirm 'SUSP_SEX' exists in this dataset.
    hc7 = _one_hot('SUSP_SEX', ['SUSP_SEX1', 'SUSP_SEX2'])

    data = pd.concat([data, hc1, hc2, hc3, hc4, hc5, hc6, hc7], axis=1)
    return data
def binary_encoding(df, cols, handle_nan=True):
    """Binary-encode *cols* of *df* and return a new frame.

    When *handle_nan* is true, unknown and missing categories get their own
    indicator level; otherwise they come back as NaN.
    """
    policy = 'indicator' if handle_nan else 'return_nan'
    encoder = ce.BinaryEncoder(cols=cols, handle_unknown=policy, handle_missing=policy)
    return encoder.fit_transform(df)
def process_data(data_df: pd.DataFrame) -> pd.DataFrame:
    """Binary-encode the fixed categorical columns, echo and return the result."""
    categorical_cols = ['b', 'm', 'd', 'p', 'l', 's']
    binary_encoder = ce.BinaryEncoder(cols=categorical_cols, return_df=True)
    encoded = binary_encoder.fit_transform(data_df)
    print(encoded)
    return encoded
def BinaryEncodingCalc(request, fName):
    """Django view: binary-encode the columns selected in the POST form.

    On POST, each selected column is binary-encoded and the processed CSV is
    saved under MEDIA_ROOT. The response re-renders BinaryEncoding.html with
    the remaining non-numeric columns and the NaN summary.
    """
    df = get_df(fName)
    # BUG FIX: selected_cols was only bound inside the POST branch, so a GET
    # request crashed with NameError at `binaryProcessed_list = selected_cols`.
    selected_cols = []
    if request.method == 'POST':
        selected_cols = request.POST.getlist('binaryCol')
        for selected_col in selected_cols:
            encoder = ce.BinaryEncoder(cols=[selected_col])
            df = encoder.fit_transform(df)
        # Persist the processed frame so get_df() below picks it up.
        df.to_csv(os.path.join(settings.MEDIA_ROOT, 'processed/' + fName + '.csv'),
                  index=False)
    df_new = get_df(fName)
    NaN_percent = get_NaN_percent(fName)
    # Columns that are still non-numeric are candidates for further encoding.
    binary_list = [clm for clm in list(df_new)
                   if df_new[clm].dtype not in ('int64', 'float64')]
    context = {
        'fName': fName,
        'processing_list': binary_list,
        'processed_list': selected_cols,
        'NaN_percent': NaN_percent,
        'status': 'Success',
        'message': 'Binary Encoding was done on selected features.'
    }
    return render(request, 'BinaryEncoding.html', context)
def create_binary(df, col):
    """Merge binary-encoded columns for *col* into *df*.

    The encoder is fitted once on the distinct values of *col* (cheap even for
    large frames) and the encoded lookup table is joined back onto *df*.
    """
    assert isinstance(col, str)
    lookup = pd.DataFrame(df[col].unique(), columns=[col])
    encoded = ce.BinaryEncoder(verbose=1, cols=[col]).fit_transform(lookup)
    lookup = pd.concat([lookup, encoded], axis=1)
    return df.merge(lookup, how='left', on=col)
def apply_binary_encoding(df, categorical_columns):
    """Binary-encode *categorical_columns* of *df* and return the new frame.

    Raises DataFrameTypeError if *df* is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    # BUG FIX: fitting on df.values handed the encoder a label-less ndarray,
    # so `cols=categorical_columns` could never match any column; fit on the
    # frame itself so names line up for the transform below.
    encoder = ce.BinaryEncoder(cols=categorical_columns).fit(df)
    X_transformed = encoder.transform(df)
    return X_transformed
def encode_categorical(given_dataset):
    """Encode the categorical features of the housing dataset.

    Street/CentralAir -> binary encoding; the quality/condition-style columns
    -> ordinal encoding; the remaining nominal columns -> one-hot encoding.
    """
    binary_cols = ['Street', 'CentralAir']
    ordinal_cols = ['MSZoning', 'Alley', 'LandSlope', 'BldgType', 'HouseStyle',
                    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
                    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
                    'Electrical', 'KitchenQual', 'FireplaceQu', 'GarageType',
                    'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence']
    onehot_cols = ['LotShape', 'LandContour', 'Utilities', 'LotConfig',
                   'Neighborhood', 'Condition1', 'Condition2', 'RoofStyle',
                   'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
                   'Heating', 'PavedDrive', 'MiscFeature', 'SaleType',
                   'SaleCondition', 'Functional']

    # Apply the three encoders in the same order as before.
    for enc in (ce.BinaryEncoder(cols=binary_cols, return_df=True),
                ce.OrdinalEncoder(cols=ordinal_cols, return_df=True),
                ce.OneHotEncoder(cols=onehot_cols, return_df=True)):
        given_dataset = enc.fit_transform(given_dataset)
    return given_dataset
def general_imputer(df):
    """ general imputer + normalizer and binarizer

    Builds (but does not fit) a DataFrameMapper that median-imputes and
    standard-scales every non-categorical column, and categorical-imputes
    then binary-encodes every object-dtype column. Also prints the columns
    of *df* that currently contain nulls.
    """
    # Report columns with at least one missing value.
    nulls_per_column = df.isnull().sum()
    print(nulls_per_column.loc[nulls_per_column > 0])
    # Create a boolean mask for categorical columns
    categorical_feature_mask = df.dtypes == object
    # Get list of categorical column names
    feat_cat = df.columns[categorical_feature_mask].tolist()
    # Get list of non-categorical column names
    feat_cont = df.columns[~categorical_feature_mask].tolist()
    # Continuous columns are wrapped in [nf] (2-D input for the scaler);
    # categorical columns are passed as bare names (1-D input).
    mapper = DataFrameMapper(
        [([nf], Pipeline(steps=[('imp', SimpleImputer(
            strategy="median")), ('std', StandardScaler())]))
         for nf in feat_cont] +
        [(cf, Pipeline(steps=[
            ('imp', CategoricalImputer()),
            ('label', ce.BinaryEncoder()),
        ])) for cf in feat_cat],
        input_df=True, df_out=True, default=False)
    # NOTE(review): the mapper is returned unfitted; the caller is expected
    # to call fit/fit_transform on it.
    return (mapper)
def test_inverse_transform_11(self):
    """
    Test binary encoding
    """
    def frame(cities, states, others):
        # Column order (city, state, other) matters for the frame comparison.
        return pd.DataFrame({'city': cities, 'state': states, 'other': others})

    train = frame(['chicago', 'paris'], ['US', 'FR'], ['A', np.nan])
    test = frame(['chicago', 'paris', 'monaco'], ['US', 'FR', 'FR'],
                 ['A', np.nan, 'B'])
    # 'monaco' was never seen during fit, so it inverts to NaN.
    expected = frame(['chicago', 'paris', np.nan], ['US', 'FR', 'FR'],
                     ['A', np.nan, 'B'])

    encoder = ce.BinaryEncoder(cols=['city', 'state']).fit(train)
    restored = inverse_transform(encoder.transform(test), encoder)
    pd.testing.assert_frame_equal(restored, expected)
def featureEncodingUsingBinaryEncoder(dataSetForFeatureEncoding):
    """Binary-encode every object-dtype column of the dataset.

    Each categorical column is encoded against the label column, its encoded
    columns are joined back, and the original column is dropped. The label
    column is moved to the end of the returned frame.
    """
    print("****** Start binary encoding on the categorical features in the given dataset *****")
    labelName = getLabelName()
    #Extract the categorical features, leave the label
    categoricalColumnsInTheDataSet = dataSetForFeatureEncoding.drop([labelName],axis=1).select_dtypes(['object'])
    #Get the names of the categorical features
    categoricalColumnNames = categoricalColumnsInTheDataSet.columns.values
    print("****** Number of features before binary encoding: ",len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",categoricalColumnNames)
    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    # Single-column frame holding only the label (drop everything NOT the label).
    label = dataSetForFeatureEncoding.drop(dataSetForFeatureEncoding.loc[:, ~dataSetForFeatureEncoding.columns.isin([labelName])].columns, axis = 1)
    for feature in categoricalColumnNames:
        uniq = np.unique(dataSetForFeatureEncoding[feature])
        print('\n{}: {} '.format(feature,len(uniq)))
        printList(dataSetForFeatureEncoding[feature].unique(),'distinct values')
        # Single-column frame holding only the current feature.
        featureColumns = dataSetForFeatureEncoding.drop(dataSetForFeatureEncoding.loc[:, ~dataSetForFeatureEncoding.columns.isin([feature])].columns, axis = 1)
        binaryEncoder = ce.BinaryEncoder(cols = [feature])
        # NOTE(review): the label is passed as y, though BinaryEncoder is
        # unsupervised and ignores it.
        binaryEncodedFeature = binaryEncoder.fit_transform(featureColumns, label)
        # Join the encoded columns, then drop the raw categorical column.
        dataSetForFeatureEncoding = dataSetForFeatureEncoding.join(binaryEncodedFeature)
        dataSetForFeatureEncoding = dataSetForFeatureEncoding.drop(feature, axis=1)
    # Re-append the label so it ends up as the last column.
    dataSetForFeatureEncoding = dataSetForFeatureEncoding.drop(labelName, axis=1)
    dataSetForFeatureEncoding[labelName] = label
    print("****** Number of features after binary encoding: ",len(dataSetForFeatureEncoding.columns))
    print("****** End binary encoding on the categorical features in the given dataset *****\n")
    return dataSetForFeatureEncoding
def processingPreModelling(df, catVarsDict=None):
    """Data processing before modelling.

    Casts isMobile to int, parses the date column, then encodes each column
    listed in *catVarsDict* according to its requested scheme:
    'BinaryEncoder', 'LabelEncoder', or 'OneHot'.

    :param df: input frame (mutated/replaced and returned)
    :param catVarsDict: mapping of column name -> encoder name; defaults to
        no categorical encoding.
    """
    # BUG FIX: the default was a shared mutable dict ({}); use the
    # None-sentinel idiom instead. Behaviour for all existing callers
    # is unchanged.
    if catVarsDict is None:
        catVarsDict = {}
    # Bools to int
    df.isMobile = df.isMobile.astype(int)
    # Date objects to datetime format
    df.date = pd.to_datetime(df.date, format='%Y-%m-%d')
    for var, value in catVarsDict.items():
        if value == 'BinaryEncoder':
            encoder = category_encoders.BinaryEncoder(cols=[var],
                                                      drop_invariant=True,
                                                      return_df=True)
            df = encoder.fit_transform(df)
        elif value == 'LabelEncoder':
            df[var] = LabelEncoder().fit_transform(df[var])
        elif value == 'OneHot':
            encoder = category_encoders.one_hot.OneHotEncoder(
                cols=[var], drop_invariant=True, return_df=True,
                use_cat_names=True)
            df = encoder.fit_transform(df)
    return df
def binary_encoding(df=None):
    """Binary-encode the 'decision' column and return the widened frame.

    :param df: frame to encode; when None, falls back to the module-level
        ``data_df`` (the original behaviour).
    :return: the input frame with the encoded 'decision' columns appended.
    """
    # Generalized: callers may now pass a frame explicitly instead of
    # relying on the global.
    source_bin_df = data_df if df is None else df
    encoder = ce.BinaryEncoder(cols=['decision'], drop_invariant=True)
    dfb = encoder.fit_transform(source_bin_df['decision'])
    source_bin_df = pd.concat([source_bin_df, dfb], axis=1)
    print(source_bin_df)
    # BUG FIX: the result was computed and printed but never returned.
    return source_bin_df
def encode_vars_via_lookup(fset, feature_lookup):
    """Encode each variable in *fset* according to its type in *feature_lookup*.

    Types: 'ord' -> ordinal, 'cat' -> one-hot, 'str'/'high_card' -> binary;
    anything else is treated as numeric and left untouched.

    NOTE(review): this function reads and rebinds ``dfX`` (and reads ``dfy``)
    which are not parameters and not defined here — it will raise NameError
    unless they exist as globals in the defining module. Confirm intent;
    they probably should be parameters.
    """
    # import category_encoders as ce
    for var in fset:
        encoder = None
        vtype = feature_lookup.get(var, 'numeric')  # default numeric
        if vtype == 'ord':
            encoder = ce.OrdinalEncoder(cols=[
                var,
            ])
        elif vtype == 'cat':
            encoder = ce.OneHotEncoder(cols=[
                var,
            ])
        elif vtype in ('str', 'high_card'
                       ):  # high_card: categorical but with high cardinality
            encoder = ce.BinaryEncoder(cols=[
                var,
            ])  # ... or use ce.HashingEncoder()
        else:
            # assuming that the var is numeric
            pass
        # data imputation [todo]
        if encoder is not None:
            # Re-encodes the (free) dfX in place, one variable at a time.
            dfX = encoder.fit_transform(dfX, dfy)
    return dfX
def test_inv_transform_ct_9(self):
    """
    test inv_transform_ct with Binary Encoder and passthrough option
    """
    y = pd.DataFrame(data=[0, 1], columns=['y'])
    train = pd.DataFrame({'city': ['chicago', 'paris'],
                          'state': ['US', 'FR'],
                          'other': ['A', 'B']})

    # Binary-encode city/state; pass every other column through untouched.
    transformer = ColumnTransformer(
        transformers=[('binary', ce.BinaryEncoder(), ['city', 'state'])],
        remainder='passthrough')
    transformer.fit(train, y)

    test = pd.DataFrame({'city': ['chicago', 'chicago', 'paris'],
                         'state': ['US', 'FR', 'FR'],
                         'other': ['A', 'B', 'C']})
    expected = pd.DataFrame({'binary_city': ['chicago', 'chicago', 'paris'],
                             'binary_state': ['US', 'FR', 'FR'],
                             'other': ['A', 'B', 'C']})

    transformed = pd.DataFrame(transformer.transform(test))
    # Name the raw output columns the way inverse_transform expects them.
    transformed.columns = ['col1_0', 'col1_1', 'col2_0', 'col2_1', 'other']
    restored = inverse_transform(transformed, transformer)
    pd.testing.assert_frame_equal(restored, expected)
def dicho_nominales(data, cols=None):
    '''
    Dichotomise (binary-encode) the nominal categorical variables.

    :param data: input frame
    :param cols: columns to encode; defaults to ['var1', 'var2'] as before
    :return: the binary-encoded frame
    '''
    # BUG FIX: the default was a shared mutable list; use the None-sentinel
    # idiom while keeping the historical default columns.
    if cols is None:
        cols = ['var1', 'var2']
    enc = ce.BinaryEncoder(cols=cols)
    data_drop = enc.fit_transform(data)
    return (data_drop)
def binary():
    """Fit a BinaryEncoder on the mushroom data and print before/after info."""
    features, _, _ = get_mushroom_data()
    print(features.info())

    encoder = ce.BinaryEncoder()
    encoder.fit(features, None)
    encoded = encoder.transform(features)
    print(encoded.info())

    # Explicitly drop the big intermediates before returning.
    del encoder, _, features, encoded
def fit(self, X, y=None):
    """Fit the internal BinaryEncoder on the nominal discrete columns of X.

    The (X, y) signature keeps the object compatible with the sklearn
    pipeline contract; *y* is accepted but unused.
    """
    nominal = X[self.descreteVars_Nominal]
    # Stringify everything first so NaNs become an explicit 'NULL' level.
    nominal_as_str = nominal.fillna('NULL').astype(str)
    self.ce_binary = ce.BinaryEncoder()
    self.ce_binary.fit(nominal_as_str)
    return self
def test_inverse_transform_ce_binary(self):
    """
    Unit test inverse transform ce binary
    """
    encoder = ce.BinaryEncoder(cols=['Age', 'Sex'], return_df=True)
    encoded = encoder.fit_transform(self.ds_titanic_clean)
    # Round-tripping through inverse_transform must restore the input.
    round_trip = inverse_transform(encoded, encoder)
    pd.testing.assert_frame_equal(round_trip, self.ds_titanic_clean)
def encode_categorical_features(df, encoder=None):
    '''
    Encode the categorical features using BinaryEncoder.

    When *encoder* is None, a new BinaryEncoder is created and fitted on
    *df*; otherwise the provided (already fitted) encoder is reused so
    train and test frames share one encoding.

    :return: (encoded_df, encoder)
    '''
    # Idiom fix: compare against None with `is`, not `==`.
    if encoder is None:
        encoder = ce.BinaryEncoder(
            cols=[location_key, class_key, subdepartment_key])
        encoder.fit(df)
    return encoder.transform(df), encoder
def create_features(self, df_train, df_test):
    """Binary-encode self.columns (fit on train only) and append the encoded
    columns to self.train / self.test under a '_BinaryEncoder' suffix."""
    encoder = ce.BinaryEncoder(cols=self.columns)
    target_values = df_train[self.target_column].values.tolist()
    encoder.fit(df_train[self.columns], target_values)

    encoded_train = encoder.transform(df_train[self.columns])
    encoded_test = encoder.transform(df_test[self.columns])

    for col in encoded_train.columns:
        suffixed = col + '_BinaryEncoder'
        self.train[suffixed] = encoded_train[col]
        self.test[suffixed] = encoded_test[col]
def load_and_clean(non_categorical, categorical,
                   data_path="../data/with_stock_data_webclicks_linkedin.csv",
                   normalize=False, binary_encode=False, trend_features=True,
                   filter=False):
    """Load the dataset, encode features, and return train/dev/test splits.

    :param non_categorical: numeric column names to keep (validated)
    :param categorical: categorical column names to encode (validated)
    :param binary_encode: binary-encode categoricals instead of dummies
    :param trend_features: append Google-Trends-derived columns
    :param filter: keep only rows with >300 current employees
    :param normalize: standard-scale all feature columns
    :return: (X_train, y_train, X_dev, y_dev, X_test, y_test)
    """
    frame = pd.read_csv(data_path)
    all_non_categorical = get_all_non_categorical()
    all_categorical = get_all_categorical()
    for non_c in non_categorical:
        if non_c not in all_non_categorical:
            raise ValueError("Non categorical {} is not valid".format(non_c))
    for c in categorical:
        if c not in all_categorical:
            # BUG FIX: the message previously formatted `non_c` (the last
            # value of the loop above) instead of the offending `c`.
            raise ValueError("Categorical {} is not valid".format(c))

    X = pd.DataFrame({key: frame[key] for key in non_categorical})
    if trend_features:
        # BUG FIX: google_trends_features() was called twice and the first
        # result discarded; compute it once and reuse it.
        trends_data = google_trends_features()
        X = pd.concat([X, trends_data], axis=1)
    y = frame[[Y]]

    for cat in categorical:
        if binary_encode:
            new_cols = ce.BinaryEncoder(cols=[cat]).fit_transform(frame[cat])
        else:
            new_cols = pd.get_dummies(frame[cat], prefix='category')
        X = pd.concat([X, new_cols], axis=1)

    X = clean_data(X)
    remove_nan(X, non_categorical)

    if filter:
        indices = X["current employee estimate"] > 300
        X = X[indices]
        y = y[indices]
    if normalize:
        X.loc[:, :] = preprocessing.StandardScaler().fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=1)
    X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test,
                                                    test_size=.5,
                                                    random_state=1)
    return (X_train, y_train, X_dev, y_dev, X_test, y_test)
def encode_categorical(df, encoder_name='binary'):
    """Encode *df* with the named category_encoders scheme.

    Supported names: 'one_hot', 'feature_hashing', 'binary', 'ordinal',
    'polynomial'.

    :raises ValueError: for an unknown *encoder_name* (previously this
        crashed later with an opaque ``AttributeError`` on ``None``).
    """
    encoder_dic = {"one_hot": ce.OneHotEncoder(),
                   "feature_hashing": ce.HashingEncoder(n_components=32),
                   "binary": ce.BinaryEncoder(),
                   "ordinal": ce.OrdinalEncoder(),
                   "polynomial": ce.PolynomialEncoder()}
    encoder = encoder_dic.get(encoder_name)
    if encoder is None:
        raise ValueError("Unknown encoder_name: {!r}".format(encoder_name))
    # BUG FIX: fit(df, verbose=1) passed `verbose` as a fit kwarg, which
    # current category_encoders rejects (verbose is a constructor option).
    encoder.fit(df)
    df = encoder.transform(df)
    return df
def main(params, inputs, outputs):
    """Pipeline node: binary-encode the comma-separated columns in
    params.columns of the pickled frame at inputs.data, writing the result
    to outputs.data_new."""
    columns_param = params.columns
    data = inputs.data
    data_new = outputs.data_new

    data_0 = pd.read_pickle(data)
    # Idiom fix: str.split already returns a list; the wrapping
    # `[col for col in ...]` comprehension was redundant.
    encoder = ce.BinaryEncoder(cols=columns_param.split(","))
    data_1 = encoder.fit_transform(data_0)
    data_1.to_pickle(data_new)
def test_binary_np(self):
    """BinaryEncoder must accept numpy-array input and emit numeric output.

    :return:
    """
    train_arr = self.create_array(n_rows=1000)
    holdout_arr = self.create_array(n_rows=100)

    encoder = encoders.BinaryEncoder(verbose=1)
    encoder.fit(train_arr, None)
    self.verify_numeric(encoder.transform(holdout_arr))
def categorical_cols_train(data):
    """Binary-encode the fixed categorical columns, persist the fitted
    encoder to model/encoder.p, and return the encoded frame.

    Errors are routed to the project-wide handle() hook (original
    best-effort behaviour preserved; returns None on failure).
    """
    categorical = ['employment_type', 'required_experience',
                   'required_education', 'country']
    try:
        print('Categorical Encoding')
        encoder = ce.BinaryEncoder(cols=categorical)
        encoded = encoder.fit_transform(data)
        # Save the fitted encoder so inference can reuse the same mapping.
        pickle.dump(encoder, open("model/encoder.p", "wb"))
        return encoded
    except Exception as e:
        handle('categorical column handling')
def preprocess(self) -> DataFrame:
    """Build and apply the preprocessing pipeline to the raw transactions.

    Drops the 'isFraud' label, then bundles three column groups into a
    ColumnTransformer — mean-imputed numerics, one-hot-encoded low-cardinality
    categoricals, binary-encoded email domain, ordinal-encoded card ids —
    and returns the fitted-and-transformed matrix.
    """
    X_train = self.transaction_train_df_raw.drop('isFraud', axis=1)  # 506691 rows
    # num_test = 0.20
    # X_all = transaction_train_df_raw.drop('isFraud', axis=1)
    # Y_all = transaction_train_df_raw['isFraud']
    # X_train, X_test, Y_train, Y_test = train_test_split(X_all, Y_all, test_size=num_test)

    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='mean')

    # Preprocessing for categorical data
    # to implement addr2, divide into 2 categories, then one hot encoding
    onehot_categorical_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(
            strategy='most_frequent')), ('onehot', ce.OneHotEncoder())])
    binary_categorical_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='most_frequent')
                ), ('binary', ce.BinaryEncoder(drop_invariant=False))])
    # NOTE(review): mean-imputation before OrdinalEncoder implies card1/card2
    # are numeric-coded ids — confirm that is intended.
    ordinary_categorical_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(
            strategy='mean')), ('ordinary', ce.OrdinalEncoder())])

    # Bundle preprocessing for numerical and categorical data
    preprocessor_pipeline = ColumnTransformer(transformers=[
        ('num', numerical_transformer, [
            'TransactionDT', 'TransactionAmt', 'C1', 'C2', 'C3', 'C4', 'C5',
            'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1',
            'D4', 'D10', 'D15'
        ]),
        ('onehot_cat', onehot_categorical_transformer,
         ['ProductCD', 'card4', 'M6']),
        # ('onehot_cat', onehot_categorical_transformer, ['ProductCD', 'card4', 'card6', 'M6']),
        # binary encoder did not work, should re-implement
        # ('binary_cat', binary_categorical_transformer, ['card3', 'card5', 'addr1', 'addr2']),
        ('binary_cat', binary_categorical_transformer, ['P_emaildomain']),
        ('ordinary_cat', ordinary_categorical_transformer, ['card1', 'card2'])
    ])
    # Commented-out DataFrameMapper alternative (kept for reference):
    # dataFrameMapper = DataFrameMapper([
    #     (['TransactionDT', 'TransactionAmt', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D4', 'D10', 'D15'], numerical_transformer),
    #     (['ProductCD', 'card4', 'M6'], onehot_categorical_transformer),
    #     (['P_emaildomain'], binary_categorical_transformer),
    #     (['card1', 'card2'], ordinary_categorical_transformer)
    # ])
    return preprocessor_pipeline.fit_transform(X_train)
def test_binary(self):
    """Exercise BinaryEncoder configurations and the inverse transform.

    :return:
    """
    train = self.create_dataset(n_rows=1000)
    holdout = self.create_dataset(n_rows=100)

    # Each configuration must produce an all-numeric frame.
    for extra_kwargs in ({'cols': ['C1', 'D', 'E', 'F']}, {},
                         {'drop_invariant': True}):
        enc = encoders.BinaryEncoder(verbose=1, **extra_kwargs)
        enc.fit(train, None)
        self.verify_numeric(enc.transform(holdout))

    # return_df=False must hand back a raw ndarray.
    enc = encoders.BinaryEncoder(verbose=1, return_df=False)
    enc.fit(train, None)
    self.assertTrue(isinstance(enc.transform(holdout), np.ndarray))

    # test inverse_transform (data without missing values)
    train = self.create_dataset(n_rows=1000, has_none=False)
    holdout = self.create_dataset(n_rows=100, has_none=False)
    holdout_extra = self.create_dataset(n_rows=100, extras=True,
                                        has_none=False)
    enc = encoders.BinaryEncoder(verbose=1)
    enc.fit(train, None)
    self.verify_numeric(enc.transform(holdout))
    self.verify_inverse_transform(
        holdout, enc.inverse_transform(enc.transform(holdout)))
    # Categories unseen at fit time cannot be inverted.
    with self.assertRaises(ValueError):
        enc.inverse_transform(enc.transform(holdout_extra))
def __init__(self, encoder_type, columns_name=None):
    """
    :param encoder_type: name of the category_encoders scheme to use
    :param columns_name: list of feature (column) names to encode
    """
    # Map each supported name to its encoder class; only the selected
    # entry is instantiated, exactly like the original if/elif chain.
    encoder_classes = {
        "BackwardDe": ce.BackwardDifferenceEncoder,  # backward-difference coding
        "BaseN": ce.BaseNEncoder,                    # base-N coding
        "Binary": ce.BinaryEncoder,                  # binary coding
        "Catboost": ce.CatBoostEncoder,
        "Hash": ce.HashingEncoder,
        "Helmert": ce.HelmertEncoder,
        "JamesStein": ce.JamesSteinEncoder,
        "LOO": ce.LeaveOneOutEncoder,                # leave-one-out coding
        "ME": ce.MEstimateEncoder,                   # M-estimate encoder
        "OneHot": ce.OneHotEncoder,
        "OridinalEncoder": ce.OrdinalEncoder,        # (sic) ordinal coding
        "Sum": ce.SumEncoder,                        # sum coding
        "Polynomial": ce.PolynomialEncoder,          # polynomial coding
        "Target": ce.TargetEncoder,                  # target coding
        "WOE": ce.WOEEncoder,                        # weight-of-evidence coding
    }
    if encoder_type not in encoder_classes:
        raise ValueError("请选择正确的编码方式")
    self.encoder = encoder_classes[encoder_type](cols=columns_name)
def main():
    """Load, preprocess and encode the income-prediction train/test CSVs,
    align their dummy columns, and hand them to process_data()."""
    cat_columns = [GENDER, HAIR_COLOR, WEARS_GLASSES, PROFESSION, COUNTRY]
    training = read_data(
        "tcd ml 2019-20 income prediction training (with labels).csv")
    training = preprocess_data(training, True)
    # print(training.head())
    testing = read_data(
        "tcd ml 2019-20 income prediction test (without labels).csv")
    testing = preprocess_data(testing, False)

    # Low-cardinality columns -> dummies; high-cardinality -> binary encoding.
    training = pd.get_dummies(training,
                              columns=[GENDER, HAIR_COLOR, WEARS_GLASSES])
    ce_bin = ce.BinaryEncoder(cols=[PROFESSION, COUNTRY])
    training = ce_bin.fit_transform(training)
    cat_dummies = [
        col for col in training
        if "_" in col and col.split("_")[0] in cat_columns
    ]
    # print(training.dtypes)
    # print(cat_dummies)
    processed_cols = list(training.columns[:])

    testing = pd.get_dummies(testing,
                             columns=[GENDER, HAIR_COLOR, WEARS_GLASSES])
    # BUG FIX: fit_transform() re-fitted the encoder on the test frame, so
    # test rows could receive different binary codes than training rows.
    # Reuse the training fit with transform() instead.
    testing = ce_bin.transform(testing)

    # Drop test-only dummy columns; add training-only ones as all-zero.
    for col in testing.columns:
        if ("_" in col) and (col.split("_")[0]
                             in cat_columns) and col not in cat_dummies:
            print("Removing additional feature {}".format(col))
            testing.drop(col, axis=1, inplace=True)
    for col in cat_dummies:
        if col not in testing.columns:
            print("Adding missing feature {}".format(col))
            testing[col] = 0

    # NOTE(review): X and y are built but unused below — confirm whether
    # process_data() was meant to receive them.
    X = training.loc[:, training.columns != INCOME]
    X = X.loc[:, X.columns != "Instance"]
    y = training[INCOME]
    process_data(testing, training)