def target_encode():
    """Target-encode the object-dtype columns of tr/te and save them as CSVs.

    Reads ./data/tr.csv and ./data/te.csv, fits a TargetEncoder against the
    integer TARGET column of the training frame, keeps only the encoded
    categorical columns (renamed with a ``TE_`` prefix), and writes the
    results to ./data/target_tr.csv and ./data/target_te.csv.
    """
    from category_encoders.target_encoder import TargetEncoder

    tr = pd.read_csv('./data/tr.csv')
    te = pd.read_csv('./data/te.csv')
    y = tr['TARGET'].astype(int)
    tr.drop(['TARGET'], axis=1, inplace=True)

    # Only object-dtype (string) columns get target-encoded.
    cate_col = [col for col in tr.columns if tr[col].dtype == 'object']

    encode_model = TargetEncoder(verbose=1, min_samples_leaf=100)
    encode_model.fit(tr, y)

    # Transform both frames with the encoder fit on train, then keep just
    # the encoded categorical columns under a TE_ prefix.
    tr = encode_model.transform(tr)[cate_col]
    te = encode_model.transform(te)[cate_col]
    prefixed = ['TE_' + col for col in cate_col]
    tr.columns = prefixed
    te.columns = prefixed

    print(tr.info())
    print(te.info())
    tr.to_csv("./data/target_tr.csv", index=False)
    te.to_csv("./data/target_te.csv", index=False)
def fit(self, X_df, y=None):
    """Fit the per-column encoders used by this transformer.

    Target-encodes ``property_type`` and ``cancellation_policy`` (rare
    categories collapsed to 'other'), ordinal-encodes ``room_type`` and
    one-hot-encodes ``city_origin``.  Returns ``self``.
    """
    def _bucket(value, keep):
        # Collapse any category outside ``keep`` into a shared bucket.
        return value if value in keep else 'other'

    self.prop_to_keep = [
        'Apartment', 'Serviced apartment', 'Condominium', 'Loft'
    ]
    self.prop_transformer = TargetEncoder()
    grouped_prop = X_df['property_type'].apply(
        lambda v: _bucket(v, self.prop_to_keep))
    self.prop_transformer.fit(grouped_prop, y)

    self.pol_to_keep = [
        'flexible', 'strict_14_with_grace_period', 'moderate', 'moderate_new'
    ]
    self.pol_transformer = TargetEncoder()
    grouped_pol = X_df['cancellation_policy'].apply(
        lambda v: _bucket(v, self.pol_to_keep))
    self.pol_transformer.fit(grouped_pol, y)

    self.room_transformer = OrdinalEncoder()
    self.room_transformer.fit(X_df['room_type'])

    # handle_unknown='ignore' so unseen cities at transform time do not raise.
    self.city_transformer = OneHotEncoder(handle_unknown='ignore')
    self.city_transformer.fit(pd.DataFrame(X_df['city_origin']))
    return self
def target_encoder(params):
    """Target-encode one feature given ``(train_values, test_values, target)``.

    ``params`` is a 3-tuple of array-likes; values are cast to str before
    encoding.  Returns the flattened encoded train and test arrays.
    """
    train_col = params[0].astype('str')
    test_col = params[1].astype('str')
    target = params[2]
    enc = TargetEncoder(return_df=False)
    # Encoder expects 2-D input; reshape single columns accordingly.
    encoded_train = enc.fit_transform(train_col.reshape(-1, 1),
                                      target.reshape(-1, 1))
    encoded_test = enc.transform(test_col.reshape(-1, 1))
    return encoded_train.flatten(), encoded_test.flatten()
def fit_target_encoder(train_imputed_categorical_df: pd.DataFrame,
                       train_transformed_target: pd.DataFrame):
    """Fit a TargetEncoder over every column of the categorical frame.

    Returns the fitted encoder (the input frames are not modified).
    """
    encoder = TargetEncoder(cols=train_imputed_categorical_df.columns.values)
    encoder.fit(X=train_imputed_categorical_df, y=train_transformed_target)
    return encoder
def label_encoding_fit(X, y, cols):
    '''Fit one TargetEncoder per column and persist each via write_encoder.

    Despite the function name, the encoders are target encoders; they are
    saved under the 'label' tag for downstream lookup.  Returns None.
    '''
    for col in cols:
        print("Encoding for column: {}".format(col))
        fitted = TargetEncoder(cols=[col])
        fitted.fit(X[col], y)
        write_encoder(fitted, 'label', col)
    return
def feature_importance(url, dataloaded, rows):
    """Dash callback: render a random-forest feature-importance bar chart.

    Parameters
    ----------
    url : str
        Page URL; the dataset id is parsed from the ``data/<id>`` segment.
    dataloaded :
        Upstream load sentinel; ``None`` means no dataset is available.
    rows : list
        Metadata table rows; the row with Target == "true" names the
        target attribute and its data type.

    Returns
    -------
    tuple
        (Dash component or message, status string).
    """
    # If dataset is not loaded
    if dataloaded is None:
        return [], "No file"
    # Get dataset if pickle exists
    data_id = int(re.search(r"data/(\d+)", url).group(1))
    try:
        df = pd.read_pickle("cache/df" + str(data_id) + ".pkl")
    except OSError:
        return [], "No file"
    # Get table of metadata
    meta_data = pd.DataFrame(rows)
    try:
        target_attribute = meta_data[meta_data["Target"] == "true"][
            "Attribute"
        ].values[0]
        target_type = meta_data[meta_data["Target"] == "true"]["DataType"].values[0]
    except IndexError:
        return "No target found", "No target found"
    # Feature importance bar plot
    from category_encoders.target_encoder import TargetEncoder
    x = df.drop(target_attribute, axis=1)
    y = df[target_attribute]
    te = TargetEncoder()
    if target_type == "nominal" or target_type == "string":
        # Classification: integer-code the target, then target-encode features.
        y = pd.Categorical(y).codes
        x = clean_dataset(x)
        x = te.fit_transform(x, y)
        rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
        rf.fit(x, y)
    else:
        # Numeric target: regression forest.
        x = clean_dataset(x)
        x = te.fit_transform(x, y)
        rf = RandomForestRegressor(n_estimators=10, n_jobs=-1)
        rf.fit(x, y)
    fi = pd.DataFrame(
        rf.feature_importances_, index=x.columns, columns=["importance"]
    )
    fi = fi.sort_values("importance", ascending=False).reset_index()
    # Horizontal bars, most important feature first.
    trace = go.Bar(y=fi["index"], x=fi["importance"], name="fi", orientation="h")
    layout = go.Layout(
        autosize=False, margin={"l": 100, "t": 0}, height=500, hovermode="closest"
    )
    figure = go.Figure(data=[trace], layout=layout)
    # Cache the importances so other callbacks can reuse them.
    fi.to_pickle("cache/fi" + str(data_id) + ".pkl")
    return html.Div(dcc.Graph(figure=figure), className="twelve columns"), "done"
def target_encoder(cols, train_set, train_y, test_set):
    """Target-encode ``cols`` of the train/test frames.

    ``handle_unknown`` and ``handle_missing`` accept only 'error',
    'return_nan' and 'value'; both default to 'value', which fills unknown
    or missing categories with the training-set target mean.
    """
    fitted = TargetEncoder(cols=cols,
                           handle_unknown='value',
                           handle_missing='value').fit(train_set, train_y)
    # Apply the train-fitted mapping to both frames.
    return fitted.transform(train_set), fitted.transform(test_set)
def encode_features(features, labels):
    """Encode categorical features with TargetEncoder.

    Fits on all columns of ``features`` against ``labels``, prints the
    elapsed time, and returns the encoded DataFrame.
    """
    cols = features.columns.values.tolist()
    start_time = time.time()
    fitted = TargetEncoder(cols=cols, return_df=True).fit(features, labels)
    encoded_features = fitted.transform(features)
    print("--- %s seconds ---" % (time.time() - start_time))
    return encoded_features
class target_enc(BaseEstimator, TransformerMixin):
    """Sklearn-style wrapper around category_encoders' TargetEncoder."""

    def __init__(self, columns):
        # Columns to be target-encoded.
        self.columns = columns

    def fit(self, df, y=None):
        """Fit the wrapped encoder on ``df`` against ``y``."""
        enc = TargetEncoder(handle_unknown='value', cols=self.columns)
        self.encoder = enc.fit(df, y)
        return self

    def transform(self, df, y=None):
        """Return the encoded frame; works on a copy so ``df`` is untouched."""
        return self.encoder.transform(df.copy(), y)
def feature_importance(url, tab3, rows):
    """Dash callback: random-forest feature-importance bar plot.

    Parses the dataset id from ``url``, loads the cached frame, picks the
    target from the metadata ``rows``, fits a small random forest on
    target-encoded features, and returns (graph component, status string).
    """
    # NOTE(review): pattern contains \d in a non-raw string — works today but
    # is a deprecated escape; consider r'data/(\d+)' when editing this line.
    data_id = int(re.search('data/(\d+)', url).group(1))
    try:
        df = pd.read_pickle('cache/df' + str(data_id) + '.pkl')
    except OSError:
        return [], "No file"
    meta_data = pd.DataFrame(rows)
    try:
        target_attribute = meta_data[meta_data["Target"] == "true"]["Attribute"].values[0]
        target_type = (
            meta_data[meta_data["Target"] == "true"]["DataType"].values[0])
    except IndexError:
        return "No target found", "No target found"
    # Feature importance bar plot
    from category_encoders.target_encoder import TargetEncoder
    x = df.drop(target_attribute, axis=1)
    y = df[target_attribute]
    te = TargetEncoder()
    if target_type == "nominal" or target_type == "string":
        # Classification: integer-code the target first.
        y = pd.Categorical(y).codes
        x = clean_dataset(x)
        x = te.fit_transform(x, y)
        rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
        rf.fit(x, y)
    else:
        # Numeric target: regression forest.
        x = clean_dataset(x)
        x = te.fit_transform(x, y)
        rf = RandomForestRegressor(n_estimators=10, n_jobs=-1)
        rf.fit(x, y)
    fi = pd.DataFrame(rf.feature_importances_,
                      index=x.columns,
                      columns=['importance'])
    fi = fi.sort_values('importance', ascending=False).reset_index()
    # Horizontal bars, most important feature first.
    trace = go.Bar(y=fi['index'], x=fi['importance'], name='fi', orientation='h')
    layout = go.Layout(autosize=False,
                       margin=dict(l=100),
                       width=800,
                       height=500,
                       hovermode='closest')
    figure = go.Figure(data=[trace], layout=layout)
    # Cache importances for reuse by other callbacks.
    fi.to_pickle('cache/fi' + str(data_id) + '.pkl')
    return html.Div(dcc.Graph(figure=figure)), "done"
def target_encode(data, label, encoder=None):
    """Target-encode ``data`` against ``label``.

    :param data: feature values to encode
    :param label: target values
    :param encoder: if supplied, used to transform ``data``; otherwise a new
        TargetEncoder is fit on ``data``/``label``
    :return: (encoder, encoded data)
    """
    if encoder is not None:
        # Reuse the caller's fitted encoder.
        return encoder, encoder.transform(data, label)
    encoder = TargetEncoder()
    encoded = encoder.fit_transform(data, label)
    return encoder, encoded
def create_features(self):
    """Target-encode dating_period against log1p(likes) for train and test.

    Train and test are concatenated so every category is encoded with the
    same mapping; the result is split back by the original train length.
    """
    combined = pd.concat([train, test]).reset_index(drop=True)
    encoded = TargetEncoder(smoothing=0.1).fit_transform(
        combined['dating_period'].astype(object),
        np.log1p(combined['likes'])
    ).rename(columns={'dating_period': 'targetencoding_dating_period'})
    n_train = len(train)
    self.train = encoded[:n_train]
    self.test = encoded[n_train:].reset_index(drop=True)
def target_encoder(cols, train_set, train_y, test_set):
    """Mean-target-encode nominal features (no intrinsic order, >4 levels).

    Target encoding replaces each category with the mean target value
    observed for that category.  ``handle_unknown`` / ``handle_missing``
    accept only 'error', 'return_nan' and 'value'; both default to
    'value', which fills unseen or missing categories with the
    training-set target mean.
    """
    fitted = TargetEncoder(cols=cols,
                           handle_unknown='value',
                           handle_missing='value').fit(train_set, train_y)
    # Apply the train-fitted mapping to both frames.
    return fitted.transform(train_set), fitted.transform(test_set)
def onehot_or_targ(X, y, categorical, k):
    '''
    Encode categorical variables, choosing the scheme per column by
    cardinality.

    Parameters:
    -----------
    X: pd.DataFrame
        Feature frame (target column excluded).
    y: pd.Series
        Target series.
    categorical: list
        Names of the categorical columns.
    k: int
        Cardinality threshold: columns with more than k distinct values
        are target-encoded, the rest are one-hot encoded.

    Returns:
    --------
    pd.DataFrame, pd.Series
        Encoded feature frame and (possibly integer-coded) target.
    '''
    for col in categorical:
        if X[col].unique().size > k:
            # High cardinality: target-encode (integer-code category dtypes first).
            if X[col].dtype.name == 'category':
                X[col] = X[col].cat.codes
            if y.dtype.name == 'category':
                y = y.cat.codes
            X = TargetEncoder(cols=[col]).fit_transform(X, y)
        else:
            X = OneHotEncoder(cols=[col]).fit_transform(X)
    return X, y
def create_features(self):
    """Target-encode technique names against log1p(likes), averaged per object.

    Each object can have several techniques; the encoded values are grouped
    by object_id and averaged, then joined back onto train and test.
    """
    merged = technique_df.merge(train[['object_id', 'likes']],
                                on='object_id', how='left')
    encoded_names = TargetEncoder(smoothing=0.1).fit_transform(
        merged['name'], np.log1p(merged['likes']))
    group = (pd.concat([merged['object_id'], encoded_names], axis=1)
             .groupby('object_id')
             .mean()
             .rename(columns={'name': 'targetencoding_technique'}))
    self.train = train[['object_id']].merge(
        group, on='object_id', how='left').drop(columns='object_id', axis=1)
    self.test = test[['object_id']].merge(
        group, on='object_id', how='left').drop(columns='object_id', axis=1)
def _encode():
    """Target-encode the application categorical columns and save to feather.

    Reads the preprocessed train/test frames, fits one TargetEncoder on
    their concatenation so both share a single mapping, and writes the
    encoded columns (suffixed ``_ENC``) plus ``SK_ID_CURR`` to
    ``./data/app.enc.feather``.

    NOTE(review): the encoder is fit on train+test together, and test rows
    presumably have no TARGET value — confirm TargetEncoder handles those
    rows as intended.
    """
    train = pd.read_feather('./data/application_train.preprocessed.feather')
    test = pd.read_feather('./data/application_test.preprocessed.feather')
    # One frame so train and test get identical category -> value mappings.
    df = pd.concat([train, test], sort=False).reset_index(drop=True)
    cols = [
        'CODE_GENDER',
        'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'NAME_TYPE_SUITE',
        'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE',  # Level of highest education the client achieved,  # noqa
        'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE',
        'FLAG_MOBIL',
        'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE',
        'FLAG_CONT_MOBILE',
        'FLAG_PHONE',
        'FLAG_EMAIL',
        'OCCUPATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START',
        'HOUR_APPR_PROCESS_START',
        'REG_REGION_NOT_LIVE_REGION',
        'REG_REGION_NOT_WORK_REGION',
        'LIVE_REGION_NOT_WORK_REGION',
        'REG_CITY_NOT_LIVE_CITY',
        'REG_CITY_NOT_WORK_CITY',
        'LIVE_CITY_NOT_WORK_CITY',
        'ORGANIZATION_TYPE',
        'FONDKAPREMONT_MODE',
        'HOUSETYPE_MODE',
        'WALLSMATERIAL_MODE',
        'EMERGENCYSTATE_MODE',
        'NAME_CONTRACT_TYPE',  # Identification if loan is cash or revolving,
    ]
    encoder = TargetEncoder(cols=cols)
    encoder.fit(df[cols], df['TARGET'])
    res = encoder.transform(df[cols])
    # Suffix the encoded columns so they don't collide with the originals.
    res.columns = ['{}_ENC'.format(c) for c in res.columns]
    # Carry the row key so downstream joins are possible.
    res['SK_ID_CURR'] = df['SK_ID_CURR']
    res.to_feather('./data/app.enc.feather')
def target_encoder(self, df, configger):
    """Run a configurable TargetEncoder over the columns named in the config.

    :param df: the train dataset.
    :param configger: JSON-style config string; recognised keys:
        drop_invariant (bool, default False),
        handle_missing / handle_unknown ('error' | 'return_nan' | 'value',
        default 'value' — fills with the target mean),
        min_samples_leaf (int, default 1 — minimum samples before a
        category's own average is used),
        smoothing (float > 0, default 1.0 — larger means stronger
        regularization toward the prior).
    :return: the transformed DataFrame.
    """
    X, y, encode_col = self.get_Xy(df, configger)
    # Pull each option from the config, falling back to library defaults.
    drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
    handle_missing = set_default_vale("handle_missing", configger, "value")
    handle_unknown = set_default_vale("handle_unknown", configger, "value")
    min_samples_leaf = set_default_vale("min_samples_leaf", configger, 1)
    smoothing = set_default_vale("smoothing", configger, 1.0)

    encoder = TargetEncoder(
        verbose=1,
        cols=encode_col,
        drop_invariant=drop_invariant,
        return_df=True,
        handle_missing=handle_missing,
        handle_unknown=handle_unknown,
        min_samples_leaf=min_samples_leaf,
        smoothing=smoothing,
    )
    return encoder.fit_transform(X, y)
def _feature_encode(self, data):
    """Encode categorical features of ``data`` in place.

    Per categorical column: merges rare categories (<1% frequency) into
    'Rare' when the column's unique ratio is below 0.1, then target-encodes
    high-cardinality columns (>10 distinct values) and one-hot encodes the
    rest via get_dummies.  Finally re-extracts the target frame.

    NOTE(review): rare-category merging runs independently on train_df and
    test_df, so the same category can become 'Rare' in one frame but not
    the other; likewise get_dummies on the two frames separately can yield
    mismatched column sets — confirm downstream column alignment.
    """
    dummy_cols = []
    for col in data.cat_features:
        # merge categorical features with low frequencies
        if data.train_df[col].nunique() / len(data.train_df[col]) < 0.1:
            for name, count in data.train_df[col].value_counts().items():
                if count / len(data.train_df[col]) < 0.01:
                    data.train_df[col].replace(name, 'Rare', inplace=True)
        if data.test_df[col].nunique() / len(data.test_df[col]) < 0.1:
            for name, count in data.test_df[col].value_counts().items():
                if count / len(data.test_df[col]) < 0.01:
                    data.test_df[col].replace(name, 'Rare', inplace=True)
        # target-encode categorical features with high number of unique values
        if data.train_df[col].nunique() > 10:
            from category_encoders.target_encoder import TargetEncoder
            encoder = TargetEncoder(cols=col)
            encoder.fit(data.train_df[col], data.train_df[data.target_var])
            data.train_df[col] = encoder.transform(data.train_df[col])
            data.test_df[col] = encoder.transform(data.test_df[col])
        else:
            dummy_cols.append(col)
    # create dummy variables from categorical features with low number of unique values
    data.train_df = pd.get_dummies(data.train_df, columns=dummy_cols, drop_first=True)
    data.test_df = pd.get_dummies(data.test_df, columns=dummy_cols, drop_first=True)
    # Re-extract the target after the frame has been rebuilt above.
    data.target_df = data.train_df[data.target_var]
def target_encode_Stores(df, enc=None):
    """Target encode the Store variable using the category_encoders module.

    Args:
        df: Data; must contain 'Sales' (the target) and 'Store' columns.
        enc: Existing Encoder / if None retrain new encoder.

    Returns:
        (encoded Store values, encoder used)

    Fix: the optional-encoder check now uses ``enc is None`` rather than
    the truthiness test ``not enc`` — the idiomatic and safer test for an
    optional argument.
    """
    target = df['Sales'].values
    stores = df['Store'].astype(str)
    if enc is None:
        print("Fit TargetEncoder...")
        enc = TargetEncoder()
        new_store = enc.fit_transform(stores, target)
    else:
        print("Transform using existing TargetEncoder...")
        new_store = enc.transform(stores, target)
    # Write the encoded values back into the caller's frame.
    df.loc[:, 'Store'] = new_store
    return new_store, enc
def _get_single_encoder(encoder_name: str, cols, smoothing):
    """
    Get encoder by its name

    :param encoder_name: Name of desired encoder (only "TargetEncoder" is supported)
    :param cols: Cat columns for encoding
    :param smoothing: smoothing passed through to the encoder
    :return: Categorical encoder
    :raises ValueError: for any unrecognised encoder name
    """
    # Guard clause: anything other than TargetEncoder is rejected.
    if encoder_name != "TargetEncoder":
        raise ValueError('NO ENCODER FOUND')
    return TargetEncoder(cols=cols, smoothing=smoothing)
def target_encode_custom(df: pd.DataFrame, name: str, enc=None):
    """Target encode the column ``name`` using the category_encoders module.

    Args:
        df: Data; must contain 'Sales' (the target) and the ``name`` column.
        name (str): name of the column to encode.
        enc: Existing Encoder / if None retrain new encoder.

    Returns:
        (encoded column values, encoder used)

    Fix: the optional-encoder check now uses ``enc is None`` rather than
    the truthiness test ``not enc`` — the idiomatic and safer test for an
    optional argument.
    """
    target = df['Sales'].values
    stores = df[name].astype(str)
    if enc is None:
        print("Fit TargetEncoder...")
        enc = TargetEncoder()
        new_store = enc.fit_transform(stores, target)
    else:
        print("Transform using existing TargetEncoder...")
        new_store = enc.transform(stores, target)
    # Write the encoded values back into the caller's frame.
    df.loc[:, name] = new_store
    return new_store, enc
def fit(self, x=None, y=None):
    """Fit a KMeans discretizer plus TargetEncoder for every column of ``x``.

    Each numeric column is clustered into ``self.n_clust[col]`` bins, and a
    TargetEncoder is fit on the resulting cluster labels.

    Bug fix: the original ignored the ``y`` argument and always encoded
    against the module-level ``train_y``.  ``y`` is now used when supplied;
    the global remains the fallback for backward compatibility.
    """
    target = train_y if y is None else y
    self.params = x.columns
    self.kmn_mod = {}
    self.trg_mod = {}
    for col in x.columns:
        column_2d = np.reshape(x[col].values, (-1, 1))
        # Discretize the column into clusters, then encode cluster ids.
        self.kmn_mod[col] = KMeans(n_clusters=self.n_clust[col])
        self.kmn_mod[col].fit(column_2d)
        tmp = pd.DataFrame([])
        tmp[col] = self.kmn_mod[col].predict(column_2d)
        self.trg_mod[col] = TargetEncoder()
        self.trg_mod[col].fit(tmp[col].astype("category"), target)
    return self
class DateTransformer(BaseEstimator, TransformerMixin):
    """Transforms DATE using target encoding of MONTH."""

    def __init__(self):
        # Fitted TargetEncoder and the stringified-month frame seen at fit time.
        self.encoder = None
        self.month = None

    def fit(self, X, y=None):
        """Extract month strings from every date column and fit the encoder."""
        self.month = X.apply(lambda col: col.apply(lambda d: str(d.month)))
        self.encoder = TargetEncoder().fit(self.month, y)
        return self

    def transform(self, X, y=None):
        """Return the target-encoded month values for ``X``."""
        month_frame = X.apply(lambda col: col.apply(lambda d: str(d.month)))
        return self.encoder.transform(month_frame)
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get encoder by its name

    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    :raises NotImplementedError: for an unrecognised encoder name

    Bug fix: ``encoder`` was never initialised, so an unknown name raised
    UnboundLocalError before reaching the intended NotImplementedError
    check.  Also removes a duplicated MEstimateEncoder branch.
    """
    encoder = None
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
def get_single_encoder(encoder_name: str, cat_cols: list):
    """Return the categorical encoder registered under ``encoder_name``.

    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    :raises ValueError: for an unrecognised encoder name

    Bug fix: ``encoder`` was never initialised (and the guard assert was
    commented out), so an unknown name crashed with UnboundLocalError; we
    now raise a clear ValueError instead.
    """
    encoder = None
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == 'OneHotEncoder':
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise ValueError(f"Unknown encoder name: {encoder_name}")
    return encoder
def fit(self, input_df):
    """Build per-object target-encoding aggregates of 'name' and transform.

    Loads the named data file and train.csv, drops names seen fewer than
    10 times, target-encodes the remaining names against log1p(likes),
    aggregates per object_id, and returns ``self.transform(input_df)``.
    """
    names_df = pd.read_csv(INPUT_PATH + f'{self.data_name}.csv')
    train = pd.read_csv(INPUT_PATH + 'train.csv')

    # Keep only names that occur at least 10 times.
    counts = names_df['name'].value_counts()
    frequent = counts[counts >= 10].index
    names_df = names_df[names_df['name'].isin(frequent)]

    names_df = names_df.merge(train[['object_id', 'likes']],
                              on='object_id', how='left')
    encoded = TargetEncoder(smoothing=0.1).fit_transform(
        names_df['name'], np.log1p(names_df['likes']))
    self.meta = (pd.concat([names_df['object_id'], encoded], axis=1)
                 .groupby('object_id')
                 .agg(['mean', 'sum', 'max', 'min', 'std']))
    # Flatten the (column, stat) MultiIndex down to the stat names.
    self.meta.columns = self.meta.columns.droplevel(0)
    return self.transform(input_df)
def target_encode(X, X_test, cols, y):
    """Target-encode ``cols`` of X/X_test; returns the two encoded frames."""
    encoder = TargetEncoder(cols=cols, return_df=True)
    encoded_train = encoder.fit_transform(X, y)
    # Test set is transformed with the mapping learned on the training set.
    encoded_test = encoder.transform(X_test)
    return (encoded_train, encoded_test)
,min_child_samples=200 ,colsample_bytree=.2 ,reg_alpha=.1 ,reg_lambda=.1 ) return lgbr # 本地验证 kf = KFold(n_splits=10, shuffle=True, random_state=100) devscore = [] for tidx, didx in kf.split(train.index): tf = train.iloc[tidx] df = train.iloc[didx] tt = target.iloc[tidx] dt = target.iloc[didx] te = TargetEncoder(cols=tecols) tf = te.fit_transform(tf, tt) df = te.transform(df) lgbr = makelgb() lgbr.fit(tf, tt) pre = lgbr.predict(df) fpr, tpr, thresholds = roc_curve(dt, pre) score = auc(fpr, tpr) devscore.append(score) print(np.mean(devscore)) # # 在整个train集上重新训练,预测test,输出结果 # lgbr = makelgb() # te = TargetEncoder(cols=tecols) # tf = te.fit_transform(train, target) # df = te.transform(test)
def lin_model(labelled_data, unlabelled_data):
    """
    Parameters: training dataframe, unknown dataframe
    Returns: results dataframe (Instance, Income)

    Drops NaN rows from the training data, forward-fills NaN in the test
    data, target-encodes non-numeric fields, scales values, 80/20 splits
    to validate the model, selects features with RFECV (lasso estimator,
    cv=5), and predicts with an 11-nearest-neighbour regressor weighted
    by distance.
    """
    print("cleaning data...")
    clean_labelled = labelled_data.dropna()
    clean_unlabelled = unlabelled_data[all_columns]
    # not ideal but fillna the mean freezes for some reason
    clean_unlabelled = clean_unlabelled.fillna(method="ffill")

    clean_labelled = constrain_col_vals(clean_labelled)
    clean_unlabelled = constrain_col_vals(clean_unlabelled)
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split
    train_data, train_target, test_data, test_target = split_data(clean_labelled)

    print("target encoding data...")
    tar_encode = TargetEncoder()
    train_data = tar_encode.fit_transform(train_data, train_target)
    test_data = tar_encode.transform(test_data)
    unknown_data = tar_encode.transform(unknown_data)

    print("scaling values...")
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("selecting features...")
    # Recursive feature elimination around a lasso estimator.
    selector = RFECV(lm.Lasso(), cv=5)
    train_data = selector.fit_transform(train_data, train_target)
    test_data = selector.transform(test_data)
    unknown_data = selector.transform(unknown_data)

    print("fitting model...")
    neigh = KNeighborsRegressor(n_neighbors=11, weights="distance")
    neigh.fit(train_data, train_target)

    print("analysing test results...")
    # Validate on the held-out 20%.
    test_result = neigh.predict(test_data)
    error = np.sqrt(mean_squared_error(test_target, test_result))
    variance = explained_variance_score(test_target, test_result)
    print("Root mean squared error of test data: ", error)
    print("Variance: ", variance)

    print("predicting unknown data...")
    values = neigh.predict(unknown_data)
    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Income": values.flatten()
    })
    print("Finished.")
    return results
# Deal with unknown values: "?" placeholders become NaN, then impute.
# (np.nan instead of np.NaN — the NaN alias was removed in NumPy 2.0.)
DATA.replace("?", np.nan, inplace=True)
DATA["collision_type"].fillna(DATA["collision_type"].mode()[0], inplace=True)
DATA["property_damage"].fillna(False, inplace=True)
DATA["police_report_available"].fillna(False, inplace=True)

# Replace strings with True/False values
DATA = DATA.replace(("YES", "Y", "NO", "N"), (True, True, False, False))

# Separate the data into features and labels
FEATURES, LABELS = DATA.drop(["fraud_reported"], axis=1), DATA["fraud_reported"]

# Use target encoding with smoothing for categorical features (strings)
FEATURES = TargetEncoder().fit(FEATURES, LABELS).transform(FEATURES, LABELS)

# Use SMOTE oversampling with ENN undersampling to balance the dataset
# NOTE(review): fit_sample was renamed fit_resample in imbalanced-learn 0.4
# and removed in 0.8 — confirm the pinned version before upgrading.
FEATURES, LABELS = SMOTEENN().fit_sample(FEATURES, LABELS.values.ravel())

# Split the dataset into test and train datasets
TRAIN_FEATURES, TEST_FEATURES, TRAIN_LABELS, TEST_LABELS = train_test_split(
    FEATURES, LABELS)

# Create hyperparameter combinations to test using cross validation
N_ESTIMATORS_PARAMS = [300, 500, 700, 900, 1100]
CRITERION_PARAMS = ["gini", "entropy"]
COMBOS = get_combos(N_ESTIMATORS_PARAMS, CRITERION_PARAMS)
SCORES = []

# Create a classifier with each combination of hyperparameters and measure its