def target_encoding(train: pd.DataFrame, test: pd.DataFrame, encode_col: List[str], target_col: str, cv) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Append target-encoded copies of the given columns to train and test.

    Each encoded column is added alongside the originals under a ``te_``
    prefix; the original frames' columns are left untouched.

    Args:
        train (pd.DataFrame): training frame (must contain ``target_col``)
        test (pd.DataFrame): test frame
        encode_col (List[str]): columns to target-encode
        target_col (str): name of the target column in ``train``
        cv (sklearn.model_selection._BaseKFold, optional): CV splitter
            passed through to ``TargetEncoder``

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: (train, test) with ``te_*``
        columns appended.
    """
    warnings.simplefilter('ignore')

    encoder = TargetEncoder(cv=cv)

    # Fit on train (out-of-fold via cv inside TargetEncoder), then apply
    # the learned mapping to test.
    encoded_trn = encoder.fit_transform(train[encode_col], train[target_col])
    encoded_trn.columns = [f'te_{c}' for c in encoded_trn.columns]

    encoded_tst = encoder.transform(test[encode_col])
    encoded_tst.columns = [f'te_{c}' for c in encoded_tst.columns]

    train = pd.concat([train, encoded_trn], axis=1)
    test = pd.concat([test, encoded_tst], axis=1)
    return train, test
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    """Target-encode all columns and dump train/test feature matrices.

    Reads the raw CSVs, fits a CV-based TargetEncoder on the training
    rows only, applies it to the combined train+test frame, writes a
    feature-map file (one ``index\\tname\\tq`` line per column), and
    saves the encoded matrices with ``save_data``.

    Args:
        train_file: path to raw training CSV (indexed by ID_COL,
            contains TARGET_COL)
        test_file: path to raw test CSV (indexed by ID_COL)
        train_feature_file: output path for the train feature matrix
        test_feature_file: output path for the test feature matrix
        feature_map_file: output path for the feature-map text file
    """
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL]
    n_trn = trn.shape[0]
    trn.drop(TARGET_COL, axis=1, inplace=True)
    # Fix: the original string lacked the f-prefix (logged the literal
    # braces) and carried a stray ')' inside the message.
    logging.info(f'categorical: {trn.shape[1]}')

    # Stack train on top of test so both get encoded with one transform;
    # rows are split back apart by position below.
    df = pd.concat([trn, tst], axis=0)

    logging.info('label encoding categorical variables')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    te.fit(trn, y)
    # NOTE(review): transform(df) applies the full-fit mapping to the
    # train rows too (no out-of-fold values for train) — confirm intended.
    df = te.transform(df)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            # 'q' marks a quantitative (continuous) feature.
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y.values, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
def target_encoding(train: pd.DataFrame, test: pd.DataFrame, encode_col, target_col, cv):
    """Target-encode the given columns and return one stacked frame.

    Fits a CV-based TargetEncoder on the training rows, encodes both
    frames, prefixes the encoded columns with ``te_``, and returns the
    train and test encodings stacked row-wise with a fresh 0..n-1 index
    (train rows first, then test rows).
    """
    warnings.simplefilter('ignore')

    encoder = TargetEncoder(cv=cv)

    trn_encoded = encoder.fit_transform(train[encode_col], train[target_col])
    tst_encoded = encoder.transform(test[encode_col])

    for frame in (trn_encoded, tst_encoded):
        frame.columns = [f'te_{c}' for c in frame.columns]

    stacked = pd.concat([trn_encoded, tst_encoded])
    return stacked.reset_index(drop=True)
def target_encoding(train: pd.DataFrame, test: pd.DataFrame, col_definition: dict, option: dict):
    """Append ``te_``-prefixed target-encoded columns to train and test.

    col_definition: dict with keys ``encode_col`` (columns to encode)
        and ``target_col`` (target column name in ``train``)
    option: dict with key ``cv`` (CV splitter for TargetEncoder)
    """
    warnings.simplefilter('ignore')

    encode_cols = col_definition['encode_col']
    target = col_definition['target_col']

    encoder = TargetEncoder(cv=option['cv'])

    trn_enc = encoder.fit_transform(train[encode_cols], train[target])
    trn_enc.columns = [f'te_{c}' for c in trn_enc.columns]

    tst_enc = encoder.transform(test[encode_cols])
    tst_enc.columns = [f'te_{c}' for c in tst_enc.columns]

    return (pd.concat([train, trn_enc], axis=1),
            pd.concat([test, tst_enc], axis=1))
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    """Target-encode all non-ID, non-target columns and dump features.

    Reads the raw CSVs, fit_transforms a CV-based TargetEncoder on the
    training rows, transforms the test rows with the learned mapping,
    writes a feature-map file, and saves both matrices via ``save_data``.

    Args:
        train_file: path to raw training CSV (contains ID_COL, TARGET_COL)
        test_file: path to raw test CSV
        train_feature_file: output path for the train feature matrix
        test_feature_file: output path for the test feature matrix
        feature_map_file: output path for the feature-map text file
    """
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL]
    features = [x for x in trn.columns if x not in [ID_COL, TARGET_COL]]

    logging.info('target encoding')
    # Fix: random_state with shuffle=False is rejected by scikit-learn
    # (>=0.24 raises ValueError); shuffle=True also matches the sibling
    # generate_feature implementations.
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    trn[features] = te.fit_transform(trn[features], y)
    tst[features] = te.transform(tst[features])

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(features):
            # Fix: target-encoded values are continuous, so mark them 'q'
            # (quantitative) as the sibling feature maps do, not 'int'.
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn[features].values, y.values, train_feature_file)
    save_data(tst[features].values, None, test_feature_file)
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    """Build cat-in-the-dat features and dump train/test matrices.

    Binary columns (``bin_0``..``bin_4``) are dummy-encoded with missing
    indicators; nominal, hex, ordinal, and cyclical columns are
    target-encoded with a CV-based TargetEncoder fitted on the training
    rows only. Writes a feature-map file and saves both matrices via
    ``save_data``.

    Args:
        train_file: path to raw training CSV (indexed by ID_COL,
            contains TARGET_COL)
        test_file: path to raw test CSV (indexed by ID_COL)
        train_feature_file: output path for the train feature matrix
        test_feature_file: output path for the test feature matrix
        feature_map_file: output path for the feature-map text file
    """
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    logging.info('label encoding categorical variables')
    y = trn.loc[:, TARGET_COL]
    n_trn = trn.shape[0]
    trn = trn.drop(TARGET_COL, axis=1)
    # Stack train on top of test so both get identical feature handling;
    # rows are split back apart by position below.
    df = pd.concat([trn, tst], axis=0)

    # column groups of the cat-in-the-dat dataset
    features_bin = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
    features_cat = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
    features_hex = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
    features_ord = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
    features_cyc = ['day', 'month']

    logging.info("Dummy encode: bin 0 to 4")
    # convert bins 0, 1, 2 to object so that get_dummies recognizes them
    # and creates missing indicators (dummy_na=True)
    bin_012 = ['bin_0', 'bin_1', 'bin_2']
    df[bin_012] = df[bin_012].astype(object)
    dummies = pd.get_dummies(df[features_bin], dummy_na=True)
    df = df.drop(features_bin, axis=1)
    df = pd.concat([df, dummies], axis=1)

    logging.info("Target encoding: nom 0 to 9 and cyclical features")
    target_enc_cols = features_ord + features_cat + features_hex + features_cyc
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    te.fit(trn.loc[:, target_enc_cols], y)
    # NOTE(review): transform(df) applies the full-fit mapping to the
    # train rows too (no out-of-fold values for train) — confirm intended.
    df.loc[:, target_enc_cols] = te.transform(df.loc[:, target_enc_cols])

    # (removed: a large block of commented-out manual ordinal maps and
    # cyclical dummies superseded by the target encoding above)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            # 'q' marks a quantitative (continuous) feature.
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y.values, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
def test_TargetEncoder(generate_data):
    """TargetEncoder keeps one output column per input column, with and
    without CV, and whether fit/transform are combined or separate."""
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    # treat low-cardinality columns as categorical
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]
    n_cat = len(cat_cols)

    # no CV
    encoder = TargetEncoder()
    encoded = encoder.fit_transform(df[cat_cols], df[TARGET_COL])
    print('Without CV:\n{}'.format(encoded.head()))
    assert encoded.shape[1] == n_cat

    # with CV, single fit_transform call
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    encoder = TargetEncoder(cv=cv)
    encoded = encoder.fit_transform(df[cat_cols], df[TARGET_COL])
    print('With CV (fit_transform()):\n{}'.format(encoded.head()))
    assert encoded.shape[1] == n_cat

    # with CV, fit then transform
    encoder = TargetEncoder(cv=cv)
    encoder.fit(df[cat_cols], df[TARGET_COL])
    encoded = encoder.transform(df[cat_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(
        encoded.head()))
    assert encoded.shape[1] == n_cat
def test_TargetEncoder():
    """TargetEncoder keeps one output column per input column on random
    integer categories, with and without CV."""
    # random integer categories with a continuous target
    df = pd.DataFrame(np.random.randint(0, N_CATEGORY,
                                        size=(N_OBS, N_FEATURE)),
                      columns=['c{}'.format(x) for x in range(N_FEATURE)])
    feature_cols = df.columns
    df[TARGET_COL] = np.random.rand(N_OBS)
    n_feat = len(feature_cols)

    # no CV
    encoder = TargetEncoder()
    encoded = encoder.fit_transform(df[feature_cols], df[TARGET_COL])
    print('Without CV:\n{}'.format(encoded.head()))
    assert encoded.shape[1] == n_feat

    # with CV, single fit_transform call
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    encoder = TargetEncoder(cv=cv)
    encoded = encoder.fit_transform(df[feature_cols], df[TARGET_COL])
    print('With CV (fit_transform()):\n{}'.format(encoded.head()))
    assert encoded.shape[1] == n_feat

    # with CV, fit then transform
    encoder = TargetEncoder(cv=cv)
    encoder.fit(df[feature_cols], df[TARGET_COL])
    encoded = encoder.transform(df[feature_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(
        encoded.head()))
    assert encoded.shape[1] == n_feat