def make_cat_features(df, filekey):
    mkdir_func(f'../features/{filekey}')
    train = df[~df[target].isnull()]
    test = df[df[target].isnull()]
    categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

    #========================================================================
    # Categorical Feature Encode
    #========================================================================
    # Factorize
    logger.info("Factorize Start!!")
    for col in categorical_features:
        train[f"lbl_{col}@"], indexer = pd.factorize(train[col])
        test[f"lbl_{col}@"] = indexer.get_indexer(test[col])

    # Count Encoding
    logger.info("Count Encoding Start!!")
    for col in categorical_features:
        train = cnt_encoding(train, col, ignore_list=ignore_list)
        test = cnt_encoding(test, col, ignore_list=ignore_list)

    #========================================================================
    # Categorical Feature Save
    #========================================================================
    logger.info("Saving Features...")
    for col in train.columns:
        if col.count('@'):
            result_train = train[col].values
            result_test = test[col].values
            logger.info(f"COL: {col} | LENGTH: {len(result_train)}")
            utils.to_pkl_gzip(obj=result_train, path=f'../features/{filekey}/train_{col}')
            utils.to_pkl_gzip(obj=result_test, path=f'../features/{filekey}/test_{col}')
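# NOTE: cnt_encoding above is imported from this repo's feature utilities and
# its real body lives elsewhere. A minimal sketch of a compatible version,
# assuming it appends a count-encoded column tagged 'cntec' plus the '@'
# marker that the save loops key on:
def cnt_encoding(df, col, ignore_list=[]):
    if col in ignore_list:
        return df
    cnt_map = df[col].value_counts()            # frequency of each category value
    df[f'cntec_{col}@'] = df[col].map(cnt_map)  # map the counts back onto the rows
    return df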
def clean_app(app):
    logger.info(f'''
#==============================================================================
# APPLICATION CLEANSING
#==============================================================================''')

    revo = 'Revolving loans'

    drop_list = [col for col in app.columns if col.count('is_train') or col.count('is_test') or col.count('valid_no')]
    app.drop(drop_list, axis=1, inplace=True)

    # Clip extreme incomes at 1,000,000
    app['AMT_INCOME_TOTAL'] = app['AMT_INCOME_TOTAL'].where(app['AMT_INCOME_TOTAL'] < 1000000, 1000000)
    app['CODE_GENDER'].replace('XNA', 'F', inplace=True)

    cat_cols = get_categorical_features(df=app, ignore_list=[])
    for col in cat_cols:
        app[col].fillna('XNA', inplace=True)

    ' Revolving-loan-only copies of the amount columns '
    amt_list = ['AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE']
    for col in amt_list:
        app[f'revo_{col}'] = app[col].where(app['NAME_CONTRACT_TYPE'] == revo, np.nan)

    utils.to_df_pickle(df=app, path='../input', fname='clean_application_train_test')
def make_num_features(df, filekey):
    mkdir_func(f'../features/{filekey}')
    # if filekey.count('bur'):
    df = interact_feature(df, filekey)

    #========================================================================
    # Slice numeric features by the values of each categorical column
    #========================================================================
    num_list = get_numeric_features(df=df, ignore_list=ignore_list)
    cat_list = get_categorical_features(df=df, ignore_list=[])

    # few_list = []
    # for cat in tqdm(cat_list):
    #     for val in tqdm(df[cat].drop_duplicates()):
    #         length = len(df[df[cat]==val])
    #         if length < len(df)*0.002:
    #             few_list.append(val)
    #             continue
    #         for num in num_list:
    #             df[f'{num}_{cat}-{val}@'] = df[num].where(df[cat]==val, np.nan)
    #     df[f'{num}_{cat}-fewlist@'] = df[num].where(df[cat].isin(few_list), np.nan)

    logger.info(f'{filekey} SET SHAPE : {df.shape}')

    #========================================================================
    # Split into train / test before saving
    #========================================================================
    train = df[~df[target].isnull()]
    test = df[df[target].isnull()]
    categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

    #========================================================================
    # Numeric Feature Save
    #========================================================================
    for col in train.columns:
        if col in categorical_features:
            continue
        result_train = train[col].values
        result_test = test[col].values
        logger.info(f"COL: {col} | LENGTH: {len(result_train)}")
        utils.to_pkl_gzip(obj=result_train, path=f'../features/{filekey}/train_{col}')
        if col != target:
            utils.to_pkl_gzip(obj=result_test, path=f'../features/{filekey}/test_{col}')
def get_feature_set(feat_path='../features/all_features/*.gz', feat_key='', is_debug=False, is_cat_encode=True):
    feat_path_list = glob.glob(feat_path)
    path_list = []
    for path in feat_path_list:
        filename = re.search(r'/([^/.]*)\.gz', path).group(1)
        if path.count(feat_key) and feat_key[:4] == filename[:4]:
            path_list.append(path)

    train, test = ml_utils.get_train_test(feat_path_list=path_list, target=target)
    print(train.shape, test.shape)

    if is_debug:
        train = train.head(10000)
        test = test.head(500)

    if is_cat_encode:
        #========================================================================
        # Categorical Encode
        cat_cols = utils.get_categorical_features(df=train, ignore_list=ignore_list)
        print(f"Categorical: {cat_cols}")

        # Fit LabelEncoder
        for col in cat_cols:
            # Impute with the most frequent category
            max_freq = train[col].value_counts().index[0]
            train[col].fillna(max_freq, inplace=True)
            test[col].fillna(max_freq, inplace=True)
            le = LabelEncoder().fit(pd.concat([train[col], test[col]], axis=0).value_counts().index.tolist())
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])
        #========================================================================

    return train, test
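# Hypothetical call site (the feat_key value below is illustrative, not from
# the repo): collect every saved .gz feature whose filename shares the same
# 4-character prefix as the key, assemble train/test, and label-encode.
train, test = get_feature_set(
    feat_path='../features/all_features/*.gz',
    feat_key='110_app',   # assumed key; any saved feature prefix works
    is_debug=True,        # head(10000)/head(500) for a quick smoke test
)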
def multi_level_agg(df, prefix):
    #=======================================================================
    # Replace combinations of categorical columns with aggregated values
    #=======================================================================
    method_list = ['mean']
    num_list = ['EXT_SOURCE_2']
    cat_list = get_categorical_features(df=df, ignore_list=ignore_list)
    cat_combi = combinations(cat_list, 2)
    # amt_list = [col for col in num_list if col.count('AMT_')]
    # days_list = [col for col in num_list if col.count('DAYS_')]

    # Serial processing
    for com in cat_combi:
        for num in num_list:
            for method in method_list:
                base = df[[key, target] + list(com)].drop_duplicates()
                tmp = df[list(com) + [num]]
                tmp_result = base_aggregation(df=tmp, level=list(com), method=method, prefix=prefix, feature=num)
                result = base.merge(tmp_result, on=list(com), how='left')

                for col in result.columns:
                    if not col.count('@') or col in ignore_list:
                        continue
                    train_feat = result[result[target] >= 0][col].values
                    test_feat = result[result[target].isnull()][col].values
                    col = col.replace('[', '_').replace(']', '_').replace(' ', '').replace(',', '_')
                    train_file_path = f"../features/1_first_valid/train_{col}"
                    test_file_path = f"../features/1_first_valid/test_{col}"
                    utils.to_pkl_gzip(obj=train_feat, path=train_file_path)
                    utils.to_pkl_gzip(obj=test_feat, path=test_file_path)
                    logger.info(f'''
#========================================================================
# COMPLETE MAKE FEATURE : {train_file_path}
#========================================================================''')
                del result, tmp_result
                gc.collect()
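# NOTE: base_aggregation lives elsewhere in this repo. A minimal sketch of a
# compatible implementation, assuming it groups df by `level`, aggregates
# `feature` with `method`, and tags the result column with the '@' save
# marker (the list-repr in the name explains the '[', ']', ',' replacements
# in the save loop above):
def base_aggregation(df, level, method, prefix, feature):
    agg = df.groupby(level)[feature].agg(method).reset_index()
    agg.rename(columns={feature: f'{prefix}{method}_{feature}_by_{level}@'}, inplace=True)
    return agg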
def get_dataset(is_debug=False, is_cat_encode=True, feat_path='../features/4_winner/*.gz', base=[]):
    feat_path_list = glob.glob(feat_path)
    # feat_path_list += glob.glob('../features/5_tmp/*.gz')
    train, test = ml_utils.get_train_test(feat_path_list=feat_path_list, target=target, base=base)
    print(train.shape, test.shape)

    # if is_debug:
    #     train = train.head(10000)
    #     test = test.head(500)

    if is_cat_encode:
        #========================================================================
        # Categorical Encode
        cat_cols = utils.get_categorical_features(df=train, ignore_list=ignore_list)
        print(f"Categorical: {cat_cols}")

        # Fit LabelEncoder
        for col in cat_cols:
            # Impute with the most frequent category
            max_freq = train[col].value_counts().index[0]
            train[col].fillna(max_freq, inplace=True)
            test[col].fillna(max_freq, inplace=True)
            le = LabelEncoder().fit(pd.concat([train[col], test[col]], axis=0).value_counts().index.tolist())
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])
        #========================================================================

    print(train.shape, test.shape)
    return train, test
def single_level_agg(df, prefix):
    #=======================================================================
    # Replace a single categorical column with aggregated values
    #=======================================================================
    method_list = ['mean', 'var']
    num_list = ['EXT_SOURCE_2']
    cat_list = get_categorical_features(df=df, ignore_list=ignore_list)
    # amt_list = [col for col in num_list if col.count('AMT_')]
    # days_list = [col for col in num_list if col.count('DAYS_')]

    # Serial processing
    for cat in cat_list:
        if len(df[cat].unique()) <= 3:
            continue
        for num in num_list:
            for method in method_list:
                base = df[[key, cat, target]].drop_duplicates()
                tmp = df[[cat, num]]
                tmp_result = base_aggregation(df=tmp, level=cat, method=method, prefix=prefix, feature=num)
                result = base.merge(tmp_result, on=cat, how='left')

                for col in result.columns:
                    if not col.count('@') or col in ignore_list:
                        continue
                    train_file_path = f"../features/1_first_valid/train_{col}"
                    test_file_path = f"../features/1_first_valid/test_{col}"
                    utils.to_pkl_gzip(obj=result[result[target] >= 0][col].values, path=train_file_path)
                    utils.to_pkl_gzip(obj=result[result[target].isnull()][col].values, path=test_file_path)
                    logger.info(f'''
#========================================================================
# COMPLETE MAKE FEATURE : {train_file_path}
#========================================================================''')
                del result, tmp_result
                gc.collect()
def clean_app(app):
    logger.info(f'''
#==============================================================================
# APPLICATION
#==============================================================================''')

    app['CODE_GENDER'].replace('XNA', 'F', inplace=True)

    cat_cols = get_categorical_features(df=app, ignore_list=[])
    for col in cat_cols:
        app[col].fillna('XNA', inplace=True)

    ' Revolving-loan-only copies of the amount columns '
    # revo = 'Revolving loans'
    # amt_list = ['AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE']
    # for col in amt_list:
    #     app[f'revo_{col}'] = app[col].where(app['NAME_CONTRACT_TYPE']==revo, np.nan)
    #     app[col] = app[col].where(app['NAME_CONTRACT_TYPE']!=revo, np.nan)

    utils.to_df_pkl(df=app, path='../input', fname='clean_application_train_test')
comment = sys.argv[1]

if sys.argv[2].count('f'):
    train, test = MS_utils.get_feature_set(feat_key=sys.argv[2], base_path=base_path)
else:
    train, test = MS_utils.get_dataset(base=base)
print(train.shape, test.shape)

if is_debug:
    train = train.head(10000)
    test = test.head(5000)

#========================================================================
# Categorical Encode
cat_cols = utils.get_categorical_features(df=train, ignore_list=ignore_list)
print(f"Categorical: {cat_cols}")

# Fit LabelEncoder
for col in cat_cols:
    # Impute with the most frequent category
    max_freq = train[col].value_counts().index[0]
    train[col].fillna(max_freq, inplace=True)
    test[col].fillna(max_freq, inplace=True)
    le = LabelEncoder().fit(pd.concat([train[col], test[col]], axis=0).value_counts().index.tolist())
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
#========================================================================
def Regressor(model_type, x_train, x_val, y_train, y_val, x_test, params={}, seed=1208, get_score='rmse', get_model=False, early_stopping_rounds=100, num_boost_round=10000):
    if str(type(x_train)).count('DataFrame'):
        use_cols = x_train.columns
    else:
        use_cols = np.arange(x_train.shape[1]) + 1

    if model_type == 'linear':
        estimator = LinearRegression(**params)
    elif model_type == 'ridge':
        estimator = Ridge(**params)
    elif model_type == 'lasso':
        estimator = Lasso(**params)
    elif model_type == 'rmf':
        params['n_jobs'] = -1
        params['n_estimators'] = 10000
        estimator = RandomForestRegressor(**params)
    elif model_type == 'lgb':
        if len(params.keys()) == 0:
            params['n_jobs'] = 32
            params['num_leaves'] = 31
            params['colsample_bytree'] = 0.3
            params['lambda_l2'] = 1.0
            params['learning_rate'] = 0.01
            params['objective'] = 'regression'
            params['metric'] = 'mse'

    #========================================================================
    # Fitting
    if model_type != 'lgb':
        estimator.fit(x_train, y_train)
    else:
        lgb_train = lgb.Dataset(data=x_train, label=y_train)
        lgb_val = lgb.Dataset(data=x_val, label=y_val)
        cat_cols = utils.get_categorical_features(df=x_train)
        estimator = lgb.train(
            params=params,
            train_set=lgb_train,
            valid_sets=lgb_val,
            early_stopping_rounds=early_stopping_rounds,
            num_boost_round=num_boost_round,
            categorical_feature=cat_cols,
            verbose_eval=200
        )
    #========================================================================

    #========================================================================
    # Prediction
    oof_pred = estimator.predict(x_val)
    if len(x_test):
        test_pred = estimator.predict(x_test)
    else:
        test_pred = []
    #========================================================================

    #========================================================================
    # Scoring
    if get_score == 'auc':
        score = roc_auc_score(y_val, oof_pred)
    else:
        score = np.sqrt(mean_squared_error(y_val, oof_pred))
    r2 = r2_score(y_val, oof_pred)
    print(f"""
# Model   : {model_type}
# feature : {x_train.shape, x_val.shape}
# R2 Score: {r2}
""")
    #========================================================================

    if model_type == 'lgb':
        feim = get_tree_importance(estimator=estimator, use_cols=x_train.columns)
        feim.sort_values(by='importance', ascending=False, inplace=True)
    elif model_type == 'lasso' or model_type == 'ridge':
        feim = pd.Series(estimator.coef_, index=use_cols, name='coef')
        feim.sort_values(ascending=False, inplace=True)

    if get_model:
        return score, oof_pred, test_pred, feim, estimator
    else:
        return score, oof_pred, test_pred, feim, 0
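# Hypothetical usage, assuming x_train/x_val/y_train/y_val and x_test have
# already been split out (e.g. from a fold over the train set built by
# get_dataset above); none of these variable names come from the repo itself.
score, oof_pred, test_pred, feim, booster = Regressor(
    model_type='lgb',
    x_train=x_train, x_val=x_val,
    y_train=y_train, y_val=y_val,
    x_test=x_test,
    get_score='rmse',   # 'auc' would score with roc_auc_score instead
    get_model=True,     # also return the fitted booster
)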
def data_check(logger, df, target, test=False, dummie=0, exclude_category=False, ignore_list=[]):
    '''
    Explain:
        Check the data for problems before training.
        If categorical columns are present, encode or drop them.
    Args:
    Return:
    '''
    logger.info(f'''
#==============================================================================
# DATA CHECK START
#==============================================================================''')

    categorical_list = get_categorical_features(df, ignore_list=ignore_list)
    dt_list = get_datetime_features(df, ignore_list=ignore_list)
    logger.info(f'''
#==============================================================================
# CATEGORICAL FEATURE: {categorical_list}
# LENGTH: {len(categorical_list)}
# DUMMIE: {dummie}
#==============================================================================
''')

    #========================================================================
    # Columns that should be treated as numeric are sometimes stored as object
    #========================================================================
    # for cat in categorical_list:
    #     try:
    #         df[cat] = df[cat].astype('int64')
    #         categorical_list.remove(cat)
    #     except ValueError:
    #         pass

    #========================================================================
    # Drop datetime columns
    #========================================================================
    for dt in dt_list:
        df.drop(dt, axis=1, inplace=True)

    ' If a column has more than 100 unique values, label-encode it '
    label_list = []
    for cat in categorical_list[:]:  # iterate over a copy so remove() is safe
        if len(df[cat].drop_duplicates()) > 100:
            label_list.append(cat)
            categorical_list.remove(cat)
    df = factorize_categoricals(df, label_list)

    if exclude_category:
        for cat in categorical_list:
            df.drop(cat, axis=1, inplace=True)
            move_feature(feature_name=cat)
        categorical_list = []
    elif dummie == 1:
        df = get_dummies(df, categorical_list)
        categorical_list = []
    elif dummie == 0:
        df = factorize_categoricals(df, categorical_list)
        categorical_list = []
    logger.info(f'df SHAPE: {df.shape}')

    ' Drop columns whose unique count in the test set is 1 '
    drop_list = []
    if test:
        for col in df.columns:
            length = df[col].nunique()
            if length <= 1 and col not in ignore_list:
                logger.info(f'''
***********WARNING*************
LENGTH {length} COLUMN: {col}''')
                move_feature(feature_name=col)
                if col != target:
                    drop_list.append(col)

    logger.info(f'''
#==============================================================================
# DATA CHECK END
#==============================================================================''')
    return df, drop_list
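# NOTE: factorize_categoricals is this repo's helper; its body lives
# elsewhere. A minimal sketch of a compatible version, assuming it maps each
# listed column to integer codes in place:
def factorize_categoricals(df, cat_list):
    for col in cat_list:
        df[col], _ = pd.factorize(df[col])  # NaN becomes -1
    return df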
def main():
    '''
    BASE AGGREGATION
    Basic aggregation of single columns at the granularity given by level
    '''
    if agg_code == 'base':
        #=======================================================================
        # Build the list of columns to aggregate
        #=======================================================================
        num_list = get_numeric_features(df=df, ignore=ignore_list)

        #=======================================================================
        # Start aggregating
        #=======================================================================
        arg_list = []
        for num in num_list:
            for method in method_list:
                arg_list.append((df, key, num, method, prefix, '', base))

        ' Encode per value of each categorical column in the dataset '
        call_list = pararell_process(pararell_wrapper(base_aggregation), arg_list)
        result = pd.concat(call_list, axis=1)

        for col in result.columns:
            if not col.count('@') or col in ignore_list:
                continue
            print(col)
            # utils.to_pickle(path=f"{dir}/{col}.fp", obj=result[col].values)
        sys.exit()

        # for num in num_list:
        #     for method in method_list:
        #         tmp_result = base_aggregation(df=df, level=key, method=method, prefix=prefix, feature=num, drop=True)
        #         result = base.merge(tmp_result, on=key, how='left')
        #         for col in result.columns:
        #             if not(col.count('@')) or col in ignore_list:
        #                 continue
        #             utils.to_pickle(path=f"{dir}/{col}.fp", obj=result[col].values)
        # make_npy(result=result, ignore_list=ignore_features, logger=logger)

    elif agg_code == 'caliculate':
        '''
        CALCULATION
        Create new features by arithmetic on pairs of columns
        '''
        f1_list = []
        f2_list = []
        used_list = []
        for f1 in f1_list:
            for f2 in f2_list:
                ' Do not compute the same pair of features twice '
                if f1 == f2:
                    continue
                if sorted([f1, f2]) in used_list:
                    continue
                used_list.append(sorted([f1, f2]))

                if diff:
                    df = diff_feature(df=df, first=f1, second=f2)
                elif div:
                    df = division_feature(df=df, first=f1, second=f2)
                elif pro:
                    df = product_feature(df=df, first=f1, second=f2)

        for col in df.columns:
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)

    elif agg_code == 'cnt':
        '''
        COUNT ENCODING
        Aggregate at level granularity and count cnt_val with duplicates
        '''
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        for category_col in cat_list:
            df = cnt_encoding(df, category_col, ignore_list)
        df = base.merge(df, on=key, how='inner')
        cnt_cols = [col for col in df.columns if col.count('cntec')]
        for col in cnt_cols:
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)

    elif agg_code == 'category':
        arg_list = []
        ' Categorical columns '
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        num_list = get_numeric_features(df=df, ignore=ignore_list)
        for cat in cat_list:
            for value in num_list:
                for method in method_list:
                    arg_list.append((base, df, key, cat, value, method, ignore_list, prefix))

        ' Encode per value of each categorical column in the dataset '
        pararell_process(pararell_wrapper(select_category_value_agg), arg_list)
        # select_category_value_agg(base, df=df, key=key, category_col=cat, value=value, method=method, ignore_list=ignore_list, prefix=prefix)

    elif agg_code == 'combi':
        combi_num = [1, 2, 3][0]
        cat_combi = list(combinations(categorical, combi_num))

    elif agg_code == 'dummie':
        ' One-hot encode the categorical columns of the dataset and take their mean '
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        df = get_dummies(df=df, cat_list=cat_list)
# ===========================================================================
# DATA LOAD
# ===========================================================================
base = utils.read_df_pkl(path='../input/base_app*')
fname = 'app'
prefix = f'{feat_no}{fname}_'
df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')
train = df[~df[target].isnull()]
test = df[df[target].isnull()]

neighbor = '110_app_neighbor81@'
train[neighbor] = utils.read_pkl_gzip('../input/[email protected]')
test[neighbor] = utils.read_pkl_gzip('../input/[email protected]')
# combi = [neighbor, cat]

cat_list = get_categorical_features(df=df, ignore_list=ignore_list)

#========================================================================
# TARGET ENCODING
#========================================================================
for cat in cat_list:
    combi = cat
    feat_train, feat_test = target_encoding(
        logger=logger, train=train, test=test, key=key, level=combi,
        target=target, fold_type='stratified', group_col_name='', prefix='')
# ===========================================================================
# Global Variable
# ===========================================================================
from info_home_credit import hcdr_key_cols
key, target, ignore_list = hcdr_key_cols()

#========================================================================
app = utils.read_df_pkl(path='../input/clean_app*.p')[[key, target]]

filekey = 'bureau'
filepath = f'../input/clean_{filekey}*.p'
df = utils.read_df_pkl(path=filepath)
df = df.merge(app, on=key, how='inner')

train = df[~df[target].isnull()]
test = df[df[target].isnull()]
categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

mkdir_func(f'../features/{filekey}')

#========================================================================
# Numeric Feature Save
#========================================================================
for col in train.columns:
    if col in categorical_features:
        continue
    utils.to_pkl_gzip(obj=train[col].values, path=f'../features/{filekey}/train_{col}')
    if col != target:
        utils.to_pkl_gzip(obj=test[col].values, path=f'../features/{filekey}/test_{col}')

#========================================================================
# Categorical Feature Encode
def impute_regression(base, level, dataset, value, prefix=''):
    '''
    Explain:
    Args:
        base : DF used to merge the features at the end and align the index
        level: granularity at which base is merged
    Return:
    '''
    logger.info(f'\nimpute feature: {value}')

    ' If the target contains negatives, shift so the minimum of the distribution is 0 (for the log transform) '
    values = dataset[value].values
    min_val = values[~np.isnan(values)].min()
    if min_val < 0:
        dataset[value] = dataset[value].values + min_val * -1

    dataset[target] = dataset[target].map(lambda x: None if x == -1 else x)

    ' Rows where the target column is null are Test; rows with a value are Train '
    ' Not sure how to use "is None", so detect nulls this way '
    dataset['is_train'] = dataset[value].map(lambda x: 1 if np.abs(x) >= 0 else 0)

    ' If there are categorical variables, turn them into integers for now '
    categorical = get_categorical_features(dataset, [])
    dataset = factorize_categoricals(dataset, categorical)

    train = dataset.query('is_train==1')
    test = dataset.query('is_train==0')

    ' Nothing to impute if the column has no null rows (or no non-null rows) '
    if len(train) == 0 or len(test) == 0:
        return

    # train.drop(['is_train', 'is_test'], axis=1, inplace=True)
    # test.drop(['is_train', 'is_test'], axis=1, inplace=True)
    train.drop(['is_train'], axis=1, inplace=True)
    test.drop(['is_train'], axis=1, inplace=True)
    train[target] = train[target].fillna(-1)

    ' When the target is all -1 '
    if len(train[target].drop_duplicates()) == 1:
        train[f'bin10_{value}'] = pd.qcut(x=train[value], q=2, duplicates='drop')
        train = factorize_categoricals(train, [f'bin10_{value}'])
        validation = set_validation(train, target=f'bin10_{value}', unique_id=unique_id, val_col=val_col)
        train.drop(f'bin10_{value}', axis=1, inplace=True)
    else:
        validation = set_validation(train, target, unique_id=unique_id, val_col=val_col)
    train = train.merge(validation, on=unique_id, how='left')
    train[val_col] = train[val_col].fillna(-1)

    ' Remove features from the dataset that could be noise for the feature being imputed '
    for col in train.columns:
        # if col==target or (col.count('impute') and not(col.count('EXT'))):
        if col == target:
            logger.info(f'extract feature: {col}')
            train.drop(col, axis=1, inplace=True)

    # x, y = train_test_split(train, test_size=0.2)
    # x['valid_no'] = 0
    # y['valid_no'] = 1
    # valid_no = 1
    # train = pd.concat([x, y], axis=0)

    logger.info(f'train shape: {train.shape}')

    ' The return value is the prediction (array) for test '
    impute_value, cv_score = cross_prediction(
        logger=logger,
        train=train,
        test=test,
        target=value,
        # categorical_feature=categorical,
        val_col=val_col)

    if cv_score < 0.25:
        return 0

    ' If the target contained negatives, undo the shift applied for the log transform '
    if min_val < 0:
        train[value] = train[value].values + min_val
        impute_value = impute_value + min_val

    ' Restore unique_id as a column so the index ordering lines up when joining to the dataset '
    train.reset_index(inplace=True)
    test.reset_index(inplace=True)
    train = train[level + [value]]
    test = test[level + [value]]
    test[value] = impute_value
    result = pd.concat([train, test], axis=0)
    result = base.merge(result, on=level, how='left')
    print(result.shape)
    print(result.head())
    print(result.tail())
    # result.set_index(unique_id, inplace=True)
    # print(result.loc[check_id, :])
    # print(result.query('is_test==1').head(10))
    # sys.exit()

    np.save(f'../features/1_first_valid/{prefix}{value}_impute', result[value].values)
    return cv_score
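# NOTE: set_validation lives elsewhere in this repo. A minimal sketch of a
# compatible version, assuming it assigns a stratified fold number per
# unique_id and returns a (unique_id, val_col) frame to merge back:
from sklearn.model_selection import StratifiedKFold

def set_validation(df, target, unique_id, val_col, n_splits=5, seed=1208):
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    validation = df[[unique_id]].copy()
    validation[val_col] = -1
    for fold_no, (_, val_idx) in enumerate(folds.split(df, df[target])):
        validation.iloc[val_idx, validation.columns.get_loc(val_col)] = fold_no
    return validation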
def main():
    path = f'../input/{sys.argv[1]}*'
    df = utils.read_df_pickle(path=path)
    prefix = sys.argv[2]

    '''
    BASE AGGREGATION
    Basic aggregation of single columns at the granularity given by level
    '''
    if agg_code == 'base':
        one_base_agg(df=df, prefix=prefix)

    elif agg_code == 'caliculate':
        df = two_calicurate(df=df)
        if prefix != 'app_':
            one_base_agg(df=df, prefix=prefix)
        else:
            for col in df.columns:
                utils.to_pickle(path=f"{dir}/{prefix}{col}.fp", obj=df[col].values)

    elif agg_code == 'cnt':
        '''
        COUNT ENCODING
        Aggregate at level granularity and count cnt_val with duplicates
        '''
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        for category_col in cat_list:
            df = cnt_encoding(df, category_col, ignore_list)
        df = base.merge(df, on=key, how='inner')
        cnt_cols = [col for col in df.columns if col.count('cntec')]
        for col in cnt_cols:
            if exclude_feature(col, df[col].values):
                continue
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)

    elif agg_code == 'category':
        ' Aggregate per value within each categorical column '
        arg_list = []
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        num_list = get_numeric_features(df=df, ignore=ignore_list)
        for cat in cat_list:
            for value in num_list:
                for method in method_list:
                    select_category_value_agg(base, df=df, key=key, category_col=cat, value=value, method=method, ignore_list=ignore_list, prefix=prefix)
                    # arg_list.append((base, df, key, cat, value, method, ignore_list, prefix))
        # pararell_process(select_cat_wrapper, arg_list)

    elif agg_code == 'combi':
        combi_num = [1, 2, 3][0]
        cat_combi = list(combinations(categorical, combi_num))

    elif agg_code == 'dummie':
        ' One-hot encode the categorical columns of the dataset and take their mean '
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        df = get_dummies(df=df, cat_list=cat_list)
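# NOTE: select_category_value_agg is defined elsewhere in this repo. A
# minimal sketch of a compatible version, inferred from the call above: for
# each value of category_col, aggregate `value` with `method` over the rows
# holding that value, keyed on `key`, and save a '@'-tagged feature. The
# column-naming scheme and save path mirror the surrounding code but are
# assumptions.
def select_category_value_agg(base, df, key, category_col, value, method,
                              ignore_list=[], prefix=''):
    for val in df[category_col].dropna().unique():
        subset = df[df[category_col] == val]
        agg = subset.groupby(key)[value].agg(method).reset_index()
        col = f'{prefix}{method}_{value}_{category_col}-{val}@'
        agg.rename(columns={value: col}, inplace=True)
        result = base.merge(agg, on=key, how='left')
        utils.to_pickle(path=f"{dir}/{col}.fp", obj=result[col].values)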