def two_calicurate(df):
    """Derive new features by pairwise arithmetic on numeric columns.

    For every unordered pair of numeric columns, optionally create
    difference / division / product features (controlled by the
    module-level flags `diff`, `div`, `pro`), then keep only the derived
    columns plus `key`.

    Args:
        df: input DataFrame.
    Returns:
        DataFrame restricted to `key` plus the derived feature columns
        (names containing '_div_', '_diff_' or '_pro_').

    NOTE(review): depends on module-level `key`, `ignore_list` and the
    flags `diff`/`div`/`pro`, plus helpers `get_numeric_features`,
    `diff_feature`, `division_feature`, `product_feature` — confirm they
    are defined at import time.
    """
    f1_list = get_numeric_features(df=df, ignore=ignore_list)
    f2_list = get_numeric_features(df=df, ignore=ignore_list)
    # BUG FIX: was `used_lsit = []` (typo) while `used_list` was referenced
    # below, which raised NameError on the first pair.
    used_list = []
    for f1 in f1_list:
        for f2 in f2_list:
            # Don't compute the same (order-insensitive) pair twice.
            if f1 == f2:
                continue
            if sorted([f1, f2]) in used_list:
                continue
            used_list.append(sorted([f1, f2]))
            # For home-credit: skip pairs where exactly one side is a
            # 'revo' column.
            if (not (f1.count('revo')) and f2.count('revo')) or (f1.count('revo') and not (f2.count('revo'))):
                continue
            if diff:
                df = diff_feature(df=df, first=f1, second=f2)
            if div:
                df = division_feature(df=df, first=f1, second=f2)
            if pro:
                df = product_feature(df=df, first=f1, second=f2)

    # Keep only the derived columns, identified by their name markers.
    use_cols = []
    for col in df.columns:
        if col.count('_div_') or col.count('_diff_') or col.count('_pro_'):
            use_cols.append(col)
    df = df[[key] + use_cols]
    return df
def make_num_features(df, filekey): mkdir_func(f'../features/{filekey}') # if filekey.count('bur'): df = interact_feature(df, filekey) #======================================================================== # カテゴリの内容別にNumeric Featureを切り出す #======================================================================== num_list = get_numeric_features(df=df, ignore_list=ignore_list) cat_list = get_categorical_features(df=df, ignore_list=[]) # few_list = [] # for cat in tqdm(cat_list): # for val in tqdm(df[cat].drop_duplicates()): # length = len(df[df[cat]==val]) # if length < len(df)*0.002: # few_list.append(val) # continue # for num in num_list: # # pararell_process(, num_list) # df[f'{num}_{cat}-{val}@'] = df[num].where(df[cat]==val, np.nan) # df[f'{num}_{cat}-fewlist@'] = df[num].where(df[cat].isin(few_list), np.nan) logger.info(f'{fname} SET SHAPE : {df.shape}') #======================================================================== # Feature Save & Categorical Encoding & Feature Save #======================================================================== train = df[~df[target].isnull()] test = df[df[target].isnull()] categorical_features = get_categorical_features(df=train, ignore_list=ignore_list) #======================================================================== # Numeric Feature Save #======================================================================== for col in train.columns: if col in categorical_features:continue result_train = train[col].values result_test = test[col].values logger.info(f"COL: {col} | LENGTH: {len(result_train)}") utils.to_pkl_gzip(obj=train[col].values, path=f'../features/{filekey}/train_{col}') if col != target: utils.to_pkl_gzip(obj=test[col].values, path=f'../features/{filekey}/test_{col}')
def one_base_agg(df, prefix):
    """Aggregate every numeric column of `df` at the `key` level with each
    method in `method_list`, merge onto `base`, and dump each resulting
    '@'-marked feature column as a gzip pickle.

    NOTE(review): depends on module-level `key`, `target`, `base`,
    `method_list`, `ignore_list`, `dir`, `utils`, `gc` and on helpers
    `get_numeric_features`, `base_aggregation`, `exclude_feature` —
    confirm they exist. `dir` presumably shadows the builtin with an
    output directory path — verify.
    """
    # =======================================================================
    # Build the list of columns to aggregate
    # =======================================================================
    num_list = get_numeric_features(df=df, ignore=ignore_list)

    # Parallel version — may not run if the DF is too heavy
    # arg_list = []
    # for num in num_list:
    #     for method in method_list:
    #         tmp = df[[key, num]]
    #         arg_list.append([tmp, key, num, method, prefix, '', base])
    # ' データセットにおけるカテゴリカラムのvalue毎にエンコーディングする '
    # call_list = pararell_process(base_agg_wrapper, arg_list)
    # result = pd.concat(call_list, axis=1)
    # for col in result.columns:
    #     if not(col.count('@')) or col in ignore_list:
    #         continue
    #     # utils.to_pickle(path=f"{dir}/{col}.fp", obj=result[col].values)
    # sys.exit()

    # Serial version
    for num in num_list:
        for method in method_list:
            # Aggregate one (key, numeric) pair at a time to limit memory.
            tmp = df[[key, num]]
            tmp_result = base_aggregation(df=tmp, level=key, method=method, prefix=prefix, feature=num)
            result = base.merge(tmp_result, on=key, how='left')
            # Rows whose target is null (the test split).
            renu = result[result[target].isnull()]
            for col in result.columns:
                # Only '@'-marked feature columns; skip ignored ones.
                if not (col.count('@')) or col in ignore_list:
                    continue
                # Drop features failing the exclusion check on either split.
                if exclude_feature(col, result[col].values):
                    continue
                if exclude_feature(col, renu[col].values):
                    continue
                file_path = f"{dir}/{col}.fp"
                # utils.to_pickle(path=file_path, obj=result[col].values)
                utils.to_pkl_gzip(obj=result[col].values, path=file_path)
            # Free the per-iteration frames before the next aggregation.
            del result, renu, tmp_result
            gc.collect()
def num_cat_encoding(df, bins=0, isfill=False, origin_drop=True):
    """Bin numeric columns into quantile buckets (in place).

    Each eligible numeric column gains a `bin{bins}_{col}` column built
    with pandas qcut; the source column is dropped when `origin_drop`.

    Args:
        df: DataFrame to modify in place.
        bins: number of quantile buckets per column.
        isfill: when True, replace +/-inf with NaN and fill NaN with the
            column median before binning.
        origin_drop: drop the original column after binning.

    NOTE(review): relies on module-level `ignore_features`, `logger` and
    the helper `get_numeric_features` — confirm they are defined.
    """
    candidate_cols = get_numeric_features(df=df, ignore=ignore_features)
    logger.info(df.shape)

    for feature in candidate_cols:
        if isfill:
            # Neutralize infinities, then impute missing values with the median.
            df[feature] = df[feature].replace(np.inf, np.nan)
            df[feature] = df[feature].replace(-1 * np.inf, np.nan)
            df[feature] = df[feature].fillna(df[feature].median())

        # Too few distinct values to support the requested bin count — skip.
        if len(df[feature].drop_duplicates()) < bins:
            continue

        df[f'bin{bins}_{feature}'] = pd.qcut(x=df[feature], q=bins, duplicates='drop')

        if origin_drop:
            df.drop(feature, axis=1, inplace=True)
def main():
    """Dispatch one aggregation strategy selected by the module-level
    `agg_code` ('base' | 'caliculate' | 'cnt' | 'category' | 'combi' |
    'dummie').

    NOTE(review): reads and rebinds many module-level names (`df`, `key`,
    `base`, `prefix`, `method_list`, `ignore_list`, `dir`, `diff`, `div`,
    `pro`, `categorical`, `data`, `ignore_features`) — confirm they are
    defined before main() runs.
    """
    # BUG FIX: several branches rebind `df`, which made `df` function-local
    # throughout the body and raised UnboundLocalError on its first read of
    # the module-level DataFrame.
    global df

    if agg_code == 'base':
        # BASE AGGREGATION: aggregate single columns at the `key` granularity.
        # =======================================================================
        # Build the list of columns to aggregate
        # =======================================================================
        num_list = get_numeric_features(df=df, ignore=ignore_list)
        # =======================================================================
        # Run the aggregations
        # =======================================================================
        arg_list = []  # BUG FIX: was never initialized before the appends.
        for num in num_list:
            for method in method_list:
                # BUG FIX: list.append takes exactly one argument; the
                # original passed seven and raised TypeError.
                arg_list.append([df, key, num, method, prefix, '', base])
        # Encode per value of each categorical column in the dataset.
        call_list = pararell_process(pararell_wrapper(base_aggregation), arg_list)
        result = pd.concat(call_list, axis=1)
        for col in result.columns:
            if not (col.count('@')) or col in ignore_list:
                continue
            print(col)
            # utils.to_pickle(path=f"{dir}/{col}.fp", obj=result[col].values)
        sys.exit()
    elif agg_code == 'caliculate':
        # CALCULATION: derive new features by pairwise arithmetic.
        # NOTE(review): f1_list / f2_list are empty here, so the loop is a
        # no-op as written — presumably placeholders; verify.
        f1_list = []
        f2_list = []
        # BUG FIX: was `used_lsit` (typo) while `used_list` was used below.
        used_list = []
        for f1 in f1_list:
            for f2 in f2_list:
                # Skip identical and already-seen (order-insensitive) pairs.
                if f1 == f2:
                    continue
                if sorted([f1, f2]) in used_list:
                    continue
                used_list.append(sorted([f1, f2]))
                if diff:
                    df = diff_feature(df=df, first=f1, second=f2)
                elif div:
                    df = division_feature(df=df, first=f1, second=f2)
                elif pro:
                    df = product_feature(df=df, first=f1, second=f2)
        for col in df.columns:
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)
    elif agg_code == 'cnt':
        # COUNT ENCODING: count cnt_val (with duplicates) at the level
        # granularity, keeping only rows present in `base`.
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        for category_col in cat_list:
            df = cnt_encoding(df, category_col, ignore_list)
        df = base.merge(df, on=key, how='inner')
        # BUG FIX: the original read `... df.columns inf col.count(...)` —
        # `inf` was a typo for `if` (SyntaxError).
        cnt_cols = [col for col in df.columns if col.count('cntec')]
        for col in cnt_cols:
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)
    elif agg_code == 'category':
        # Aggregate numeric columns per value of each categorical column.
        arg_list = []
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        num_list = get_numeric_features(df=df, ignore=ignore_list)
        for cat in cat_list:
            for value in num_list:
                for method in method_list:
                    # BUG FIX: list.append takes exactly one argument.
                    arg_list.append([base, df, key, cat, value, method, ignore_list, prefix])
        # Encode per value of each categorical column in the dataset.
        pararell_process(pararell_wrapper(select_category_value_agg), arg_list)
    elif agg_code == 'combi':
        combi_num = [1, 2, 3][0]
        cat_combi = list(combinations(categorical, combi_num))
    elif agg_code == 'dummie':
        # One-hot encode the dataset's categorical columns and take the mean.
        # NOTE(review): `data` is not defined in this function — presumably
        # it should be `df` or a module global; verify.
        cat_list = get_categorical_features(data, ignore_features)
        df = get_dummies(df=df, cat_list=cat_list)
f1 = feat_combi[0] f2 = feat_combi[1] feat1 = diff_feature(df=df, first=f1, second=f2) feat2 = division_feature(df=df, first=f1, second=f2) feat3 = product_feature(df=df, first=f1, second=f2) feat = pd.concat([feat1, feat2, feat3], axis=1) return feat if arithmetic: used_list = [] ''' CALICULATION 複数カラムを四則演算し新たな特徴を作成する ''' num_list = get_numeric_features(df=df, ignore_list=ignore_list) amt_list = [col for col in num_list if col.count('AMT_')] days_list = [ col for col in num_list if col.count('DAYS_') or col.count('OWN') ] num_list = amt_list + days_list ext_list = [col for col in df.columns if col.count('EXT_')] f1_list = num_list f2_list = num_list # f1_list = ext_list # f2_list = ext_list # f2_list = ['EXT_SOURCE_2'] used_lsit = [] result_feat = pd.DataFrame()
def main():
    """CLI entry point: load a pickled DataFrame and run the aggregation
    strategy selected by the module-level `agg_code`.

    Usage: script.py <input-file-stem> <feature-prefix>

    NOTE(review): depends on module-level `agg_code`, `key`, `base`,
    `dir`, `method_list`, `ignore_list`, `categorical`, `data`,
    `ignore_features`, `utils` and several feature helpers — confirm they
    are defined before main() runs.
    """
    # sys.argv[1]: input pickle glob stem; sys.argv[2]: feature name prefix.
    path = f'../input/{sys.argv[1]}*'
    df = utils.read_df_pickle(path=path)
    prefix = sys.argv[2]
    '''
    BASE AGGRIGATION
    単一カラムをlevelで粒度指定して基礎集計
    '''
    # Base aggregation: aggregate single columns at the `key` granularity.
    if agg_code == 'base':
        one_base_agg(df=df, prefix=prefix)
    elif agg_code == 'caliculate':
        # Arithmetic features; then a base-agg pass except for 'app_',
        # whose derived columns are dumped directly instead.
        df = two_calicurate(df=df)
        if prefix != 'app_':
            one_base_agg(df=df, prefix=prefix)
        else:
            for col in df.columns:
                utils.to_pickle(path=f"{dir}/{prefix}{col}.fp", obj=df[col].values)
    elif agg_code == 'cnt':
        '''
        COUNT ENCODING
        level粒度で集計し、cnt_valを重複有りでカウント
        '''
        # Count-encode every categorical column, then keep rows present in
        # `base` and dump the surviving 'cntec' columns.
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        for category_col in cat_list:
            df = cnt_encoding(df, category_col, ignore_list)
        df = base.merge(df, on=key, how='inner')
        cnt_cols = [col for col in df.columns if col.count('cntec')]
        for col in cnt_cols:
            # Skip features that fail the exclusion check.
            if exclude_feature(col, df[col].values):
                continue
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)
    elif agg_code == 'category':
        ' カテゴリカラムの中のvalue毎に集計する '
        # Aggregate numeric columns per value of each categorical column
        # (serial; the parallel variant is commented out below).
        arg_list = []
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        num_list = get_numeric_features(df=df, ignore=ignore_list)
        for cat in cat_list:
            for value in num_list:
                for method in method_list:
                    select_category_value_agg(base, df=df, key=key, category_col=cat, value=value, method=method, ignore_list=ignore_list, prefix=prefix)
                    # arg_list.append(base, df, key, cat, value, method, ignore_list, prefix)
        # pararell_process(select_cat_wrapper, arg_list)
    elif agg_code == 'combi':
        combi_num = [1, 2, 3][0]
        cat_combi = list(combinations(categorical, combi_num))
    elif agg_code == 'dummie':
        ' データセットのカテゴリカラムをOneHotエンコーディングし、その平均をとる '
        # One-hot encode the dataset's categorical columns.
        # NOTE(review): `data` is not defined in this function — presumably
        # it should be `df` or a module-level global; verify.
        cat_list = get_categorical_features(data, ignore_features)
        df = get_dummies(df=df, cat_list=cat_list)