def bq_parallel(query, prefix):
    bq_client = bigquery.Client.from_service_account_json(key_file)
    df = bq_client.query(query).to_dataframe()
    print(f"Result Shape: {df.shape}")
    # prefix = 'f008_big_ins-cur-l3-'
    # prefix = 'f008_big_ins-pre-f3-'
    # prefix = 'f008_big_ins-pre-l3-'
    base = utils.read_df_pkl('../input/base0*').set_index(key)
    df.set_index(key, inplace=True)
    df = base.join(df)
    utils.save_feature(df_feat=df, ignore_list=ignore_list, is_train=2,
                       prefix=prefix, target=target)
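# A minimal usage sketch (the SQL and table name below are hypothetical; the
# prefix is one of the commented candidates above, and `query` is assumed to
# return one row per `key`):
# bq_parallel(query="SELECT * FROM `project.dataset.f008_ins_agg`",
#             prefix='f008_big_ins-cur-l3-')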
def clean_prev(pre):
    logger.info(f'''
#==============================================================================
# PREV CLEANSING
#==============================================================================''')
    cash = 'Cash loans'
    revo = 'Revolving loans'
    pre = utils.read_df_pkl(path='../input/previous*.p')

    # Non-positive amounts are placeholders: replace them with NaN
    amt_cols = ['AMT_CREDIT', 'AMT_ANNUITY', 'AMT_APPLICATION', 'CNT_PAYMENT',
                'AMT_DOWN_PAYMENT', 'RATE_DOWN_PAYMENT']
    for col in amt_cols:
        pre[col] = pre[col].where(pre[col] > 0, np.nan)

    # The DAYS_* columns use 365243 as a "no date" sentinel: replace with NaN
    days_cols = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                 'DAYS_LAST_DUE', 'DAYS_TERMINATION']
    for col in days_cols:
        pre[col] = pre[col].where(pre[col] < 100000, np.nan)
    # pre['SELLERPLACE_AREA'] = pre['SELLERPLACE_AREA'].where(pre['SELLERPLACE_AREA'] < 200, 200)

    ignore_list = ['SK_ID_CURR', 'SK_ID_PREV', 'NAME_CONTRACT_TYPE', 'NAME_CONTRACT_STATUS']

    # revo: for Revolving loans, set CNT_PAYMENT and the AMT_* columns to NULL
    # for col in pre.columns:
    #     if col in ignore_list:
    #         logger.info(f'CONTINUE: {col}')
    #         continue
    #     pre[f'revo_{col}'] = pre[col].where(pre['NAME_CONTRACT_TYPE'] == revo, np.nan)
    #     pre[col] = pre[col].where(pre['NAME_CONTRACT_TYPE'] != revo, np.nan)

    pre['NAME_TYPE_SUITE'].fillna('XNA', inplace=True)
    pre['PRODUCT_COMBINATION'].fillna('XNA', inplace=True)
    utils.to_df_pkl(df=pre, path='../input', fname='clean_prev')
def clean_bureau(bur):
    logger.info(f'''
#==============================================================================
# BUREAU CLEANSING
#==============================================================================''')
    bur = utils.read_df_pkl(path='../input/bureau*.p')
    # Keep only the main currency
    bur = bur[bur['CREDIT_CURRENCY'] == 'currency 1']
    # Dates further back than ~100 years are placeholders: replace with NaN
    for col in ['DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'DAYS_CREDIT_UPDATE']:
        bur[col] = bur[col].where(bur[col] > -36000, np.nan)
    utils.to_df_pkl(df=bur, path='../input', fname='clean_bureau')
    params['colsample_bytree'] = 0.3
    params['min_child_samples'] = 50
else:
    params['subsample'] = 0.9
    params['colsample_bytree'] = 0.3
    params['min_child_samples'] = 30

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

#========================================================================
# Data Load
tmp_path_list = glob.glob(f'../features/5_tmp/*.gz')
base = utils.read_df_pkl('../input/base_term*0*')[[key, target, col_term]]
# Bucket the term column: <=4 -> 4 / 5 -> 5 / 6-8 -> 6 / 9-12 -> 9 / 13-15 -> 15 / 16-18 -> 18 / 19+ -> 24
base[col_term] = base[col_term].map(
    lambda x: 4 if x <= 4
    else 5 if x <= 5
    else 6 if 6 <= x <= 8
    else 9 if 9 <= x <= 12
    else 15 if 13 <= x <= 15
    else 18 if 16 <= x <= 18
    else 24 if x >= 19
    else x)
print(base[col_term].value_counts())
sys.exit()

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)

win_path = sys.argv[1]
win_path = f'../features/4_winner/*.gz'
win_path = f'../model/LB3670_70leaves_colsam0322/*.gz'
# win_path_list = glob.glob(win_path) + tmp_path_list
win_path_list = glob.glob(win_path)
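# The same bucketing could be expressed with pd.cut; a sketch, equivalent to
# the lambda above under the assumption that col_term is integer-valued:
# base[col_term] = pd.cut(base[col_term],
#                         bins=[-np.inf, 4, 5, 8, 12, 15, 18, np.inf],
#                         labels=[4, 5, 6, 9, 15, 18, 24]).astype(int)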
#     value = np.random.choice(a=value, size=val_len)
# except ValueError:
#     pass
# df_dict[uid] = " ".join(value)
# return df_dict
# tmp_dict = {}
# p_list = pararell_process(pararell_val_join, tmp.items())
# [tmp_dict.update(p) for p in p_list]
# tmp_train = pd.Series(tmp_dict).to_frame()
# tmp_train = tmp_train.join(train_df.set_index('qid'))
# tmp_train.rename(columns={0: qt}, inplace=True)
# utils.to_df_pkl(df=tmp_train, path='../input/', fname='wn_bagging_train')
# sys.exit()

tmp_train = utils.read_df_pkl(path='../input/wn_bagging_train*.p')
tmp_train = tmp_train.head(50000)

## split to train and val
train_df, val_df = train_test_split(tmp_train, test_size=0.2, random_state=seed)

## some config values
embed_size = 300      # how big is each word vector
max_features = 50000  # how many unique words to use (i.e. num rows in the embedding matrix)
maxlen = 100          # max number of words in a question to use

## fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
# test_X = test_df["question_text"].fillna("_na_").values
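# The usual next step for this kind of Quora pipeline is tokenization and
# padding; a sketch assuming the standard Keras preprocessing API (not
# confirmed by the surrounding code):
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# tokenizer = Tokenizer(num_words=max_features)
# tokenizer.fit_on_texts(list(train_X))
# train_X = pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=maxlen)
# val_X = pad_sequences(tokenizer.texts_to_sequences(val_X), maxlen=maxlen)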
argv[2]: feature_key
"""
# Basic Args
seed = 1208
set_type = 'all'
fold_n = 4
key, raw_target, ignore_list = MS_utils.get_basic_var()
ignore_list = [key, raw_target]
comment = sys.argv[1]

# Base
vi_col = 'f000_AvSigVersion'
base_path = '../input/base_exclude*'
base_path = '../input/base_Av*'
# base_path = '../input/base_group*'
base = utils.read_df_pkl(base_path)

#========================================================================
# Make Validation
# shuffle=False keeps the AvSigVersion ordering, so random_state is unused
cv = KFold(n_splits=fold_n, shuffle=False)
# train rows are the ones with a known target
if is_debug:
    base_train = base[~base[raw_target].isnull()].head(10000).sort_values(by=vi_col)
else:
    base_train = base[~base[raw_target].isnull()].sort_values(by=vi_col)
kfold = list(cv.split(base_train[[key, 'country_group']], base_train[vi_col]))
del base_train
gc.collect()
base = base[[key, raw_target]]
#========================================================================
import utils
from utils import logger_func, get_categorical_features, get_numeric_features, pararell_process, mkdir_func
from preprocessing import get_dummies
from feature_engineering import base_aggregation, diff_feature, division_feature, product_feature, cnt_encoding, select_category_value_agg, exclude_feature, target_encoding
logger = logger_func()

pd.set_option('max_columns', 200)
pd.set_option('max_rows', 200)

#========================================================================
# Global Variable
from info_home_credit import hcdr_key_cols
key, target, ignore_list = hcdr_key_cols()
#========================================================================

app = utils.read_df_pkl(path='../input/clean_app*.p')[[key, target]]
filekey = 'bureau'
filepath = f'../input/clean_{filekey}*.p'
df = utils.read_df_pkl(path=filepath)
df = df.merge(app, on=key, how='inner')
train = df[~df[target].isnull()]
test = df[df[target].isnull()]
categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)
mkdir_func(f'../features/{filekey}')

#========================================================================
# Numeric Feature Save
def raddar_top_freq_merchant_agg(df, agg_term, new_max, new_min, old_max, old_min):
    #========================================================================
    if debug:
        df = auth1.head(100000)
        # new_max, new_min, old_max, old_min = 0, -1, -2, -3
        # agg_term = -20

    # Args Setting
    level = [key, 'merchant_id', 'month_lag']
    # Restrict the month_lag window over which merchant usage frequency is aggregated
    new_month_lag_max = new_max
    new_month_lag_min = new_min
    old_month_lag_max = old_max
    old_month_lag_min = old_min
    #========================================================================

    #========================================================================
    # Aggregation
    print("Aggregation Start!!")
    aggs = {}
    aggs['purchase_amount_new'] = ['sum']
    aggs['installments'] = ['mean', 'max', 'sum']
    df_agg = df.groupby(level).agg(aggs)
    new_cols = get_new_columns(name='', aggs=aggs)
    df_agg.columns = new_cols
    df_agg['purchase_amount_new_sum_per_installments_sum'] = (
        df_agg['purchase_amount_new_sum'] / df_agg['installments_sum'])

    mer_cnt = df.groupby([key, 'merchant_id'])['month_lag'].nunique(
    ).reset_index().rename(columns={'month_lag': 'month_lag_cnt'})
    df_agg = df_agg.reset_index().merge(mer_cnt, how='inner', on=[key, 'merchant_id'])
    #========================================================================

    #========================================================================
    # Slice by month_lag window and aggregate
    print("Aggregate Term Setting")
    df_merchant = df_agg[df_agg['month_lag_cnt'] > 1]
    del df_agg
    gc.collect()
    df_merchant.drop(['month_lag_cnt'], axis=1, inplace=True)

    # Restrict the windows
    new_term = new_month_lag_max - new_month_lag_min
    old_term = old_month_lag_max - old_month_lag_min
    new = df_merchant[(df_merchant['month_lag'] <= new_month_lag_max)
                      & (df_merchant['month_lag'] >= new_month_lag_min)]
    old = df_merchant[(df_merchant['month_lag'] <= old_month_lag_max)
                      & (df_merchant['month_lag'] >= old_month_lag_min)]

    feat_cols = [col for col in df_merchant.columns
                 if col.count('amount') or col.count('install')]
    aggs = {}
    for col in feat_cols:
        if col.count('install') and not col.count('per'):
            aggs[col] = ['mean']
        else:
            aggs[col] = ['sum']

    # When a window spans multiple month_lags, aggregate them
    if new_term > 0:
        new = new.groupby([key, 'merchant_id'])[feat_cols].agg(aggs)
        new.columns = get_new_columns(name='', aggs=aggs)
    else:
        new.set_index([key, 'merchant_id'], inplace=True)
    if old_term > 0:
        old = old.groupby([key, 'merchant_id'])[feat_cols].agg(aggs)
        old.columns = get_new_columns(name='', aggs=aggs)
    else:
        old.set_index([key, 'merchant_id'], inplace=True)
    #========================================================================

    new.reset_index(inplace=True)
    old.reset_index(inplace=True)

    #========================================================================
    # Carry card_id / merchant_id pairs that exist in old but not in new over to new
    print("Get Lost Merchant and Card ID")
    new['flg'] = 1
    tmp_cols = [key, 'merchant_id']
    old_lost = old[tmp_cols].merge(new[tmp_cols + ['flg']], how='left', on=tmp_cols)
    old_lost = old_lost[old_lost['flg'].isnull()][tmp_cols]
    new = pd.concat([new, old_lost], ignore_index=True)
    new.drop('flg', axis=1, inplace=True)
    #========================================================================

    #========================================================================
    # Make Ratio Feature
    print("Make Ratio Feature")
    feat_cols = [col for col in new.columns
                 if col.count('amount') or col.count('install')]
    fname = f'flag{new_month_lag_max}_{new_month_lag_min}-plag{old_month_lag_max}_{old_month_lag_min}'
    new = new.merge(old, how='left', on=[key, 'merchant_id'])
    for col in feat_cols:
        # recent window value divided by past window value
        new[f"{fname}_{col}"] = new[col + '_x'] / new[col + '_y']
        new[f"{fname}_{col}"].fillna(0, inplace=True)
    #========================================================================

    #========================================================================
    # Top frequency ranking per card_id * merchant_id (all-term version)
    if debug:
        df = df.head(1000000)
    df_term = df[df['month_lag'] >= agg_term]
    mer_cnt = df_term.groupby([key, 'merchant_id'])['month_lag'].nunique(
    ).reset_index().rename(columns={'month_lag': 'month_lag_cnt'})
    mer_cnt.sort_values(by=[key, 'month_lag_cnt'], ascending=False, inplace=True)
    mer_cnt = utils.row_number(df=mer_cnt, level=key)
    mer_cnt.set_index([key, 'merchant_id'], inplace=True)

    df_merchant = new.set_index(tmp_cols).join(mer_cnt).reset_index()
    del new, old, mer_cnt
    gc.collect()

    use_cols = [key, 'merchant_id'] + [
        col for col in df_merchant.columns if col.count('flag')
    ] + ['month_lag_cnt', 'row_no']
    df_merchant = df_merchant[use_cols]
    #========================================================================

    #========================================================================
    # After aggregating per merchant_id:
    # 1. Aggregate those again, including a variant restricted to high-frequency merchants.
    # 2. Collect the high-frequency merchants into top1-10 columns.
    #    ------->>> Frequency is counted both over the full term and over the
    #    most recent half year, and features are built from both.
    #========================================================================
    feat_cols = [col for col in df_merchant.columns if col.count('plag')]

    #========================================================================
    # Put the top-10 frequency merchants side by side as columns
    df_merchant = df_merchant[df_merchant['row_no'] <= 10]
    df_merchant = df_merchant.set_index([key, 'row_no'])[feat_cols].unstack()
    # Rename
    fname = 'auth1_all'
    df_merchant.columns = [
        f"{fname}_top-M{col[1]}_merchant_{col[0]}" for col in df_merchant.columns
    ]
    #========================================================================

    #========================================================================
    # Save Feature
    if agg_term < -13:
        prefix = '244_rad'
    else:
        prefix = f'245_rad_min_Mlag{agg_term}'
    print(f"{prefix} Feature Saving...")
    base = utils.read_df_pkl('../input/base_no_out*')
    base = base[[key, target]].set_index(key)
    base = base.join(df_merchant)
    base.fillna(-1, inplace=True)
    del df_merchant
    gc.collect()
    # elo_save_feature(df_feat=base, prefix=prefix)
    print('Complete!')
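# get_new_columns is used above but not defined in this file; a minimal sketch
# of what it is assumed to do (flatten an agg dict into "<col>_<stat>" names,
# matching the order pandas produces for groupby().agg(aggs)):
# def get_new_columns(name, aggs):
#     return [f"{name}{col}_{stat}" if name else f"{col}_{stat}"
#             for col in aggs for stat in aggs[col]]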
pd.set_option('max_rows', 200)
from feature_engineering import base_aggregation, diff_feature, division_feature, product_feature, cnt_encoding, select_category_value_agg, exclude_feature, target_encoding
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

# ===========================================================================
# global variables
# ===========================================================================
key = 'SK_ID_CURR'
target = 'TARGET'
ignore_list = [key, target, 'SK_ID_BUREAU', 'SK_ID_PREV']

# ===========================================================================
# DATA LOAD
# ===========================================================================
base = utils.read_df_pkl(path='../input/base_app*')
fname = 'app'
prefix = feat_no + f'{fname}_'
df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')[[key, target, 'EXT_SOURCE_2']]

train_ir = utils.read_pkl_gzip('../features/4_winner/[email protected]')
test_ir = utils.read_pkl_gzip('../features/4_winner/[email protected]')
ir_mean = np.hstack((train_ir, test_ir))
df['stan_ir_mean@'] = ir_mean

num_split = 9
df['EXT_bin'] = pd.qcut(x=df['EXT_SOURCE_2'], q=num_split)
# Bin before filling NaN with the 'ir_nan' sentinel: qcut fails on mixed types
df['ir_bin'] = pd.qcut(x=df['stan_ir_mean@'], q=num_split)
df['stan_ir_mean@'].fillna('ir_nan', inplace=True)
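# A sketch of how the two bins might be combined downstream (an assumption;
# the actual aggregation step is not shown in this snippet):
# df['EXT_ir_bin@'] = df['EXT_bin'].astype(str) + '_' + df['ir_bin'].astype(str)
# print(df.groupby('EXT_ir_bin@')[target].mean().head())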
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
from preprocessing import get_ordinal_mapping
from utils import logger_func
logger = logger_func()
from dimensionality_reduction import go_bhtsne, UMAP

#========================================================================
# Data Load
win_path = f'../features/4_winner/*.gz'
# win_path_list = glob.glob(win_path)
win_path_list = glob.glob(f'../features/0_exp/*.gz')
base = utils.read_df_pkl('../input/base_term*')
feature_list = utils.parallel_load_data(path_list=win_path_list)
df_feat = pd.concat(feature_list, axis=1)
#========================================================================

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
prefix = f"224_emb_{start_time[4:12]}"
feat_list = [col for col in df_feat.columns]
emb_list = []
feat_num = 10
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index']
for i in range(10):
    np.random.seed(i)
    tmp_list = np.random.choice(feat_list, feat_num)
if num_leaves > 40:
    params['subsample'] = 0.8757099996397999
    # params['colsample_bytree'] = 0.7401342964627846
    params['colsample_bytree'] = 0.3
    params['min_child_samples'] = 50
else:
    params['subsample'] = 0.9
    params['colsample_bytree'] = 0.3
    params['min_child_samples'] = 30

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

#========================================================================
# Data Load
win_path = '../features/4_winner/*.gz'
base = utils.read_df_pkl('../input/base_first*0*')
win_path_list = glob.glob(win_path)
# tmp_path_list holds features still under validation
tmp_path_list = glob.glob('../features/5_tmp/*.gz')
win_path_list += tmp_path_list

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
feature_list = utils.parallel_load_data(path_list=win_path_list)
df_feat = pd.concat(feature_list, axis=1)
train = pd.concat([base_train, df_feat.iloc[:len(base_train), :]], axis=1)
# test = pd.concat([base_test, df_feat.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)
test = []
#========================================================================
# LGBM Setting
def quara_load_data():
    # read pickle
    train = utils.read_df_pkl(path='../input/train*.p')
    test = utils.read_df_pkl(path='../input/test*.p')
    return train, test
def main():
    #========================================================================
    # Data Load
    #========================================================================
    win_path_list = glob.glob(win_path)
    train_path_list = []
    test_path_list = []
    for path in win_path_list:
        if path.count('train'):
            train_path_list.append(path)
        elif path.count('test'):
            test_path_list.append(path)

    # train_feature_list = utils.pararell_load_data(path_list=train_path_list, delimiter='gz')
    # test_feature_list = utils.pararell_load_data(path_list=test_path_list, delimiter='gz')
    # train = pd.concat(train_feature_list, axis=1)
    # test = pd.concat(test_feature_list, axis=1)
    df = utils.read_df_pkl('../input/appli*')
    train = df[df[target] >= 0]
    test = df[df[target] == -1]

    metric = 'auc'
    fold = 5
    fold_type = 'stratified'
    group_col_name = ''
    dummie = 1
    oof_flg = True
    LGBM = lgb_ex(logger=logger, metric=metric, model_type=model_type, ignore_list=ignore_list)

    train, _ = LGBM.data_check(df=train)
    test, drop_list = LGBM.data_check(df=test, test_flg=True)
    if len(drop_list):
        train.drop(drop_list, axis=1, inplace=True)
        test.drop(drop_list, axis=1, inplace=True)

    #========================================================================
    # Train & Prediction Start
    #========================================================================
    LGBM = LGBM.cross_prediction(train=train,
                                 test=test,
                                 key=key,
                                 target=target,
                                 fold_type=fold_type,
                                 fold=fold,
                                 group_col_name=group_col_name,
                                 params=params,
                                 num_boost_round=num_boost_round,
                                 early_stopping_rounds=early_stopping_rounds,
                                 oof_flg=oof_flg)

    #========================================================================
    # Result
    #========================================================================
    cv_score = LGBM.cv_score
    result = LGBM.prediction
    cv_feim = LGBM.cv_feim
    feature_num = len(LGBM.use_cols)
    cv_feim.to_csv(
        f'../valid/{start_time[4:12]}_{model_type}_{fname}_feat{feature_num}_CV{cv_score}_lr{learning_rate}.csv')

    #========================================================================
    # Compute and save X-RAY
    # Args:
    #     model    : trained model
    #     train    : dataset the model was trained on
    #     col_list : columns to compute X-RAY for. If omitted, every column in
    #                the dataset is used; given the computation time, at most
    #                ~30 columns are recommended.
    #========================================================================
    xray = False
    if xray:
        train.reset_index(inplace=True)
        train = train[LGBM.use_cols]
        result_xray = pd.DataFrame()
        N_sample = 150000
        max_point = 30
        for fold_num in range(fold):
            model = LGBM.fold_model_list[fold_num]
            if fold_num == 0:
                xray_obj = Xray_Cal(logger=logger, ignore_list=ignore_list, model=model)
            xray_obj, tmp_xray = xray_obj.get_xray(base_xray=train,
                                                   col_list=train.columns,
                                                   fold_num=fold_num,
                                                   N_sample=N_sample,
                                                   max_point=max_point)
            tmp_xray.rename(columns={'xray': f'xray_{fold_num}'}, inplace=True)

            if len(result_xray):
                # merge returns a new frame, so keep the result
                result_xray = result_xray.merge(tmp_xray.drop('N', axis=1),
                                                on=['feature', 'value'],
                                                how='inner')
            else:
                result_xray = tmp_xray.copy()
            del tmp_xray
            gc.collect()

        xray_col = [col for col in result_xray.columns if col.count('xray')]
        result_xray['xray_avg'] = result_xray[xray_col].mean(axis=1)
        result_xray.to_csv(
            f'../output/{start_time[4:10]}_xray_{model_type}_CV{LGBM.cv_score}.csv')
        sys.exit()

    submit = pd.read_csv('../input/sample_submission.csv')
    # submit = []

    #========================================================================
    # STACKING
    #========================================================================
    if len(stack_name) > 0:
        logger.info(f'result_stack shape: {LGBM.result_stack.shape}')
        utils.to_pkl(
            path=f"../stack/{start_time[4:12]}_{stack_name}_{model_type}_CV{str(cv_score).replace('.', '-')}_{feature_num}features.fp",
            obj=LGBM.result_stack)
    logger.info(
        f'FEATURE IMPORTANCE PATH: {HOME}/kaggle/home-credit-default-risk/output/cv_feature{feature_num}_importances_auc_{cv_score}.csv')

    #========================================================================
    # Submission
    #========================================================================
    if len(submit) > 0:
        if stack_name == 'add_nest':
            test[target] = result
            test = test.reset_index()[[key, target]].groupby(key)[target].mean().reset_index()
            submit = submit[key].to_frame().merge(test, on=key, how='left')
            submit[target].fillna(0, inplace=True)
            submit.to_csv(
                f'../submit/{start_time[4:12]}_submit_{fname}_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
                index=False)
        else:
            submit[target] = result
            submit.to_csv(
                f'../submit/{start_time[4:12]}_submit_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
                index=False)
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

# ===========================================================================
# global variables
# ===========================================================================
key = 'SK_ID_CURR'
target = 'TARGET'
ignore_list = [key, target, 'SK_ID_BUREAU', 'SK_ID_PREV']

# ===========================================================================
# DATA LOAD
# ===========================================================================
fname = 'app'
prefix = feat_no + f'{fname}_'
df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')

# Null Count
df['Null_Cnt@'] = df.isnull().sum(axis=1)

# Document
doc_list = [col for col in df.columns if col.count('DOCUMENT')]
df['DOCUMENT_SUM@'] = df[doc_list].sum(axis=1)

# Document 3, 5, 6, 8, 11, 18
doc_list2 = [
    col for col in doc_list
    if col.count('_3') or col.count('_5') or col.count('_6')
    or col.count('_8') or col.count('_11') or col.count('_18')
]
df['DOCUMENT_SUM2@'] = df[doc_list2].sum(axis=1)

# cat_list = get_categorical_features(df=df, ignore_list=ignore_list)

# Rejected
# df['NEW_REGION_POPULATION_RELATIVE@'] = (df['REGION_POPULATION_RELATIVE']*10000).astype('int')
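# The substring test above is fragile against new column numbers; a stricter
# variant matching the numeric suffix exactly (a sketch, equivalent for the
# standard FLAG_DOCUMENT_* columns):
# import re
# wanted = {'3', '5', '6', '8', '11', '18'}
# doc_list2 = [col for col in doc_list
#              if (m := re.search(r'_(\d+)$', col)) and m.group(1) in wanted]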
prev_key = 'SK_ID_PREV'
target = 'CNT_PAYMENT'
ignore_list.remove('TARGET')
ignore_list.append(target)

acr = 'AMT_CREDIT'
aan = 'AMT_ANNUITY'
adp = 'AMT_DOWN_PAYMENT'
cpy = 'CNT_PAYMENT'
co_type = 'NAME_CONTRACT_TYPE'
dd = 'DAYS_DECISION'

#========================================================================
# Data Load
#========================================================================
app = utils.read_df_pkl('../input/clean_app*')
amt_list = [col for col in app.columns if col.count('AMT_')]
prev = utils.read_df_pkl('../input/clean_prev*')[[key, dd, acr, aan, cpy]]
df = prev.merge(app.drop(amt_list, axis=1), on=key, how='inner')

# Shift the DAYS_* columns so they are relative to the decision date
days_list = [col for col in df.columns
             if col.count('DAYS') and not col.count('DECISION')]
for col in days_list:
    df[col] = df[col].values - df[dd].values
df.drop([dd, 'TARGET'], axis=1, inplace=True)

# Rows with a known CNT_PAYMENT become the training set; the current
# applications (which lack CNT_PAYMENT) are what we predict for
train = df[~df[cpy].isnull()]
test = app
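# A minimal sketch of the intended regression (an assumption: a plain LightGBM
# regressor on the shared columns; the project's actual trainer wrapper is not
# shown in this snippet, and 'CNT_PAYMENT_pred@' is a hypothetical name):
# import lightgbm as lgb
# use_cols = [c for c in train.columns if c not in ignore_list and c in test.columns]
# reg = lgb.LGBMRegressor(n_estimators=2000, learning_rate=0.05)
# reg.fit(train[use_cols], train[cpy])
# test['CNT_PAYMENT_pred@'] = reg.predict(test[use_cols])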
select_type = 'rank'
select_num = 50000
#========================================================================

#========================================================================
# Global Variable
COMPETITION_NAME = 'home-credit-default-risk'
sys.path.append(f"../py")
from info_home_credit import hcdr_key_cols
key, target, ignore_list = hcdr_key_cols()
#========================================================================

#========================================================================
# Data Load
feim_path = glob.glob('../valid/use_feim/*.csv')[0]
base = utils.read_df_pkl('../input/base0*')[[key, target]].set_index(key)
manage = FeatureManage(key, target)
manage.set_base(base)
if select_type == 'rank':
    train, test = manage.feature_matrix(feim_path=feim_path, rank=select_num)
elif select_type == 'gain':
    train, test = manage.feature_matrix(feim_path=feim_path, gain=select_num)

if is_debug:
    train = train.head(10000)
    test = test.head(500)
Y = train[target]
print(train.shape, test.shape)
#========================================================================
# Basic Args
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import datetime
import sys
import re
import gc
import glob
import os
HOME = os.path.expanduser('~')
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
from utils import logger_func, get_categorical_features, get_numeric_features, pararell_process
logger = logger_func()
pd.set_option('max_columns', 200)
pd.set_option('max_rows', 200)

train = utils.read_df_pkl('../input/train0*.p')
print(train)
sys.exit()
    model = Model(inputs=inp, outputs=predictions)
    # note: the optimizer is SGD here, despite the variable's original 'adam' name
    opt = optimizers.SGD(lr=learning_rate)
    if is_multi:
        model = multi_gpu_model(model, gpus=gpu_count)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    return model
#========================================================================

#========================================================================
# Load Feature Matrix
base = utils.read_df_pkl('../input/base_Av*')
tx_train = utils.read_df_pkl('../input/0306_MS_NLP_train*').values
x_test = utils.read_df_pkl('../input/0306_MS_NLP_test*').values
#========================================================================

#========================================================================
# Make Validation
seed = 1208
fold_n = 4
vi_col = 'f000_AvSigVersion'
base_train = base[~base[target].isnull()]
base_test = base[base[target].isnull()]
# grab y before sorting
y = base_train[target].values
base_train.sort_values(vi_col, inplace=True)
base = base[[key, target]]
from sklearn.metrics import mean_squared_error, roc_auc_score

#========================================================================
# Keras
# Corporación Favorita Grocery Sales Forecasting
sys.path.append(f'{HOME}/kaggle/data_analysis/model')
from nn_keras import MS_NN
from keras import callbacks
from keras import optimizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
#========================================================================

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

base = utils.read_df_pkl('../input/base_group*')[[key, target, 'country_group']]
# feat_path_list = glob.glob('../features/4_winner/*.gz')
train, test = MS_utils.get_dataset(base=base,
                                   feat_path='../features/4_winner/*.gz',
                                   is_debug=False)
del base
gc.collect()

if is_debug:
    train = train.head(10000)
    test = test.head(500)
#========================================================================
# Categorical Encode
#========================================================================
# Data Load
win_path = f'../features/4_winner/*.gz'
# win_path = f'../features/1_first_valid/*.gz'
# win_path = f'../model/LB3670_70leaves_colsam0322/*.gz'
# win_path = f'../model/LB3679_48leaves_colsam03/*.gz'
# win_path = f'../model/LB3684_48leaves_colsam03/*.gz'
tmp_path_list = glob.glob(f'../features/5_tmp/*.gz') + glob.glob(f'../features/0_exp/*.gz')
# tmp_path_list = glob.glob(f'../features/5_tmp/*.gz')
win_path_list = glob.glob(win_path) + tmp_path_list
# win_path_list = glob.glob(win_path)

base = utils.read_df_pkl('../input/base_term*0*')[[key, target, col_term, 'first_active_month']]
fam_base = utils.read_df_pkl('../input/fam_165_176*0*')[[key, target]]
base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
feature_list = utils.parallel_load_data(path_list=win_path_list)
df_feat = pd.concat(feature_list, axis=1)
train = pd.concat([base_train, df_feat.iloc[:len(base_train), :]], axis=1)
test = pd.concat(
    [base_test, df_feat.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import get_categorical_features, get_numeric_features, reduce_mem_usage, elo_save_feature
from preprocessing import get_dummies
import datetime
from tqdm import tqdm
import time
import sys
from joblib import Parallel, delayed

key = 'card_id'
target = 'target'
ignore_list = [key, target, 'merchant_id', 'first_active_month']

df_train = utils.read_df_pkl('../input/train0*')
df_train.set_index(key, inplace=True)
df_test = utils.read_df_pkl('../input/test0*')
df_test.set_index(key, inplace=True)
df_hist = pd.read_csv('../input/historical_transactions.csv')
# Undo the anonymizing transform on purchase_amount (raddar's denormalization)
df_hist['purchase_amount_new'] = np.round(
    df_hist['purchase_amount'] / 0.00150265118 + 497.06, 2)
# Clip installments into [1, 100]: treat <1 and >100 as a single payment
df_hist['installments'] = df_hist['installments'].map(
    lambda x: 1 if x < 1 else 1 if x > 100 else x)

#========================================================================
# Dataset Load
use_cols = [
# Ensemble 1
set1 = f'../model/E1_set/*.gz'
# Ensemble 2
set2 = f'../model/E2_set/*.gz'
# Ensemble 3
set3 = f'../model/E3_set/*.gz'
# Ensemble 4
set4 = f'../model/E4_set/*.gz'
set_list = [set1, set2, set3, set4]
win_path = set_list[int(sys.argv[2])]
win_path_list = glob.glob(win_path)

base = utils.read_df_pkl('../input/base_term*0*')[[key, target, 'first_active_month']]
base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
feature_list = utils.parallel_load_data(path_list=win_path_list)
df = pd.concat(feature_list, axis=1)
train = pd.concat([base_train, df.iloc[:len(base_train), :]], axis=1)
test = pd.concat(
    [base_test, df.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

# Drop the extreme outlier targets (loyalty scores around -33)
if out_part == 'no_out':
    train = train[train[target] > -30]
#========================================================================
    utils.to_df_pkl(df=ccb, path='../input', fname='clean_ccb')


if __name__ == "__main__":
    # with utils.timer("To Pickle"):
    #     to_pkl()
    with utils.timer("Cleansing"):
        # app = utils.read_df_pkl(path='../input/application_train_test*.p')
        # clean_app(app)
        # del app
        # gc.collect()

        bur = utils.read_df_pkl(path='../input/bureau*.p')
        clean_bureau(bur)
        del bur
        gc.collect()

        # pre = utils.read_df_pkl(path='../input/prev*.p')
        # clean_prev(pre)
        # del pre
        # gc.collect()

        # pos = utils.read_df_pkl(path='../input/POS*.p')
        # clean_pos(pos)
        # del pos
        # gc.collect()

        # ins = utils.read_df_pkl(path='../input/install*.p')
        # clean_ins(ins)
        # del ins
pd.set_option('max_rows', 200)
from feature_engineering import base_aggregation, diff_feature, division_feature, product_feature, cnt_encoding, select_category_value_agg, exclude_feature, target_encoding
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

# ===========================================================================
# global variables
# ===========================================================================
key = 'SK_ID_CURR'
target = 'TARGET'
ignore_list = [key, target, 'SK_ID_BUREAU', 'SK_ID_PREV']

# ===========================================================================
# DATA LOAD
# ===========================================================================
base = utils.read_df_pkl(path='../input/base_app*')
fname = 'app'
prefix = f'{feat_no}{fname}_'
df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')
train = df[~df[target].isnull()]
test = df[df[target].isnull()]

neighbor = '110_app_neighbor81@'
train[neighbor] = utils.read_pkl_gzip('../input/[email protected]')
test[neighbor] = utils.read_pkl_gzip('../input/[email protected]')
# `cat` is expected to hold a categorical column name set before this point
combi = [neighbor, cat]
cat_list = get_categorical_features(df=df, ignore_list=ignore_list)

#========================================================================
# TARGET ENCODING
# Global Variable
from info_home_credit import hcdr_key_cols
key, target, ignore_list = hcdr_key_cols()
#========================================================================

prev_key = 'SK_ID_PREV'
acr = 'AMT_CREDIT'
aan = 'AMT_ANNUITY'
adp = 'AMT_DOWN_PAYMENT'
cpy = 'CNT_PAYMENT'
co_type = 'NAME_CONTRACT_TYPE'
dd = 'DAYS_DECISION'

# Predicted CNT_PAYMENT for the current application
df = utils.read_df_pkl('../input/clean_cpy*').reset_index()[[key, 'AMT_CREDIT', 'AMT_ANNUITY', target]]

#========================================================================
# Compute the interest rate for the current application
#========================================================================

#========================================================================
# Dima's Interest Rate
#========================================================================
# For an assumed term of n payments: ir = (AMT_ANNUITY * n - AMT_CREDIT) / AMT_CREDIT,
# kept only when it falls in the plausible range [0, 0.5]
df['3'] = df['AMT_ANNUITY'] * 3 - df['AMT_CREDIT']
df['3'] = df['3'] / df['AMT_CREDIT']
df['dima_ir_3@'] = df['3'].apply(lambda x: x if 0 <= x <= 0.5 else np.nan)
df['6'] = df['AMT_ANNUITY'] * 6 - df['AMT_CREDIT']
df['6'] = df['6'] / df['AMT_CREDIT']
df['dima_ir_6@'] = df['6'].apply(lambda x: x if 0 <= x <= 0.5 else np.nan)
df['9'] = df['AMT_ANNUITY'] * 9 - df['AMT_CREDIT']
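# Worked example (illustrative numbers, not from the data): with
# AMT_CREDIT = 300000, AMT_ANNUITY = 110000 and n = 3 payments, the borrower
# repays 330000 in total, so ir = (330000 - 300000) / 300000 = 0.10, which is
# inside the [0, 0.5] window and would be kept.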
#========================================================================
prev_key = 'SK_ID_PREV'
acr = 'AMT_CREDIT'
aan = 'AMT_ANNUITY'
adp = 'AMT_DOWN_PAYMENT'
cpy = 'CNT_PAYMENT'
co_type = 'NAME_CONTRACT_TYPE'
dd = 'DAYS_DECISION'

#========================================================================
# Compute the interest rate for the previous applications
#========================================================================
prev_ir = False
if prev_ir:
    app = utils.read_df_pkl('../input/clean_app*')[[key, target]]
    df = utils.read_df_pkl('../input/clean_prev*')
    df = df[[key, prev_key, dd, acr, aan, cpy, adp, co_type]].merge(app, on=key, how='inner')
    df = df[~df[cpy].isnull()]

    for cnt in range(3, 64, 3):
        if cnt <= 60:
            # Hypothesize a term of `cnt` payments and keep only plausible rates
            ir = ((df[aan].values * cnt) / df[acr].values) - 1.0
            df[f'ir_{cnt}@'] = ir
            df[f'ir_{cnt}@'] = df[f'ir_{cnt}@'].map(
                lambda x: x if (0.08 < x) and (x < 0.5) else np.nan)
            print(f"{cnt} :", len(df[f'ir_{cnt}@'].dropna()))
            # Drop the column when almost no row has a plausible rate at this term
            if len(df[f'ir_{cnt}@'].dropna()) < len(df) * 0.001:
                df.drop(f'ir_{cnt}@', axis=1, inplace=True)
                continue
        else:
            # Final pass: use the actual CNT_PAYMENT instead of a hypothesized term
            ir = ((df[aan].values * df[cpy].values) / df[acr].values) - 1.0
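# Worked example (illustrative numbers): with AMT_ANNUITY = 10000,
# AMT_CREDIT = 100000 and a hypothesized term of cnt = 12, the total repaid is
# 120000, so ir = 120000 / 100000 - 1.0 = 0.20; since 0.08 < 0.20 < 0.5 it
# survives the plausibility filter, while cnt = 9 gives ir = -0.10 and is dropped.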
                                 verbose=verbose,
                                 callbacks=callbacks,
                                 validation_data=te_gen,
                                 nb_val_samples=nb_val_samples,
                                 max_q_size=10)

    def predict(self, X, batch_size=128):
        y_preds = predict_batch(self.model, X, batch_size=batch_size)
        return y_preds

logger.info("Keras Setup Complete!!")

#========================================================================
# Data Load
base = utils.read_df_pkl('../input/base*')
win_path_list = glob.glob(win_path)
train_path_list = []
test_path_list = []
for path in win_path_list:
    if path.count('train'):
        train_path_list.append(path)
    elif path.count('test'):
        test_path_list.append(path)
# train_path_list = sorted(train_path_list)[:20]
# test_path_list = sorted(test_path_list)[:20]

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
train_feature_list = utils.parallel_load_data(path_list=train_path_list)
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam, SGD
from keras.models import Model
from keras.engine.topology import Layer
from keras.engine import InputSpec
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras import backend as K

base = utils.read_df_pkl(path='../input/base_Av*')
if is_make:
    #========================================================================
    # Dataset Load
    with utils.timer('Download Train and Test Data.\n'):
        train, test = MS_utils.get_dataset(base=base,
                                           feat_path='../features/4_winner/*.gz',
                                           is_cat_encode=False)
    nlp_cols = [
        'Engine',
        'OSVersion',
        'AppVersion',
        'AvSigVersion',
        'SkuEdition',
        'SmartScreen',
        'Census_OSArchitecture',
    logger = logger_func()
except NameError:
    logger = logger_func()
#========================================================================
"""
argv[1]: comment
argv[2]: feature_key
argv[3]: group
"""
# Columns
base_path = '../input/base_exclude*'
base_path = '../input/base_Av*'
key, target, ignore_list = MS_utils.get_basic_var()
ignore_list = [key, target, 'country_group', 'down_flg']
base = utils.read_df_pkl(base_path)[[key, target, 'country_group']]

# Basic Args
seed = 1208
set_type = 'all'
comment = sys.argv[1]

if sys.argv[2].count('f'):
    train, test = MS_utils.get_feature_set(feat_key=sys.argv[2], base_path=base_path)
else:
    train, test = MS_utils.get_dataset(base=base)
print(train.shape, test.shape)

if is_debug:
def make_fam_dist(base_fam, multi, is_drop=False):
    # ========================================================================
    # Args
    key = 'card_id'
    target = 'target'
    is_viz = False
    base_year = int(base_fam[:4])
    base_month = int(base_fam[-2:])
    max_fam = '2017-12'
    min_fam = '2011-11'
    result_id = []
    # ========================================================================

    # ========================================================================
    # Data Load
    base = utils.read_df_pkl('../input/base_first*')
    base[target] = base[target].map(lambda x: np.round(x, 1))
    # ========================================================================

    df_list = []
    val_cnt = base[target].value_counts()
    val_cnt.name = 'all'
    df_list.append(val_cnt.copy())

    base_1712 = base[base['first_active_month'] == base_fam]
    val_cnt = base_1712[target].value_counts()

    # Probably obsolete
    is_max = False
    if is_max:
        val_cnt_max = val_cnt.max()
        val_cnt /= val_cnt_max
    val_cnt.name = base_fam
    df_list.append(val_cnt)
    df = pd.concat(df_list, axis=1)

    def arange_ratio(df, multi, is_viz=False):
        df[base_fam] *= multi
        df['diff'] = df['all'] - df[base_fam]
        diff_len = len(df[df['diff'] < 0])
        if is_viz:
            display(df[df['diff'] < 0])
        if diff_len > limit_diff_num:
            return -1
        return 0

    df = pd.concat(df_list, axis=1)

    # Probably obsolete
    target_max = np.max(df.dropna().index.tolist())
    target_min = np.min(df.dropna().index.tolist())
    cnt_0_fam = df.loc[0.0, :][base_fam]
    cnt_0_all = df.loc[0.0, :]['all']
    # multi = int(cnt_0_all / cnt_0_fam) + 1
    # while True:
    #     tmp = df.copy().dropna()
    #     is_minus = arange_ratio(tmp, multi)
    #     if is_minus:
    #         multi -= 1
    #         continue
    #     break

    print(f"multi: {multi}")
    df[base_fam] *= multi

    if is_drop:
        df_loy = df.dropna()
        loy_list = list(df_loy.index)
    else:
        loy_list = list(np.arange(target_min, target_max, 0.1))

    # ========================================================================
    # Sampling
    # ========================================================================
    before = 0
    for i in loy_list:
        loy = np.round(i, 1)
        df_id = base[base[target] == loy]
        if len(df_id) == 0:
            continue
        sample = df.loc[loy, base_fam]
        if sample == sample:  # not NaN
            before = sample
        else:
            sample = before
        sample = int(sample)
        remain = sample
        sampling_id = []
        if remain == 0:
            continue
        if is_viz:
            print('''
#========================================================================
# Sampling Start!!
''')
        for i in range(100):
            is_add = True
            # Walk forward from the base month
            if i == 0:
                yyyymm = base_fam
                tmp_id = df_id[df_id['first_active_month'] == yyyymm]
            else:
                year = base_year
                month = base_month + i
                if month > 12:
                    num_year = month // 12
                    year = year + num_year
                    month = month - 12 * num_year
                elif month < 1:
                    num_year = month // 12
                    if num_year == 0:
                        year = year - 1
                        month = month + 12
                    else:
                        num_year *= -1
                        year = year - num_year
                        month = month + 12 * num_year
                if month < 10:
                    month = f'0{month}'
                yyyymm = f"{year}-{month}"
                if yyyymm < min_fam or yyyymm > max_fam:
                    is_add = False
                else:
                    tmp_id = df_id[df_id['first_active_month'] == yyyymm]
            if i > 0 and yyyymm == base_fam:
                is_add = False

            # ========================================================================
            # Sampling
            if is_add:
                if is_viz:
                    print(f'future yyyymm: {yyyymm}')
                id_list = list(tmp_id[key].values)
                if len(id_list) <= remain:
                    sampling_id += id_list
                else:
                    sampling_id += list(np.random.choice(id_list, remain, replace=False))
                if is_viz:
                    print(f"sampling_id: {len(sampling_id)} / {sample}")
            # ========================================================================

            remain = sample - len(sampling_id)
            if remain <= 0:
                break

            is_add = True
            # Walk backward from the base month
            if i > 0:
                year = base_year
                month = base_month - i
                if month > 12:
                    num_year = month // 12
                    year = year + num_year
                    month = month - 12 * num_year
                elif month < 1:
                    num_year = month // 12
                    if num_year == 0:
                        year = year - 1
                        month = month + 12
                    else:
                        num_year *= -1
                        year = year - num_year
                        month = month + 12 * num_year
                if month < 10:
                    month = f'0{month}'
                yyyymm = f"{year}-{month}"
                if yyyymm < min_fam or yyyymm > max_fam:
                    is_add = False
                else:
                    tmp_id = df_id[df_id['first_active_month'] == yyyymm]

            # ========================================================================
            # Sampling
            if is_add:
                if is_viz:
                    print(f'past yyyymm: {yyyymm}')
                id_list = list(tmp_id[key].values)
                if len(id_list) <= remain:
                    sampling_id += id_list
                else:
                    sampling_id += list(np.random.choice(id_list, remain, replace=False))
                if is_viz:
                    print(f"sampling_id: {len(sampling_id)} / {sample}")
            # ========================================================================

            remain = sample - len(sampling_id)
            if remain <= 0:
                break

        result_id += sampling_id
        if is_viz:
            print(f"loy:{loy} | {len(sampling_id)}/{sample} | All: {len(result_id)}")
            print('''
# Sampling Complete!!
#========================================================================
''')

    print(f"All: {len(result_id)} | Unique: {len(np.unique(result_id))}")
    print(base[base[key].isin(result_id)]['first_active_month'].value_counts().head())
    print(base[base[key].isin(result_id)]['target'].value_counts().head())
    return result_id
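# A minimal usage sketch (the base month and scaling factor below are
# illustrative; both come from the caller in the original script):
# result_id = make_fam_dist(base_fam='2017-12', multi=3, is_drop=False)
# sampled = utils.read_df_pkl('../input/base_first*')
# sampled = sampled[sampled['card_id'].isin(result_id)]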