def make_raw_feature(data, prefix='', select_list=[], ignore_list=[], extension='pkl',
                     path='../features/1_first_valid', word='', logger=False):
    for tmp_col in data.columns:
        if tmp_col in ignore_list:
            continue
        if len(select_list) > 0:
            if f'{prefix}{tmp_col}' not in select_list:
                continue
        if len(word) > 0:
            if not (tmp_col.count(word)):
                continue
        new_col = tmp_col.replace('/', '_').replace(':', '_').replace(
            ' ', '_').replace('.', '_').replace('"', '')
        data.rename(columns={tmp_col: new_col}, inplace=True)
        if extension.count('npy'):
            np.save(f'{path}/{prefix}{new_col}.npy', data[new_col].values)
        elif extension.count('csv'):
            data[new_col].to_csv(f'{path}/{prefix}{new_col}.csv')
        elif extension.count('pkl'):
            utils.to_pkl_gzip(path=f'{path}/{prefix}{new_col}.fp', obj=data[new_col].values)
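# Hedged usage sketch for make_raw_feature: the DataFrame, prefix and ignore_list below
# are made-up examples. With extension='npy' only numpy is needed; the default 'pkl'
# branch relies on the project-specific utils.to_pkl_gzip.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'AMT_CREDIT': [1.0, 2.0], 'NAME TYPE': ['a', 'b']})
make_raw_feature(toy, prefix='app_', ignore_list=['NAME TYPE'], extension='npy', path='.')
# -> writes ./app_AMT_CREDIT.npy holding the column values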
def get_tfidf(text_list):
    '''
    Explain:
        Build a TF-IDF matrix from a list of raw (not yet tokenized) texts.
    Args:
        text_list(list): list of texts before splitting
    Return:
        sparse csr_matrix: sparse matrix holding the TF-IDF values
    '''
    # Get the tfidf
    logger.info("Calculate TFIDF...")
    tfidf_vec = TfidfVectorizer(
        max_features=100000,
        min_df=3,
        max_df=0.8,
        stop_words="english",
        analyzer='word',
        # analyzer='char',
        strip_accents='unicode',
        ngram_range=(1, 3),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True
    ).fit(text_list)
    df_tfidf = tfidf_vec.transform(text_list)
    utils.to_pkl_gzip(obj=df_tfidf, path='./df_tfidf')
    return df_tfidf
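# A minimal, self-contained sketch of the same TfidfVectorizer idea on a toy corpus
# (the toy texts are made up; the project-specific logger/utils are not needed here).
from sklearn.feature_extraction.text import TfidfVectorizer

toy_texts = [
    "the quick brown fox",
    "the lazy dog sleeps",
    "quick brown dogs and lazy foxes",
]
vec = TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True, smooth_idf=True)
tfidf = vec.fit_transform(toy_texts)   # scipy.sparse CSR matrix, shape (3, n_terms)
print(tfidf.shape, len(vec.get_feature_names_out()))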
def make_cat_features(df, filekey):
    mkdir_func(f'../features/{filekey}')
    train = df[~df[target].isnull()]
    test = df[df[target].isnull()]

    categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

    #========================================================================
    # Categorical Feature Encode
    #========================================================================
    # Factorize
    logger.info("Factorize Start!!")
    for col in categorical_features:
        train[f"lbl_{col}@"], indexer = pd.factorize(train[col])
        test[f"lbl_{col}@"] = indexer.get_indexer(test[col])

    # Count Encoding
    logger.info("Count Encoding Start!!")
    for col in categorical_features:
        train = cnt_encoding(train, col, ignore_list=ignore_list)
        test = cnt_encoding(test, col, ignore_list=ignore_list)

    #========================================================================
    # Categorical Feature Save
    #========================================================================
    logger.info("Saving Features...")
    for col in train.columns:
        if col.count('@'):
            result_train = train[col].values
            result_test = test[col].values
            logger.info(f"COL: {col} | LENGTH: {len(result_train)}")
            utils.to_pkl_gzip(obj=result_train, path=f'../features/{filekey}/train_{col}')
            utils.to_pkl_gzip(obj=result_test, path=f'../features/{filekey}/test_{col}')
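# Hypothetical mini-example of the label-encoding step above: pd.factorize the train
# column, then Index.get_indexer maps test values (categories unseen in train get -1).
import pandas as pd

train_s = pd.Series(['a', 'b', 'a', 'c'])
test_s = pd.Series(['b', 'd', 'a'])
codes, uniques = pd.factorize(train_s)    # codes: [0 1 0 2], uniques: Index(['a','b','c'])
test_codes = uniques.get_indexer(test_s)  # [1 -1 0]; 'd' never appeared in train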
def make_num_features(df, filekey):
    mkdir_func(f'../features/{filekey}')
    #  if filekey.count('bur'):
    df = interact_feature(df, filekey)

    #========================================================================
    # Slice out numeric features per categorical value
    #========================================================================
    num_list = get_numeric_features(df=df, ignore_list=ignore_list)
    cat_list = get_categorical_features(df=df, ignore_list=[])

    #  few_list = []
    #  for cat in tqdm(cat_list):
    #      for val in tqdm(df[cat].drop_duplicates()):
    #          length = len(df[df[cat]==val])
    #          if length < len(df)*0.002:
    #              few_list.append(val)
    #              continue
    #          for num in num_list:
    #              #  pararell_process(, num_list)
    #              df[f'{num}_{cat}-{val}@'] = df[num].where(df[cat]==val, np.nan)
    #      df[f'{num}_{cat}-fewlist@'] = df[num].where(df[cat].isin(few_list), np.nan)

    logger.info(f'{fname} SET SHAPE : {df.shape}')

    #========================================================================
    # Feature Save & Categorical Encoding & Feature Save
    #========================================================================
    train = df[~df[target].isnull()]
    test = df[df[target].isnull()]

    categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

    #========================================================================
    # Numeric Feature Save
    #========================================================================
    for col in train.columns:
        if col in categorical_features:
            continue
        result_train = train[col].values
        result_test = test[col].values
        logger.info(f"COL: {col} | LENGTH: {len(result_train)}")
        utils.to_pkl_gzip(obj=train[col].values, path=f'../features/{filekey}/train_{col}')
        if col != target:
            utils.to_pkl_gzip(obj=test[col].values, path=f'../features/{filekey}/test_{col}')
def one_base_agg(df, prefix):
    # =======================================================================
    # Prepare the list of columns to aggregate
    # =======================================================================
    num_list = get_numeric_features(df=df, ignore_list=ignore_list)

    # Parallel version -> may not fit in memory when the DF is large
    #  arg_list = []
    #  for num in num_list:
    #      for method in method_list:
    #          tmp = df[[key, num]]
    #          arg_list.append([tmp, key, num, method, prefix, '', base])
    ' Encode each value of the categorical columns in the dataset '
    #  call_list = pararell_process(base_agg_wrapper, arg_list)
    #  result = pd.concat(call_list, axis=1)
    #  for col in result.columns:
    #      if not(col.count('@')) or col in ignore_list:
    #          continue
    #      #  utils.to_pickle(path=f"{dir}/{col}.fp", obj=result[col].values)
    #  sys.exit()

    # Serial version
    for num in num_list:
        for method in method_list:
            tmp = df[[key, num]]
            tmp_result = base_aggregation(df=tmp, level=key, method=method, prefix=prefix, feature=num)
            result = base.merge(tmp_result, on=key, how='left')
            renu = result[result[target].isnull()]
            for col in result.columns:
                if not (col.count('@')) or col in ignore_list:
                    continue
                if exclude_feature(col, result[col].values):
                    continue
                if exclude_feature(col, renu[col].values):
                    continue

                file_path = f"{dir}/{col}.fp"
                #  utils.to_pickle(path=file_path, obj=result[col].values)
                utils.to_pkl_gzip(obj=result[col].values, path=file_path)
            del result, renu, tmp_result
            gc.collect()
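# A minimal sketch of what base_aggregation presumably does here (assumption: group by
# `level`, aggregate `feature` with `method`, and return a single '<prefix>..._<method>@'
# column keyed by `level`); the toy column names below are made up.
import pandas as pd

df_toy = pd.DataFrame({'SK_ID_CURR': [1, 1, 2, 2, 3],
                       'AMT_CREDIT': [100., 200., 50., 70., 300.]})
agg = (df_toy.groupby('SK_ID_CURR')['AMT_CREDIT']
             .mean()
             .rename('p_AMT_CREDIT_mean@')
             .reset_index())
result_toy = df_toy[['SK_ID_CURR']].drop_duplicates().merge(agg, on='SK_ID_CURR', how='left')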
def multi_level_agg(df, prefix):
    # =======================================================================
    # Replace combinations of multiple categorical columns with aggregated values
    # =======================================================================
    method_list = ['mean']
    num_list = ['EXT_SOURCE_2']
    cat_list = get_categorical_features(df=df, ignore_list=ignore_list)
    cat_combi = combinations(cat_list, 2)
    #  amt_list = [col for col in num_list if col.count('AMT_')]
    #  days_list = [col for col in num_list if col.count('DAYS_')]

    # Serial version
    for com in cat_combi:
        for num in num_list:
            for method in method_list:
                base = df[[key, target] + list(com)].drop_duplicates()
                tmp = df[list(com) + [num]]
                tmp_result = base_aggregation(
                    df=tmp, level=list(com), method=method, prefix=prefix, feature=num)
                result = base.merge(tmp_result, on=list(com), how='left')
                for col in result.columns:
                    if not(col.count('@')) or col in ignore_list:
                        continue
                    train_feat = result[result[target] >= 0][col].values
                    test_feat = result[result[target].isnull()][col].values
                    col = col.replace('[', '_').replace(']', '_').replace(' ', '').replace(',', '_')
                    train_file_path = f"../features/1_first_valid/train_{col}"
                    test_file_path = f"../features/1_first_valid/test_{col}"
                    utils.to_pkl_gzip(obj=train_feat, path=train_file_path)
                    utils.to_pkl_gzip(obj=test_feat, path=test_file_path)
                    logger.info(f'''
#========================================================================
# COMPLETE MAKE FEATURE : {train_file_path}
#========================================================================''')
                del result, tmp_result
                gc.collect()
def single_level_agg(df, prefix):
    # =======================================================================
    # Replace a single categorical column with aggregated values
    # =======================================================================
    method_list = ['mean', 'var']
    num_list = ['EXT_SOURCE_2']
    cat_list = get_categorical_features(df=df, ignore_list=ignore_list)
    #  amt_list = [col for col in num_list if col.count('AMT_')]
    #  days_list = [col for col in num_list if col.count('DAYS_')]

    # Serial version
    for cat in cat_list:
        if len(df[cat].unique()) <= 3:
            continue
        for num in num_list:
            for method in method_list:
                base = df[[key, cat, target]].drop_duplicates()
                tmp = df[[cat, num]]
                tmp_result = base_aggregation(
                    df=tmp, level=cat, method=method, prefix=prefix, feature=num)
                result = base.merge(tmp_result, on=cat, how='left')
                for col in result.columns:
                    if not(col.count('@')) or col in ignore_list:
                        continue
                    train_file_path = f"../features/1_first_valid/train_{col}"
                    test_file_path = f"../features/1_first_valid/test_{col}"
                    utils.to_pkl_gzip(obj=result[result[target] >= 0][col].values, path=train_file_path)
                    utils.to_pkl_gzip(obj=result[result[target].isnull()][col].values, path=test_file_path)
                    logger.info(f'''
#========================================================================
# COMPLETE MAKE FEATURE : {train_file_path}
#========================================================================''')
                del result, tmp_result
                gc.collect()
print(""" # ============================================================================= """) # ============================================================================= # submission # ============================================================================= if EXE_SUBMIT: sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip') print('submit') utils.submit(SUBMIT_FILE_PATH, COMMENT) os.system(f'cp LOG/log_{__file__}.txt LOG/log_{__file__}_{SEED}.txt') os.system(f'gsutil cp LOG/log_{__file__}_{SEED}.txt gs://malware_onodera/') else: SUBMIT_FILE_PATH = SUBMIT_FILE_PATH.replace('.csv.gz', f'_{SEED}.pkl') utils.to_pkl_gzip(sub[['HasDetections']], SUBMIT_FILE_PATH) SUBMIT_FILE_PATH += '.gz' os.system(f'gsutil cp {SUBMIT_FILE_PATH} gs://malware_onodera/') os.system(f'cp LOG/log_{__file__}.txt LOG/log_{__file__}_{SEED}.txt') os.system(f'gsutil cp LOG/log_{__file__}_{SEED}.txt gs://malware_onodera/') """ gsutil cp gs://malware_onodera/*.gz ../output/ gsutil cp gs://malware_onodera/*.txt LOG/ """ #============================================================================== utils.end(__file__) #utils.stop_instance()
    prediction += test_pred

    y_pred = model.predict(X=x_val, batch_size=batch_size)
    stack_prediction[val_idx] = y_pred

    sc_score = roc_auc_score(y_val, y_pred)
    logger.info(f'''
#========================================================================
# FOLD {n_fold} SCORE: {sc_score}
#========================================================================''')
    cv_list.append(sc_score)

prediction /= len(kfold)
cv_score = np.mean(cv_list)
logger.info(f'''
#========================================================================
# CV SCORE: {cv_score}
#========================================================================''')

train_pred = pd.Series(stack_prediction, name='prediction').to_frame()
test_pred = pd.Series(prediction, name='prediction').to_frame()
train_pred[key] = list(train.index)
test_pred[key] = list(test.index)
df_pred = pd.concat([train_pred, test_pred], axis=0)

utils.to_pkl_gzip(
    path=f"../stack/{start_time[4:12]}_stack_{model_type}_lr{learning_rate}_{len(num_list)}feats_{len(seed_list)}seed_{batch_size/gpu_count}batch_OUT_CV{str(cv_score).replace('.', '-')}_LB",
    obj=df_pred)
    # Scoring
    err = (y_val - y_pred)
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'RMSE: {score} | SUM ERROR: {err.sum()}')
    score_list.append(score)

#========================================================================
cv_score = np.mean(score_list)
logger.info(f'''
#========================================================================
# CV SCORE AVG: {cv_score}
#========================================================================''')
#========================================================================

# Stacking
test_pred /= fold
test['prediction'] = test_pred
stack_test = test[[key, 'prediction']]

result_list.append(stack_test)
df_pred = pd.concat(result_list, axis=0, ignore_index=True).drop(target, axis=1)
df_pred = base.merge(df_pred, how='inner', on=key)
print(f"Stacking Shape: {df_pred.shape}")

utils.to_pkl_gzip(
    obj=df_pred,
    path=f'../stack/{start_time[4:12]}_elo_NN_stack_linear{is_linear*1}_{len(use_cols)}feat_lr{learning_rate}_batch{batch_size}_epoch{N_EPOCHS}_CV{cv_score}'
)
#========================================================================
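# Tiny check of the RMSE line above: sklearn's mean_squared_error returns the MSE,
# hence the explicit square root (the toy values below are made up).
import numpy as np
from sklearn.metrics import mean_squared_error

y_true = np.array([1.0, 2.0, 3.0])
y_hat = np.array([1.5, 2.0, 2.0])
rmse = np.sqrt(mean_squared_error(y_true, y_hat))   # sqrt((0.25 + 0 + 1) / 3) ≈ 0.645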
if path.count('year'):
    print(np.unique(df['year'].values))
    sys.exit()
    for year in np.unique(df['year'].values):
        base = utils.read_df_pkl('../input/base0*')
        base = base.merge(df.query(f"year=={year}"), how='left', on='card_id')
        train = base[~base['target'].isnull()]
        test = base[base['target'].isnull()]
        for col in df.columns:
            if col.count('__'):
                utils.to_pkl_gzip(
                    path=f"../features/1_first_valid/{feat_no}train_{col.replace('__', '@').replace('his_', '')}_year{year}",
                    obj=train[col].values)
                utils.to_pkl_gzip(
                    path=f"../features/1_first_valid/{feat_no}test_{col.replace('__', '@').replace('his_', '')}_year{year}",
                    obj=test[col].values)
else:
    if path.count('dow') and path.count('timezone'):
        for month in np.unique(df['latest_month_no'].values):
            for dow in np.unique(df['dow'].values):
                for timezone in np.unique(df['timezone'].values):
                    base = utils.read_df_pkl('../input/base0*')
                    base = base.merge(
    base_train = base[~base[target].isnull()]
    base_test = base[base[target].isnull()]

    scaler = StandardScaler()
    scaler.fit(train_test[use_cols])
    x_test = scaler.transform(test[use_cols])

    #========================================================================
    #  df = scaler.transform(train_test[use_cols])
    del train_test
    gc.collect()
    for num, col in enumerate(tqdm(use_cols)):
        feature = df[:, num]
        utils.to_pkl_gzip(obj=feature, path=f'../features/2_second_valid/stan_{col}')
    sys.exit()
    #========================================================================

    del train_test
    gc.collect()
else:
    base = pd.concat([train[[key, target, 'country_group']],
                      test[[key, target, 'country_group']]], axis=0)
    base_train = base[~base[target].isnull()]
    use_cols = [col for col in train.columns if col not in ignore_list]
    x_test = test[use_cols]

Y = train[target]
print(f"Train: {train.shape} | Test: {test.shape}")
# ========================================================================
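# Minimal sketch of the column-wise standardize-then-dump pattern above (toy data and
# file names; np.save stands in for the project-specific utils.to_pkl_gzip).
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1., 10.], [2., 20.], [3., 30.]])
scaled = StandardScaler().fit_transform(X)
for num, col in enumerate(['feat_a', 'feat_b']):
    np.save(f'./stan_{col}.npy', scaled[:, num])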
#  train.reset_index(inplace=True)
#  out_ids = train.loc[train.target<-30, key].values
#  out_val = train.loc[train.target<-30, target].values
#  if len(seed_list)==1:
#      out_pred = df_pred[df_pred[key].isin(out_ids)]['prediction'].values
#  else:
#      out_pred = df_pred[df_pred[key].isin(out_ids)]['pred_mean'].values
#  out_score = np.sqrt(mean_squared_error(out_val, out_pred))
#  else:
#      out_score = 0
#  else:
#      out_score = 0

# Save
utils.to_pkl_gzip(
    path=f"../stack/{start_time[4:12]}_stack_{model_type}_lr{learning_rate}_{feature_num}feats_multi{multi}_val{sys.argv[4]}_{len(seed_list)}seed_{num_leaves}leaves_iter{iter_avg}_TERM{base_term}_CV{str(cv_score).replace('.', '-')}_LB",
    obj=df_pred)

# Drop unneeded columns
drop_feim_cols = [
    col for col in cv_feim.columns
    if col.count('importance_') or col.count('rank_')
]
cv_feim.drop(drop_feim_cols, axis=1, inplace=True)
drop_feim_cols = [
    col for col in cv_feim.columns
    if col.count('importance') and not (col.count('avg'))
]
cv_feim.drop(drop_feim_cols, axis=1, inplace=True)
cv_feim.to_csv(
    f'../valid/{start_time[4:12]}_valid_{model_type}_lr{learning_rate}_{feature_num}feats_multi{multi}_val{sys.argv[4]}_{len(seed_list)}seed_{num_leaves}leaves_iter{iter_avg}_TERM{base_term}_CV{cv_score}_LB.csv',
#  utils.to_pkl_gzip(obj=base_test[[key, 'prediction']], path=save_path)

#========================================================================
cv_score = np.mean(score_list)
logger.info(f'''
#========================================================================
# CV SCORE AVG: {cv_score}
#========================================================================''')

test_pred /= fold_no + 1
base_train['prediction'] = oof_pred
base_test['prediction'] = test_pred

#========================================================================
# Stacking
if is_oof:
    df_stack = pd.concat([base_train, base_test], axis=0, ignore_index=True)
    print(f"DF Stack Shape: {df_stack.shape}")
    #========================================================================

    if is_debug:
        sys.exit()

    utils.to_pkl_gzip(
        obj=df_stack,
        path=f'../stack/{start_time}_NN_NLP_{comment}_feat{X_train.shape[1]}_fold{fold_n}_CV{cv_score}_LB'
    )
    score, tmp_oof, tmp_pred, feim, _ = ml_utils.Classifier(
        model_type=model_type,
        x_train=x_train,
        y_train=y_train,
        x_val=x_val,
        y_val=y_val,
        x_test=x_test,
        params=params,
        seed=seed,
        get_score=metric,
        get_feim=get_feim)

    logger.info(f"Fold{num_fold} CV: {score}")
    score_list.append(score)
    oof_pred[val_idx] = tmp_oof
    y_test += tmp_pred

y_test /= len(get_fold_list)

pred_col = 'prediction'
base[pred_col] = np.hstack((oof_pred, y_test))
base = base[[key, pred_col]]

#========================================================================
# Saving
utils.to_pkl_gzip(
    obj=base,
    path=f'../output/{start_time[4:12]}_stack_{model_type}_FOLD-{get_fold}_feat{n_features}_seed{seed}_{comment}'
)
#========================================================================
#      return pararell_write_lda_feat(*args)
#  def pararell_write_lda_feat(i, bow):
#      tmp = np.zeros(topics+1)
# ===
topic = lda.get_document_topics(bow)
for (tp_no, prob) in topic:
    mx[i][tp_no] = prob
    # Pararell
    #  tmp[tp_no] = prob
    #  tmp[topics+1] = i
    #  return tmp
#  p_list = pararell_process(pararell_wrapper, arg_list)

cols = [f"{feat_no}_topic{i}@" for i in range(20)]
df_lda = pd.DataFrame(mx, columns=cols)
train_idx = train.index
test_idx = test.index
lda_train = df_lda.loc[train_idx, :]
lda_test = df_lda.loc[test_idx, :]

#========================================================================
# Save Feature
#========================================================================
logger.info("Save Features...")
for col in lda_train.columns:
    utils.to_pkl_gzip(obj=lda_train[col].values, path=f'../features/1_first_valid/train_{col}')
    utils.to_pkl_gzip(obj=lda_test[col].values, path=f'../features/1_first_valid/test_{col}')
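# Toy sketch of how lda.get_document_topics fills the (doc x topic) matrix mx above
# (assumes gensim; the documents, topic count, and variable names here are made up).
import numpy as np
from gensim import corpora
from gensim.models import LdaModel

docs = [['cat', 'dog'], ['dog', 'fish'], ['cat', 'fish', 'bird']]
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(d) for d in docs]
lda_toy = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, random_state=0)
mx_toy = np.zeros((len(corpus), 2))
for i, bow in enumerate(corpus):
    for tp_no, prob in lda_toy.get_document_topics(bow, minimum_probability=0.0):
        mx_toy[i][tp_no] = prob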
#========================================================================
cv_score = np.mean(score_list)
#========================================================================

# Stacking
test_pred /= fold_no + 1
test['prediction'] = test_pred
stack_test = test[[key, 'prediction']]

result_list.append(stack_test)
df_pred = pd.concat(result_list, axis=0, ignore_index=True).drop(target, axis=1)
if key not in base:
    base.reset_index(inplace=True)
df_pred = base[[key, target]].merge(df_pred, how='inner', on=key)

print(f'''
# =====================================================================
# SCORE AVG: {cv_score}
# =====================================================================''')

#========================================================================
# Save Stack
feature = df_pred['prediction'].values
utils.to_pkl_gzip(
    path=f"../features/1_first_valid/{start_time[4:12]}_stack_{model_type}_set-{set_type}_valid-{valid_type}_seed{fold_seed}_feat{len(use_cols)}_CV{cv_score}_LB",
    obj=feature)
#========================================================================
#========================================================================
cv_score = LGBM.cv_score
test_pred = LGBM.prediction
cv_feim = LGBM.cv_feim
feature_num = len(LGBM.use_cols)

cv_feim.to_csv(
    f'../valid/{start_time[4:12]}_{model_type}_{fname}_feat{feature_num}_CV{cv_score}_lr{learning_rate}.csv',
    index=False)
#========================================================================

# STACKING
if len(stack_name) > 0:
    logger.info(f'result_stack shape: {LGBM.result_stack.shape}')
    utils.to_pkl_gzip(
        path=f"../stack/{start_time[4:12]}_{stack_name}_{model_type}_CV{str(cv_score).replace('.', '-')}_{feature_num}features",
        obj=LGBM.result_stack)

logger.info(
    f'FEATURE IMPORTANCE PATH: {HOME}/kaggle/home-credit-default-risk/output/cv_feature{feature_num}_importances_{metric}_{cv_score}.csv'
)
#========================================================================

#========================================================================
# Submission
if len(submit) > 0:
    submit[target] = test_pred
    submit.to_csv(
        f'../submit/{start_time[4:12]}_submit_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
        index=False)
#========================================================================
if len(stack_name) > 0:
    logger.info(f'result_stack shape: {df_pred.shape}')
    if len(seed_list) > 1:
        pred_cols = [col for col in df_pred.columns if col.count('predict')]
        df_pred['pred_mean'] = df_pred[pred_cols].mean(axis=1)
        df_pred['pred_std'] = df_pred[pred_cols].std(axis=1)
#========================================================================

#========================================================================
# Result
cv_score = np.mean(cv_list)
iter_avg = int(np.mean(iter_list))  # builtin int; the np.int alias is deprecated
#========================================================================

logger.info(f'''
#========================================================================
# {len(seed_list)}SEED CV SCORE AVG: {cv_score}
#========================================================================''')

# Save
try:
    if int(sys.argv[2]) == 0:
        utils.to_pkl_gzip(
            path=f"../stack/{start_time[4:12]}_stack_pred_{stack_name}_lr{learning_rate}_{feature_num}feats_{len(seed_list)}seed_{num_leaves}leaves_iter{iter_avg}_CV{str(cv_score).replace('.', '-')}",
            obj=df_pred)
except ValueError:
    pass
except TypeError:
    pass
#========================================================================
# CV SCORE AVG: {cv_score}
#========================================================================''')

#========================================================================
# Stacking
test_pred /= fold
test['prediction'] = test_pred
stack_test = test[[key, 'prediction']]

result_list.append(stack_test)
df_pred = pd.concat(result_list, axis=0, ignore_index=True).drop(target, axis=1)
df_pred = base.merge(df_pred, how='inner', on=key)
print(f"Stacking Shape: {df_pred.shape}")

utils.to_pkl_gzip(obj=df_pred, path=f'../output/{start_time[4:11]}_elo_NN_stack_CV{score}')
#========================================================================
sys.exit()

#========================================================================
# Part of card_id Score
bench = pd.read_csv('../input/bench_LB3684_FAM_cv_score.csv')
part_score_list = []
part_N_list = []
fam_list = []
#  for i in range(201101, 201713, 1):
for i in range(201501, 201713, 1):
    fam = str(i)[:4] + '-' + str(i)[-2:]
    df_part = base_train[base_train['first_active_month'] == fam]
    if len(df_part) < 1:
    test_pred += np.squeeze(model.predict(x_test))
    #========================================================================

    #========================================================================
    # Scoring
    score = roc_auc_score(y_val, y_pred)
    print(f'AUC: {score}')
    score_list.append(score)
    #========================================================================

cv_score = np.mean(score_list)
logger.info(f'''
#========================================================================
# CV SCORE AVG: {cv_score}
#========================================================================''')

test_pred /= num_ + 1
base_train['prediction'] = oof_pred
base_test['prediction'] = test_pred

#========================================================================
# Stacking
df_stack = pd.concat([base_train, base_test], axis=0, ignore_index=True)
print(f"DF Stack Shape: {df_stack.shape}")

utils.to_pkl_gzip(
    obj=df_stack[[key, 'prediction']],
    path=f'../stack/{start_time[4:12]}_MS_stack_NN_E{set_no+1}_batch{batch_size}_epoch{N_EPOCHS}_CV{cv_score}'
)
#========================================================================
df['dima_ir_max@'] = df[ir_cols].max(axis=1)
df['dima_ir_min@'] = df[ir_cols].min(axis=1)
ir_cols = [col for col in df.columns if col.count('dima') and col.count('ir')]

# CNT_PAYMENT-related features -> they seem to overfit?
#  df['dima_Pred_CPY_diff_lengthX@'] = df['CNT_PAYMENT'].values - df['dima_lengthX@'].values
#  df['dima_Cal_CPY_diff_lengthX@'] = df['dima_lengthX@'].values - (df['AMT_CREDIT'].values / df['AMT_ANNUITY'].values)
#  train_file_path = f"../features/1_first_valid/train_{cpy}"
#  test_file_path = f"../features/1_first_valid/test_{cpy}"

# Feature Save
for col in ir_cols:
    if not(col.count('@')) or col in ignore_list:
        continue
    if not(col.count('ir_3@')) and not(col.count('ir_6@')) and not(col.count('ir_9@')):
        continue
    train_feat = df[df[target] >= 0][col].values
    test_feat = df[df[target].isnull()][col].values
    col = col.replace('[', '_').replace(']', '_').replace(' ', '').replace(',', '_')
    train_file_path = f"../features/1_first_valid/train_{col}"
    test_file_path = f"../features/1_first_valid/test_{col}"
    utils.to_pkl_gzip(obj=train_feat, path=train_file_path)
    utils.to_pkl_gzip(obj=test_feat, path=test_file_path)
    logger.info(f'''
#========================================================================
# COMPLETE MAKE FEATURE : {train_file_path}
#========================================================================''')
print('with mybest:', sub['HasDetections'].corr(sub_best['HasDetections'], method='spearman'))

print("""
# =============================================================================
# write down this info to benchmark.xlsx
# =============================================================================
""")
[print(f'{k:<25}: {RESULT_DICT[k]}') for k in RESULT_DICT]
print("""
# =============================================================================
""")

# save
#sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')
utils.to_pkl_gzip(sub[['HasDetections']], SUBMIT_FILE_PATH.replace('.csv.gz', f'_{SEED}.pkl'))

# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    print('submit')
    utils.submit(SUBMIT_FILE_PATH, COMMENT)

#==============================================================================
utils.end(__file__)
#utils.stop_instance()
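# Toy check of the Spearman correlation line above (the values are made up).
import pandas as pd

a = pd.Series([0.1, 0.4, 0.35, 0.8])
b = pd.Series([0.05, 0.5, 0.3, 0.9])
print(a.corr(b, method='spearman'))   # 1.0 here: the two rankings agree exactly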
# Convert the fold indices into lists of card_id
trn_list = []
val_list = []
for trn, val in kfold:
    trn_ids = train.iloc[trn][key].values
    val_ids = train.iloc[val][key].values
    trn_list.append(trn_ids)
    val_list.append(val_ids)
kfold = [trn_list, val_list]
#  else:
#      kfold = False
#      fold_type = 'kfold'
#========================================================================

if not (os.path.exists(kfold_path)):
    utils.to_pkl_gzip(obj=kfold, path=kfold_path)

train.sort_index(axis=1, inplace=True)
test.sort_index(axis=1, inplace=True)

#========================================================================
# Train & Prediction Start
#========================================================================
LGBM = LGBM.cross_prediction(train=train,
                             test=test,
                             key=key,
                             target=target,
                             fold_type=fold_type,
                             fold=fold,
                             group_col_name=group_col_name,
                             params=params,
# =============================================================================
# main
# =============================================================================
if __name__ == "__main__":
    utils.start(__file__)

    # train
    tr = utils.load_train(['object_id'])
    df = pd.read_pickle('../FROM_MYTEAM/LCfit_feature_allSN_r_train_v3_20181215.pkl.gz')
    df = pd.merge(tr, df, on='object_id', how='left')
    df.reset_index(drop=True, inplace=True)
    get_feature(df)

    del df['object_id']
    df.add_prefix(PREF+'_').to_pickle(f'../data/train_{PREF}.pkl')

    # test
    te = utils.load_test(['object_id'])
    df = pd.read_pickle('../FROM_MYTEAM/LCfit_feature_allSN_r_test_v3_20181215.pkl.gz')
    df = pd.merge(te, df, on='object_id', how='left')
    df.reset_index(drop=True, inplace=True)
    get_feature(df)

    del df['object_id']
    df = df.add_prefix(PREF+'_')
    utils.to_pkl_gzip(df, f'../data/test_{PREF}.pkl')

    utils.end(__file__)
for path in path_list:
    #  fname = 'his_' + re.search(r'his_([^/.]*).csv', path).group(1)
    fname = re.search(r'feat([^/.]*)_auth', path).group(1)
    feat_no = f"{fname}_au1_"
    df = pd.read_csv(path)
    base = utils.read_df_pkl('../input/base0*')
    base = base.merge(df, how='left', on='card_id')
    train = base[~base['target'].isnull()]
    test = base[base['target'].isnull()]
    for col in df.columns:
        if col.count('__'):
            utils.to_pkl_gzip(
                path=f"../features/1_first_valid/{feat_no}train_{col.replace('__', '@').replace('his_', '')}@",
                obj=train[col].values)
            utils.to_pkl_gzip(
                path=f"../features/1_first_valid/{feat_no}test_{col.replace('__', '@').replace('his_', '')}@",
                obj=test[col].values)
        else:
            utils.to_pkl_gzip(
                path=f"../features/1_first_valid/{feat_no}train_{col}@",
                obj=train[col].values)
            utils.to_pkl_gzip(
                path=f"../features/1_first_valid/{feat_no}test_{col}@",
                obj=test[col].values)
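# Tiny illustration of the regex above that builds feat_no (the file name is made up).
import re

fname_toy = re.search(r'feat([^/.]*)_auth', 'feat101_auth_agg.csv').group(1)
feat_no_toy = f"{fname_toy}_au1_"   # fname_toy == '101', so feat_no_toy == '101_au1_'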
    #  score, tmp_oof, tmp_pred, feim = ml_utils.Classifier(
    score, tmp_oof, tmp_pred, feim, model = ml_utils.Regressor(
        model_type=model_type,
        x_train=x_train,
        y_train=y_train,
        x_val=x_val,
        y_val=y_val,
        x_test=x_test,
        params=params,
        seed=seed,
        get_score=metric,
        get_model=get_model)

    if get_model:
        utils.to_pkl_gzip(
            obj=model,
            path=f'../model/{start_time[4:11]}_{comment}_{target}_{model_type}_fold{num_fold}_feat{len(use_cols)}_{metric}-{score}'
        )
        del model
        gc.collect()

    feim_list.append(
        feim.set_index('feature').rename(
            columns={'importance': f'imp_{num_fold}'}))

    logger.info(f"Fold{num_fold} CV: {score}")
    score_list.append(score)
    oof_pred[val_idx] = tmp_oof
    y_test += tmp_pred

feim = pd.concat(feim_list, axis=1)
    score, tmp_oof, tmp_pred, feim, _ = ml_utils.Classifier(
        model_type=model_type,
        x_train=x_train,
        y_train=y_train,
        x_val=x_val,
        y_val=y_val,
        x_test=x_test,
        params=params,
        seed=seed,
        get_score=metric)

    feim_list.append(
        feim.set_index('feature').rename(
            columns={'importance': f'imp_{num_fold}'}))

    logger.info(f"Fold{num_fold} CV: {score}")
    score_list.append(score)
    oof_pred[val_idx] = tmp_oof
    y_test += tmp_pred

pred_col = 'prediction'
base[pred_col] = np.hstack((oof_pred, y_test))
base = base[[key, pred_col]]

#========================================================================
# Saving
utils.to_pkl_gzip(
    obj=base,
    path=f'../output/{start_time[4:12]}_stack_{model_type}_FOLD-{get_fold}_feat{len(x_train.columns)}_{comment}'
)
#========================================================================
    how='inner')
df_stack = pd.concat([train, test], ignore_index=True, axis=0)
print(f"After Stack Shape: {df_stack.shape}")

y_train = train[target].values
y_pred = train[pred_col].values
from sklearn.metrics import roc_auc_score
cv_score = roc_auc_score(y_train, y_pred)
logger.info(f'''
#========================================================================
# CV: {cv_score}
#========================================================================''')

#========================================================================
# Saving
feim.to_csv(
    f'../valid/{start_time[4:12]}_{model_type}_SET-{set_type}_feat{len(x_train.columns)}_{comment}_CV{str(cv_score)[:7]}_LB.csv',
    index=True)

utils.to_pkl_gzip(
    obj=df_stack,
    path=f'../stack/{start_time[4:12]}_{model_type}_SET-{set_type}_feat{len(x_train.columns)}_{comment}_CV{str(cv_score)[:7]}_LB'
)

submit = pd.read_csv('../input/sample_submission.csv').set_index(key)
submit[target] = test[pred_col].values
submit.to_csv(
    f'../submit/{start_time[4:12]}_{model_type}_SET-{set_type}_feat{len(x_train.columns)}_{comment}_CV{str(cv_score)[:7]}_LB.csv',
    index=True)
#========================================================================
# Standardize within Test
test = df[df[target].isnull()]
test['bur_bin'] = 'test'
df = pd.concat([train, test], axis=0).sort_index()

#========================================================================
# Compute the Interest Rate of the Current Application
#========================================================================
# CNT_PAYMENT
file_path = f"../features/1_first_valid/"

# Current Application CNT_PAYMENT Save as Feature
#  utils.to_pkl_gzip(obj=df[~df[target].isnull()][cpy].values, path=file_path+f'train_{cpy}@')
#  utils.to_pkl_gzip(obj=df[df[target].isnull()][cpy].values, path=file_path+f'test_{cpy}@')
#  utils.to_pkl_gzip(obj=df[~df[target].isnull()]['Pred_CPY_diff_Cal_CPY@'].values, path=file_path+'train_Pred_CPY_diff_Cal_CPY@')
#  utils.to_pkl_gzip(obj=df[df[target].isnull()]['Pred_CPY_diff_Cal_CPY@'].values, path=file_path+'test_Pred_CPY_diff_Cal_CPY@')
#  sys.exit()

# It is unclear over how many payments the interest accrues, so build features
# for payment counts in steps of 3
for cnt in range(9, 40, 3):
    #  for cnt in range(27, 46, 3):