def get_oof_feature(oof_path='../oof_feature/*.gz', key='', pred_col='prediction'):
    feat_path_list = glob.glob(oof_path)
    oof_list = []
    for path in feat_path_list:
        oof = utils.read_pkl_gzip(path)
        oof_name = oof.columns.tolist()[1]
        oof = oof.set_index(key)[pred_col]
        oof.name = "oof_" + oof_name
        oof_list.append(oof)
    df_oof = pd.concat(oof_list, axis=1)
    return df_oof
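# Hypothetical usage of get_oof_feature (the key name 'card_id' is an assumption):
#   df_oof = get_oof_feature(key='card_id')
#   train = train.merge(df_oof, left_on='card_id', right_index=True, how='left')

# The next snippet calls rmse() without defining it. A minimal sketch, assuming
# the usual root-mean-squared-error over aligned arrays:
import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    # Root of the average squared residual.
    return np.sqrt(mean_squared_error(y_true, y_pred))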
dfx = dfx.merge(dd2)
dfx['predswo'] = dfx['predswo1'] * 0.5 + dfx['predswo2'] * 0.5
dfx['errorwo'] = (dfx['target'] - dfx['predswo']) ** 2
dfx['errorwo1'] = (dfx['target'] - dfx['predswo1']) ** 2
dfx['errorwo2'] = (dfx['target'] - dfx['predswo2']) ** 2

# ft = pd.read_pickle('ft_refminmax5.pkl')
# print(ft['type'].value_counts())
# dfx = dfx.merge(ft, how='left')
# print(dfx)
ft = pd.read_csv('../input/card_ids_grouping.csv')[['card_id', 'type']]
# ft2 = pd.read_csv('0219_go_elo_classifier_pred_NoOutlierFlg.csv')[['card_id', 'no_out_flg', 'clf_pred']]
ft2 = utils.read_pkl_gzip('../input/base_no_out_clf.gz')[['card_id', 'no_out_flg', 'clf_pred']]
dfx = dfx.merge(ft, how='left').merge(ft2, how='left')
# print(dfx)

dfx['targeto'] = dfx['target'].apply(lambda x: 1 if x < -20 else 0)
print(dfx.groupby(['no_out_flg', 'type'])['targeto'].agg(['mean', 'sum', 'size']).reset_index())

print('error preds', rmse(dfx['target'], dfx['preds']))
print('error wo1  ', rmse(dfx['target'], dfx['predswo1']))
print('error wo2  ', rmse(dfx['target'], dfx['predswo2']))
print('error wo   ', rmse(dfx['target'], dfx['predswo']))

sel = (dfx['type'] == 0)
# dfx.loc[sel, 'preds'] = (dfx.loc[sel, 'predsmo']*0.2 + dfx.loc[sel, 'preds']*0.8)
sel = (dfx['type'] == 2)
dfx.loc[sel, 'preds'] = (dfx.loc[sel, 'predswo'] * 0.65 + dfx.loc[sel, 'preds'] * 0.35)
train_feature_path = use_feature10_path + use_feature90_path
test_feature_path = []
for path in train_feature_path:
    test_feature_path.append(path.replace('train', 'test'))

train_feature_list = utils.pararell_load_data(path_list=train_feature_path)
test_feature_list = utils.pararell_load_data(path_list=test_feature_path)
train = pd.concat(train_feature_list, axis=1)
train = pd.concat([base_train, train], axis=1)
test = pd.concat(test_feature_list, axis=1)
test = pd.concat([base_test, test], axis=1)

if i % 10 == 0:
    outlier_pred = utils.read_pkl_gzip(
        '../stack/1204_211_outlier_classify_lgb_auc0-8952469653357074_227features.gz'
    ).set_index(key)
    train['outlier_pred@'] = outlier_pred.loc[train_id, 'prediction'].values
    test['outlier_pred@'] = outlier_pred.loc[test_id, 'prediction'].values
#=====================================================================

#========================================================================
# LGBM Setting
metric = 'rmse'
fold = 5
fold_type = 'kfold'
group_col_name = ''
dummie = 1
oof_flg = True
LGBM = lgb_ex(logger=logger,
utils.to_pkl_gzip(obj=df_tfidf, path='./df_tfidf')

# Load Text List
train_text_list = list(train[qt].values)
test_text_list = list(test[qt].values)

# Cleanse the text in parallel
logger.info("Cleansing Text...")

def pararell_cleansing(tx):
    return cleansing_text(tx)

train_text_list = pararell_process(pararell_cleansing, train_text_list)
test_text_list = pararell_process(pararell_cleansing, test_text_list)
text_list = train_text_list + test_text_list

# TFIDF
get_tfidf(text_list)
df_tfidf = utils.read_pkl_gzip(path='./df_tfidf.gz')

if is_svd:
    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=100, random_state=1208)
    svd_tfidf = svd.fit_transform(df_tfidf)
    col_names = [f"{feat_no}_svd100_tfidf100k_{i}@" for i in range(100)]
    df_svd = pd.DataFrame(svd_tfidf, columns=col_names)

    train_idx = train.index
    test_idx = test.index
    svd_train = df_svd.loc[train_idx, :]
    svd_test = df_svd.loc[test_idx, :]
    print(svd_train.shape)
    print(svd_test.shape)
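# get_tfidf() and cleansing_text() are not defined in this snippet. A minimal
# sketch of what get_tfidf presumably does, assuming a scikit-learn
# TfidfVectorizer capped at 100k features (suggested by the "tfidf100k" in the
# SVD column names) and that utils.to_pkl_gzip appends the '.gz' suffix, as the
# read path './df_tfidf.gz' above implies:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf(text_list, max_features=100000):
    # Fit TF-IDF over the combined train+test corpus and persist the sparse
    # matrix so the caller can reload it with utils.read_pkl_gzip.
    vectorizer = TfidfVectorizer(max_features=max_features)
    df_tfidf = vectorizer.fit_transform(text_list)
    utils.to_pkl_gzip(obj=df_tfidf, path='./df_tfidf')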
if is_num:
    train_word_sequences = np.hstack((train_word_sequences, num_train.values))
    test_word_sequences = np.hstack((test_word_sequences, num_test.values))
print(f"Train: {train_word_sequences.shape} | Test: {test_word_sequences.shape}")
print(train_word_sequences[:1])
print(test_word_sequences[:1])
#========================================================================

#========================================================================
# Make Validation
seed = 1208
fold_n = 5
base = utils.read_df_pkl('../input/base_group*')
vi = utils.read_pkl_gzip('../input/f000_AvSigVersion.gz')
vi_col = 'f000_AvSigVersion'
base[vi_col] = vi
base_train = base[~base[target].isnull()]
base_test = base[base[target].isnull()]
base_train.sort_values(vi_col, inplace=True)

if is_debug:
    base_train = base_train[[key, target]].head(10000)
    base_test = base_test[[key, target]].head(1000)
else:
    base_train = base_train[[key, target]]
    base_test = base_test[[key, target]]

from sklearn.model_selection import KFold
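# The split itself is cut off here. A minimal sketch of how the imported KFold
# would typically be used on the AvSigVersion-sorted train rows (an assumption:
# shuffle=False to preserve the time ordering established by sort_values above):
folds = KFold(n_splits=fold_n, shuffle=False)
kfold = list(folds.split(base_train))
for n_fold, (trn_idx, val_idx) in enumerate(kfold):
    # trn_idx / val_idx are positional indices into the sorted base_train.
    print(f"fold {n_fold}: train={len(trn_idx)} valid={len(val_idx)}")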
# length = len(train)
# for col in train.columns:
#     tmp = train[col].dropna().shape[0]
#     if length - tmp > 0:
#         print(col)
#     inf_max = train[col].max()
#     inf_min = train[col].min()
#     if inf_max == np.inf or inf_min == -np.inf:
#         print(col, inf_max, inf_min)
# #========================================================================

#========================================================================
# Prepare CV
fold = 6
kfold = utils.read_pkl_gzip('../input/ods_kfold.gz')
use_cols = [col for col in train.columns if col not in ignore_list]

scaler = StandardScaler()
# For some reason the first fit occasionally fails, so collect the columns
# that contain inf values and handle them before retrying.
try:
    scaler.fit(pd.concat([train[use_cols], test[use_cols]]))
except ValueError:
    inf_col_list = []
    for col in use_cols:
        inf_max = train[col].max()
        inf_min = train[col].min()
        if inf_max == np.inf or inf_min == -np.inf:
            inf_col_list.append(col)
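    # The except branch stops at collecting inf_col_list. A plausible
    # continuation (an assumption, not the original code): neutralise the
    # +/-inf values and refit, since StandardScaler (scikit-learn >= 0.20)
    # tolerates NaN during fit but rejects inf.
    for col in inf_col_list:
        train[col] = train[col].replace([np.inf, -np.inf], np.nan)
        test[col] = test[col].replace([np.inf, -np.inf], np.nan)
    scaler.fit(pd.concat([train[use_cols], test[use_cols]]))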
LGBM = lgb_ex(logger=logger,
              metric=metric,
              model_type=model_type,
              ignore_list=ignore_list)
LGBM.seed = seed
#========================================================================

if len(path[0]) > 0:
    train_path = path[0]
    test_path = path[1]
    train_feat = utils.get_filename(path=train_path, delimiter='gz')
    train_feat = train_feat[14:]
    test_feat = utils.get_filename(path=test_path, delimiter='gz')
    test_feat = test_feat[13:]
    train[train_feat] = utils.read_pkl_gzip(train_path)
    test[train_feat] = utils.read_pkl_gzip(test_path)
else:
    train_feat = 'base'

logger.info(f'''
#========================================================================
# No: {i}/{len(train_feat_list)}
# Valid Feature: {train_feat}
#========================================================================''')

#========================================================================
# Train & Prediction Start
#========================================================================
LGBM = LGBM.cross_prediction(train=train,
    if debug:
        train = train.head(10000)
        test = test.head(1000)

    for col in train.columns:
        if col in ignore_list:
            continue
        train[col] = utils.impute_feature(df=train, col=col)
        test[col] = utils.impute_feature(df=test, col=col)

    return train, test


model_no = 0
base = utils.read_pkl_gzip('../input/base_type_group.gz')[[key, target]]
base_train, base_test = get_dataset(base, model_no)

#========================================================================
# Make Dataset
pred_col = 'prediction'
valid_type = 'ods'
set_type = 'all'
#========================================================================

#========================================================================
# Prepare CV
fold_seed = 328
fold = 6
#========================================================================
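# The kfold file read below via kfold_path is created elsewhere. A minimal
# sketch of how such fold indices might be generated and cached once, assuming
# a plain shuffled KFold keyed by fold_seed and the utils.to_pkl_gzip
# convention of appending '.gz' to the given path:
from sklearn.model_selection import KFold

def make_kfold(train, target, fold, fold_seed, kfold_path):
    # Build (train_idx, valid_idx) pairs once and persist them so every
    # experiment reuses identical folds.
    folds = KFold(n_splits=fold, shuffle=True, random_state=fold_seed)
    kfold = list(folds.split(train, train[target]))
    utils.to_pkl_gzip(obj=kfold, path=kfold_path.replace('.gz', ''))
    return kfold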
ignore_list = [
    key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term',
    'no_out_flg', 'clf_pred'
]
# ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term']

use_cols = [col for col in train.columns if col not in ignore_list]
scaler = StandardScaler()
scaler.fit(pd.concat([train[use_cols], test[use_cols]]))
x_test = scaler.transform(test[use_cols])

if out_part == 'no_out':
    train = train[train[target] > -30]

kfold_path = f'../input/kfold_{valid_type}_{out_part}_fold{fold}_seed{fold_seed}.gz'
if os.path.exists(kfold_path):
    kfold = utils.read_pkl_gzip(kfold_path)
Y = train[target]
# ========================================================================
print(f"Train: {train.shape} | Test: {test.shape}")

# ========================================================================
# Model Setting
params = {}

def select_model(model_type, seed=1208):
    if model_type == 'ridge':
        params['solver'] = 'auto'
        params['fit_intercept'] = True
        params['alpha'] = 0.4
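        # select_model is cut off after the ridge hyper-parameters. A hedged
        # sketch of how the branch might plausibly finish; the Ridge import
        # and random_state handling are assumptions, not the original code.
        from sklearn.linear_model import Ridge
        params['random_state'] = seed
        return Ridge(**params)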
target = 'TARGET'
ignore_list = [key, target, 'SK_ID_BUREAU', 'SK_ID_PREV']

# ===========================================================================
# DATA LOAD
# ===========================================================================
base = utils.read_df_pkl(path='../input/base_app*')
fname = 'app'
prefix = f'{feat_no}{fname}_'

df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')
train = df[~df[target].isnull()]
test = df[df[target].isnull()]

neighbor = '110_app_neighbor81@'
train[neighbor] = utils.read_pkl_gzip('../input/[email protected]')
test[neighbor] = utils.read_pkl_gzip('../input/[email protected]')
# combi = [neighbor, cat]  # dead code: cat is not defined until the loop below
cat_list = get_categorical_features(df=df, ignore_list=ignore_list)

#========================================================================
# TARGET ENCODING
#========================================================================
for cat in cat_list:
    combi = cat
    feat_train, feat_test = target_encoding(logger=logger,
                                            train=train,
                                            test=test,
                                            key=key,
                                            level=combi,
                                            target=target,
base_test = base[base[target].isnull()].reset_index(drop=True)

feature_list = utils.parallel_load_data(path_list=win_path_list)
df_feat = pd.concat(feature_list, axis=1)
train = pd.concat([base_train, df_feat.iloc[:len(base_train), :]], axis=1)
test = pd.concat([base_test, df_feat.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)
train_test = pd.concat([train, test], axis=0)

#========================================================================
# card_id list by first active month
try:
    sys.argv[5]
    train_latest_id_list = np.load('../input/card_id_train_first_active_201712.npy')
    test_latest_id_list = np.load('../input/card_id_test_first_active_201712.npy')
    train = train.loc[train[key].isin(train_latest_id_list), :].reset_index(drop=True)
    test = test.loc[test[key].isin(test_latest_id_list), :].reset_index(drop=True)
    submit = []
except IndexError:
    pass
#========================================================================

model_list = utils.read_pkl_gzip('../model/201712/0122_elo_first_month201712_10seed_fold_model_list.gz')
use_cols = pd.read_csv('../model/201712/0122_elo_first_month201712_fold_model_use_cols.csv').values.reshape(-1,)

pred = np.zeros(len(train_test))
for model in model_list:
    pred += model.predict(train_test[use_cols])
pred /= len(model_list)

feature_name = '014_l02_elo_first_month201712_prediction'
utils.to_pkl_gzip(obj=pred, path=f'../features/5_tmp/{feature_name}')
#========================================================================
# Result
cv_score = np.mean(cv_list)
iter_avg = int(np.mean(iter_list))
#========================================================================
logger.info(f'''
#========================================================================
# {len(seed_list)}SEED CV SCORE AVG: {cv_score}
#========================================================================''')

#========================================================================
# Part of card_id Score
# bench = pd.read_csv('../input/bench_LB3684_FAM_cv_score.csv')
bench = utils.read_pkl_gzip(
    '../stack/0206_125_stack_lgb_lr0.01_235feats_10seed_70leaves_iter1164_OUT29.8269_CV3-6215750935280235_LB.gz'
)[[key, 'pred_mean']].rename(columns={'pred_mean': 'bench_pred'})
df_pred = df_pred.merge(bench, on=key, how='inner')

part_score_list = []
part_N_list = []
fam_list = []
base_train['first_active_month'] = base_train['first_active_month'].map(
    lambda x: str(x)[:7])
for i in range(201501, 201713, 1):
    fam = str(i)[:4] + '-' + str(i)[-2:]
    df_part = base_train[base_train['first_active_month'] == fam]
    if len(df_part) < 1:
        continue
    part_id_list = df_part[key].values
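    # The loop body is cut off here. A plausible continuation (an assumption:
    # df_pred is presumed to carry the target and a pred_mean column), scoring
    # each first_active_month group against the bench predictions merged above:
    df_month = df_pred[df_pred[key].isin(part_id_list)]
    part_score_list.append(rmse(df_month[target], df_month['pred_mean']))
    part_N_list.append(len(df_month))
    fam_list.append(fam)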
#========================================================================
# Data Load
win_path = f'../features/4_winner/*.gz'
model_path_list = [
    f'../model/LB3670_70leaves_colsam0322/*.gz',
    '../model/E2_lift_set/*.gz',
    '../model/E3_PCA_set/*.gz',
    '../model/E4_mix_set/*.gz'
]
model_path = model_path_list[model_no]
win_path_list = glob.glob(model_path)
# win_path_list = glob.glob(model_path) + glob.glob(win_path) + glob.glob('../features/5_tmp/*.gz')

base = utils.read_pkl_gzip('../input/base_no_out_clf.gz')[[
    key, target, col_term, 'first_active_month', no_flg, 'clf_pred'
]]
# base = utils.read_df_pkl('../input/base_term*')[[key, target, col_term, 'first_active_month']]
base[col_term] = base[col_term].map(
    lambda x:
    24 if 19 <= x else
    18 if 16 <= x <= 18 else
    15 if 13 <= x <= 15 else
    12 if 9 <= x <= 12 else
    8 if 6 <= x <= 8 else
    5 if x == 5 else
    4)

# nn_stack_plus = utils.read_pkl_gzip('../ensemble/NN_ensemble/0213_142_elo_NN_stack_E1_row99239_outpart-all_235feat_const1_lr0.001_batch128_epoch30_CV1.2724309982670599.gz')[[key, 'prediction']].set_index(key)
# nn_stack_minus = utils.read_pkl_gzip('../ensemble/NN_ensemble/0213_145_elo_NN_stack_E1_row104308_outpart-all_235feat_const1_lr0.001_batch128_epoch30_CV4.864183650939903.gz')[[key, 'prediction']].set_index(key)
# base.set_index(key, inplace=True)
# base['nn_plus'] = nn_stack_plus['prediction']
# base['nn_minus'] = nn_stack_minus['prediction']
# base.reset_index(inplace=True)

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils

path_list = glob.glob('../features/1_first_valid/*.gz')
key = ''
old_key = '_pts_'
new_key = '_pst_'

for path in path_list:
    if path.count(key):
        # if not path.count(key):
        feature = utils.read_pkl_gzip(path)
        # str.replace removes every '.gz' occurrence in a single call
        rename_path = path.replace(old_key, new_key).replace('.gz', '')
        utils.to_pkl_gzip(obj=feature, path=rename_path)
        os.system(f'rm {path}')
    else:
        feature = utils.read_pkl_gzip(path)
        rename_path = path.replace('.gz', '')
        utils.to_pkl_gzip(obj=feature, path=rename_path)
        os.system(f'rm {path}')
train = pd.concat([base_train, df_feat.iloc[:len(base_train), :]], axis=1)
test = pd.concat(
    [base_test, df_feat.iloc[len(base_train):, :].reset_index(drop=True)],
    axis=1)
self_predict = train.copy()
y = train[target].values

#========================================================================
# Outlier Setting
if out_part == 'part':
    # Exclude Difficult Outlier
    # clf_result = utils.read_pkl_gzip('../stack/0111_145_outlier_classify_9seed_lgb_binary_CV0-9045939277654236_188features.gz')[[key, 'prediction']]
    clf_result = utils.read_pkl_gzip(
        '../stack/0130_214_outlier_classify_9seed_lgb_binary_CV0-9044513544501314_172features.gz'
    )[[key, 'pred_mean']]
    train = train.merge(clf_result, how='inner', on=key)
    # tmp1 = train[train.prediction>0.01]
    # tmp2 = train[train.prediction<0.01][train.target>-30]
    tmp1 = train[train.pred_mean > 0.01]
    tmp2 = train[(train.pred_mean < 0.01) & (train.target > -30)]
    train = pd.concat([tmp1, tmp2], axis=0, ignore_index=True)
    del tmp1, tmp2
    gc.collect()
    # train.drop('prediction', axis=1, inplace=True)
    train.drop('pred_mean', axis=1, inplace=True)
elif out_part == 'all':
    # Exclude Outlier
    train = train[train.target > -30]
params['subsample'] = 0.9
params['colsample_bytree'] = 0.3
params['min_child_samples'] = 30
try:
    colsample_bytree = float(sys.argv[8])
    params['colsample_bytree'] = colsample_bytree
except IndexError:
    colsample_bytree = params['colsample_bytree']

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

#========================================================================
# Data Load
base = utils.read_pkl_gzip('../input/base_type_group.gz')[[
    key, target, col_term, 'first_active_month', no_flg, 'clf_pred', 'group'
]]
gr_col = 'group'
# tmp = utils.read_pkl_gzip('../stack/0223_222_stack_future_amount_pred_fold4_leaves16_AUC_CV0.2999066985403468.gz').set_index(key)
# tmp2 = utils.read_pkl_gzip('../stack/0223_222_stack_future_amount_pred_fold4_leaves16_CV1897.3342481032632.gz').set_index(key)
# base.set_index(key, inplace=True)
# base['1_pred'] = tmp['prediction']
# amount_pred_cols = [col for col in tmp2.columns if col.count('pred')]
# base[amount_pred_cols] = tmp2[amount_pred_cols]
# base.reset_index(inplace=True)

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)

feature_list = utils.parallel_load_data(path_list=win_path_list)
# ===========================================================================
key = 'SK_ID_CURR'
target = 'TARGET'
ignore_list = [key, target, 'SK_ID_BUREAU', 'SK_ID_PREV']

# ===========================================================================
# DATA LOAD
# ===========================================================================
base = utils.read_df_pkl(path='../input/base_app*')
fname = 'app'
prefix = feat_no + f'{fname}_'

df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')[[
    key, target, 'EXT_SOURCE_2'
]]
train_ir = utils.read_pkl_gzip('../features/4_winner/[email protected]')
test_ir = utils.read_pkl_gzip('../features/4_winner/[email protected]')
ir_mean = np.hstack((train_ir, test_ir))
df['stan_ir_mean@'] = ir_mean

num_split = 9
df['EXT_bin'] = pd.qcut(x=df['EXT_SOURCE_2'], q=num_split)
df['ir_bin'] = pd.qcut(x=df['stan_ir_mean@'], q=num_split)
# Fill NaN after binning: qcut needs numeric input, so the string
# placeholder has to be applied afterwards.
df['stan_ir_mean@'].fillna('ir_nan', inplace=True)

col = f'neighbor{num_split**2}@'
df[col] = df[['EXT_bin', 'ir_bin']].apply(
    lambda x: str(x[0]) + '_' + str(x[1]) if str(x[0]) != str(np.nan) else 'ext_nan',
    axis=1)
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

# Data Load
base = utils.read_df_pkl('../input/base_first*')
path_list = glob.glob('../ensemble/*.gz')
# path = '../stack/0127_120_stack_lgb_lr0.01_349feats_1seed_31leaves_iter3915_OUT0_CV1-139620018388889_LB.gz'
path_1 = '../ensemble/0112_123_stack_lgb_lr0.01_200feats_10seed_iter1121_OUT30.2024_CV3-649256498211181_LB3.687.gz'
path_2 = '../ensemble/0112_084_stack_lgb_lr0.01_200feats_10seed_OUT30.2199_CV3-649046125233803_LB3.687.gz'

#========================================================================
# First Month Group Score
# for ratio_1, ratio_2 in zip(np.arange(0.1, 1.0, 0.1), np.arange(0.9, 0.0, -0.1)):
base['prediction'] = 0
# filename = re.search(r'/([^/.]*).gz', path.replace('.', '-')).group(1)
pred_1 = utils.read_pkl_gzip(path_1)
pred_2 = utils.read_pkl_gzip(path_2)

base.set_index('card_id', inplace=True)
pred_1.set_index('card_id', inplace=True)
pred_2.set_index('card_id', inplace=True)  # both preds must align on card_id
base['pred_1'] = pred_1['prediction']
base['pred_2'] = pred_2['prediction']
base['prediction'] = (base['pred_1'] + base['pred_2']) / 2
base['prediction'] = base['pred_1']
# base['prediction'] = base['pred_2']
base.reset_index(inplace=True)
base = base[~base[target].isnull()]

#========================================================================
# Part of card_id Score
part_score_list = []
# win_path = f'../model/old_201712/*.gz'
try:
    if not logger:
        logger = logger_func()
except NameError:
    logger = logger_func()

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

#========================================================================
# Data Load
base = utils.read_df_pkl('../input/base_first*')
# fm201712_all = utils.read_pkl_gzip('../model/201712/stack/0126_0933_elo_first_month201712_all_dist_all_03_stack_1seed_lr0-02_round75000_CV3-6547.gz')
# fm201712_org = utils.read_pkl_gzip('../model/201712/stack/0126_0933_elo_first_month201712_org0_dist_all_03_stack_1seed_lr0-02_round75000_CV3-7252.gz')
fm201712 = utils.read_pkl_gzip(
    '../stack/0127_184_stack_no_lgb_lr0.02_128feats_10seed_31leaves_iter1107_FAM2017-12_FAMS1-16326_CV1-2217668492567508_LB.gz'
).set_index(key)
fm201711 = utils.read_pkl_gzip(
    '../stack/0127_184_stack_no_lgb_lr0.02_150feats_10seed_31leaves_iter994_FAM2017-11_FAMS1-32748_CV1-3777250084934298_LB.gz'
).set_index(key)
fm201710 = utils.read_pkl_gzip(
    '../stack/0127_185_stack_no_lgb_lr0.02_126feats_10seed_31leaves_iter493_FAM2017-10_FAMS1-74594_CV1-7751920449648786_LB.gz'
).set_index(key)
#========================================================================

base = base[base[target].isnull()]
base.set_index(key, inplace=True)
base['pred_17-12'] = fm201712['pred_mean']
base['pred_17-11'] = fm201711['pred_mean']
base['pred_17-10'] = fm201710['pred_mean']
LGBM = lgb_ex(logger=logger,
              metric=metric,
              model_type=model_type,
              ignore_list=ignore_list)
LGBM.seed = seed
cv_score_list = []
no_update_cnt = 0

if len(path) > 0:
    used_path.append(path)
    valid_feat = utils.get_filename(path=path, delimiter='gz')
    # Add the feature under validation to the dataset
    try:
        train[valid_feat] = utils.read_pkl_gzip(path)[:len(base_train)]
    except FileNotFoundError:
        continue
    except ValueError:
        continue
else:
    valid_feat = 'base'
    path = 'base_path'

# Narrow down the ids
train.sort_index(axis=1, inplace=True)

logger.info(f'''
#========================================================================
# No: {i}/{len(valid_feat_list)-1}
# Valid Feature: {valid_feat}
pl_length = 0
train_latest_id_list = np.load(f'../input/card_id_train_first_active_2017{fm_feat_pl[:2]}.npy')
test_latest_id_list = np.load(f'../input/card_id_test_first_active_2017{fm_feat_pl[:2]}.npy')

#========================================================================
# card_id list by first active month
try:
    if int(fm_feat_pl[:2]) > 0:
        first_month = f'2017-{fm_feat_pl[:2]}'

        if fm_feat_pl[-2:] == 'pl':
            pred_path = glob.glob(f'../model/2017{fm_feat_pl[:2]}/stack/*org0_*')[0]
            pred_col = 'pred'
            pred_feat = utils.read_pkl_gzip(pred_path)
            train[pred_col] = pred_feat[:len(train)]
            train.loc[~train[key].isin(train_latest_id_list), target] = train.loc[
                ~train[key].isin(train_latest_id_list), pred_col]
            tmp_test = test.copy()
            tmp_test[target] = pred_feat[len(train):]

            # When first_active_month is earlier than 2017-12, drop rows with a
            # future first_active_month from the training dataset.
            if int(fm_feat_pl[:2]) < 12:
                base = base[base['first_active_month'] <= f'2017-{fm_feat_pl[:2]}']
                train = train.merge(base[key].to_frame(), how='inner', on=key)
                test = test.merge(base[key].to_frame(), how='inner', on=key)
                tmp_test = tmp_test.merge(base[key].to_frame(), how='inner', on=key)

            train = pd.concat([train, tmp_test], axis=0, ignore_index=True).drop(pred_col, axis=1)
path_list = glob.glob('../stack/*.gz')
import pickle
import datetime
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

blend_path = glob.glob('../ensemble/*.csv')
blend_list = []
for path in blend_path:
    elem = pd.read_csv(path)
    blend_list.append(elem.copy())

blending = np.zeros(len(elem))
for elem in blend_list:
    pred = elem['target']
    blending += pred
blending /= len(blend_list)

submit = pd.read_csv('../input/sample_submission.csv')
submit['target'] = blending

clf = utils.read_pkl_gzip(
    '../stack/0112_155_outlier_classify_9seed_lgb_binary_CV0-9047260065151934_200features.gz'
)
clf = clf.iloc[-len(submit):].reset_index(drop=True)
submit.loc[clf.prediction > 0.45, 'target'] = -33.1

submit.to_csv(
    f'../submit/{start_time[4:12]}_elo_{len(blend_list)}blender_outlier_clf0.45_postprocessing.csv',
    index=False)
                                 key=key,
                                 target=target,
                                 fold_type=fold_type,
                                 fold=fold,
                                 group_col_name=group_col_name,
                                 params=params,
                                 num_boost_round=num_boost_round,
                                 early_stopping_rounds=early_stopping_rounds,
                                 oof_flg=oof_flg)
else:
    import lightgbm as lgb
    from sklearn.model_selection import StratifiedKFold, train_test_split
    from sklearn.metrics import log_loss, confusion_matrix, f1_score, accuracy_score

    # Dataset Setting
    train_ = utils.read_pkl_gzip('../py/train_tfidf.gz')
    test_ = utils.read_pkl_gzip('../py/test_tfidf.gz')

    from scipy.sparse import hstack, csr_matrix
    y = train[target]
    prediction = np.array([])
    train = hstack((csr_matrix(train.drop(['qid', target], axis=1)), train_))
    test = hstack((csr_matrix(test.drop(['qid', target], axis=1)), test_))

    # KFold
    if fold_type == 'stratified':
        folds = StratifiedKFold(n_splits=fold, shuffle=True, random_state=seed)  # 1
        kfold = folds.split(train, y)

    for n_fold, (trn_idx, val_idx) in enumerate(kfold):
        x_train, x_val, y_train, y_val = train_test_split(train,