Example #1
def get_oof_feature(oof_path='../oof_feature/*.gz', key='', pred_col='prediction'):
    feat_path_list = glob.glob(oof_path)
    oof_list = []
    for path in feat_path_list:
        oof = utils.read_pkl_gzip(path)
        oof_name = oof.columns.tolist()[1]
        oof = oof.set_index(key)[pred_col]
        oof.name = "oof_" + oof_name
        oof_list.append(oof)
    df_oof = pd.concat(oof_list, axis=1)
    return df_oof
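A minimal usage sketch (the card_id key value is an assumption; the function requires key to be passed explicitly):

# Gather every OOF prediction into one frame indexed by the join key
df_oof = get_oof_feature(oof_path='../oof_feature/*.gz', key='card_id')
print(df_oof.shape)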
Example #2
dfx = dfx.merge(dd2)

dfx['predswo'] = dfx['predswo1']*0.5 + dfx['predswo2']*0.5

dfx['errorwo'] = (dfx['target'] - dfx['predswo']) ** 2
dfx['errorwo1'] = (dfx['target'] - dfx['predswo1']) ** 2
dfx['errorwo2'] = (dfx['target'] - dfx['predswo2']) ** 2

#  ft = pd.read_pickle('ft_refminmax5.pkl')
#  print(ft['type'].value_counts())
#  dfx = dfx.merge(ft, how='left')
#print(dfx)

ft = pd.read_csv('../input/card_ids_grouping.csv')[['card_id','type']]
#  ft2 = pd.read_csv('0219_go_elo_classifier_pred_NoOutlierFlg.csv')[['card_id','no_out_flg','clf_pred']]
ft2 = utils.read_pkl_gzip('../input/base_no_out_clf.gz')[['card_id','no_out_flg','clf_pred']]
dfx = dfx.merge(ft, how='left').merge(ft2, how='left')
#print(dfx)

dfx['targeto'] = dfx['target'].apply(lambda x: 1 if x < -20 else 0)
print(dfx.groupby(['no_out_flg','type'])['targeto'].agg(['mean','sum','size']).reset_index())

print('error preds',rmse(dfx['target'],dfx['preds']))
print('error wo1   ',rmse(dfx['target'],dfx['predswo1']))
print('error wo2   ',rmse(dfx['target'],dfx['predswo2']))
print('error wo   ',rmse(dfx['target'],dfx['predswo']))

sel = (dfx['type'] == 0)
#dfx.loc[sel, 'preds'] = (dfx.loc[sel, 'predsmo']*0.2 + dfx.loc[sel, 'preds']*0.8)
sel = (dfx['type'] == 2)
dfx.loc[sel, 'preds'] = (dfx.loc[sel, 'predswo']*0.65 + dfx.loc[sel, 'preds']*0.35)
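rmse is used above but never defined in the snippet; a minimal stand-in, assuming it wraps scikit-learn's metric:

import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    # root mean squared error, the competition metric
    return np.sqrt(mean_squared_error(y_true, y_pred))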
train_feature_path = use_feature10_path + use_feature90_path

test_feature_path = []
for path in train_feature_path:
    test_feature_path.append(path.replace('train', 'test'))

train_feature_list = utils.pararell_load_data(path_list=train_feature_path)
test_feature_list = utils.pararell_load_data(path_list=test_feature_path)
train = pd.concat(train_feature_list, axis=1)
train = pd.concat([base_train, train], axis=1)
test = pd.concat(test_feature_list, axis=1)
test = pd.concat([base_test, test], axis=1)

if i % 10 == 0:
    outlier_pred = utils.read_pkl_gzip(
        '../stack/1204_211_outlier_classify_lgb_auc0-8952469653357074_227features.gz'
    ).set_index(key)
    train['outlier_pred@'] = outlier_pred.loc[train_id, 'prediction'].values
    test['outlier_pred@'] = outlier_pred.loc[test_id, 'prediction'].values

# =====================================================================

# ========================================================================
# LGBM Setting
metric = 'rmse'
fold = 5
fold_type = 'kfold'
group_col_name = ''
dummie = 1
oof_flg = True
LGBM = lgb_ex(logger=logger,
Example #4
    utils.to_pkl_gzip(obj=df_tfidf, path='./df_tfidf')

# Load Text List
train_text_list = list(train[qt].values)
test_text_list = list(test[qt].values)

# Cleanse the text in parallel
logger.info("Cleansing Text...")
def pararell_cleansing(tx):
    return cleansing_text(tx)
train_text_list = pararell_process(pararell_cleansing, train_text_list)
test_text_list = pararell_process(pararell_cleansing, test_text_list)
text_list = train_text_list + test_text_list
# TFIDF
get_tfidf(text_list)
df_tfidf = utils.read_pkl_gzip(path='./df_tfidf.gz')


if is_svd:
    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=100, random_state=1208)
    svd_tfidf = svd.fit_transform(df_tfidf)
    col_names = [f"{feat_no}_svd100_tfidf100k_{i}@" for i in range(100)]
    df_svd = pd.DataFrame(svd_tfidf, columns=col_names)
    train_idx = train.index
    test_idx = test.index
    svd_train = df_svd.loc[train_idx, :]
    svd_test = df_svd.loc[test_idx, :]
    print(svd_train.shape)
    print(svd_test.shape)
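get_tfidf is only called above; a plausible sketch of its body, assuming a scikit-learn vectorizer capped at 100k terms (to match the tfidf100k column names) and the ./df_tfidf dump seen at the top of the example:

from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf(text_list):
    # Fit TF-IDF over the combined train+test text and persist the matrix
    vectorizer = TfidfVectorizer(max_features=100000)
    df_tfidf = vectorizer.fit_transform(text_list)
    utils.to_pkl_gzip(obj=df_tfidf, path='./df_tfidf')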
Example #5
if is_num:
    train_word_sequences = np.hstack((train_word_sequences, num_train.values))
    test_word_sequences = np.hstack((test_word_sequences, num_test.values))

print(
    f"Train: {train_word_sequences.shape} | Test: {test_word_sequences.shape}")
print(train_word_sequences[:1])
print(test_word_sequences[:1])
#========================================================================

#========================================================================
# Make Validation
seed = 1208
fold_n = 5
base = utils.read_df_pkl('../input/base_group*')
vi = utils.read_pkl_gzip('../input/f000_AvSigVersion.gz')
vi_col = 'f000_AvSigVersion'
base[vi_col] = vi
base_train = base[~base[target].isnull()]
base_test = base[base[target].isnull()]

base_train.sort_values(vi_col, inplace=True)

if is_debug:
    base_train = base_train[[key, target]].head(10000)
    base_test = base_test[[key, target]].head(1000)
else:
    base_train = base_train[[key, target]]
    base_test = base_test[[key, target]]

from sklearn.model_selection import KFold
# length = len(train)
# for col in train.columns:
#     tmp = train[col].dropna().shape[0]
#     if length - tmp>0:
#         print(col)

#     inf_max = train[col].max()
#     inf_min = train[col].min()
#     if inf_max==np.inf or inf_min==-np.inf:
#         print(col, inf_max, inf_min)
# #========================================================================

#========================================================================
# Prepare the CV folds
fold = 6
kfold = utils.read_pkl_gzip('../input/ods_kfold.gz')
use_cols = [col for col in train.columns if col not in ignore_list]
scaler = StandardScaler()

# For some reason the first attempt sometimes doesn't finish, so handle it below.
try:
    scaler.fit(pd.concat([train[use_cols], test[use_cols]]))
except ValueError:
    inf_col_list = []
    for col in use_cols:

        inf_max = train[col].max()
        inf_min = train[col].min()
        if inf_max == np.inf or inf_min == -np.inf:
            inf_col_list.append(col)
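    # (sketch, not in the original snippet) a typical follow-up: null out the
    # inf columns and retry the fit
    train[inf_col_list] = train[inf_col_list].replace([np.inf, -np.inf], np.nan)
    test[inf_col_list] = test[inf_col_list].replace([np.inf, -np.inf], np.nan)
    scaler.fit(pd.concat([train[use_cols], test[use_cols]]))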
    LGBM = lgb_ex(logger=logger,
                  metric=metric,
                  model_type=model_type,
                  ignore_list=ignore_list)
    LGBM.seed = seed
    #========================================================================

    if len(path[0]) > 0:
        train_path = path[0]
        test_path = path[1]
        train_feat = utils.get_filename(path=train_path, delimiter='gz')
        train_feat = train_feat[14:]
        test_feat = utils.get_filename(path=test_path, delimiter='gz')
        test_feat = test_feat[13:]

        train[train_feat] = utils.read_pkl_gzip(train_path)
        test[train_feat] = utils.read_pkl_gzip(test_path)
    else:
        train_feat = 'base'

    logger.info(f'''
    #========================================================================
    # No: {i}/{len(train_feat_list)}
    # Valid Feature: {train_feat}
    #========================================================================'''
                )

    #========================================================================
    # Train & Prediction Start
    #========================================================================
    LGBM = LGBM.cross_prediction(train=train,
Example #8
    if debug:
        train = train.head(10000)
        test = test.head(1000)

    for col in train.columns:
        if col in ignore_list:
            continue
        train[col] = utils.impute_feature(df=train, col=col)
        test[col] = utils.impute_feature(df=test, col=col)

    return train, test
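utils.impute_feature is opaque here; a minimal median-fill stand-in (an assumption, not the repo's actual implementation):

import numpy as np

def impute_feature(df, col):
    # Replace infs with NaN, then fill NaN with the column median
    s = df[col].replace([np.inf, -np.inf], np.nan)
    return s.fillna(s.median())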


model_no = 0
base = utils.read_pkl_gzip('../input/base_type_group.gz')[[key, target]]
base_train, base_test = get_dataset(base, model_no)

#========================================================================
# Make Dataset
pred_col = 'prediction'
valid_type = 'ods'
set_type = 'all'
#========================================================================

#========================================================================
# Prepare the CV folds
fold_seed = 328
fold = 6

#========================================================================
ignore_list = [
    key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term',
    'no_out_flg',
    'clf_pred'
]
# ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index', 'personal_term']
use_cols = [col for col in train.columns if col not in ignore_list]
scaler = StandardScaler()
scaler.fit(pd.concat([train[use_cols], test[use_cols]]))
x_test = scaler.transform(test[use_cols])

if out_part == 'no_out':
    train = train[train[target] > -30]

kfold_path = f'../input/kfold_{valid_type}_{out_part}_fold{fold}_seed{fold_seed}.gz'
if os.path.exists(kfold_path):
    kfold = utils.read_pkl_gzip(kfold_path)
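else:
    # (sketch, not in the original snippet) build and cache the folds when the
    # file is missing; plain KFold is assumed as a stand-in for the 'ods' split
    from sklearn.model_selection import KFold
    kfold = list(KFold(n_splits=fold, shuffle=True,
                       random_state=fold_seed).split(train))
    utils.to_pkl_gzip(obj=kfold, path=kfold_path)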
Y = train[target]
# ========================================================================

print(f"Train: {train.shape} | Test: {test.shape}")

# ========================================================================
# Model Setting
params = {}


def select_model(model_type, seed=1208):
    if model_type == 'ridge':
        params['solver'] = 'auto'
        params['fit_intercept'] = True
        params['alpha'] = 0.4
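The ridge branch is cut off here; a hedged sketch of how it might finish (the return shape is an assumption):

from sklearn.linear_model import Ridge

def select_model(model_type, seed=1208):
    if model_type == 'ridge':
        params = {'solver': 'auto', 'fit_intercept': True, 'alpha': 0.4}
        # build the estimator from the params assembled above
        return Ridge(random_state=seed, **params)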
Example #10
target = 'TARGET'
ignore_list = [key, target, 'SK_ID_BUREAU', 'SK_ID_PREV']

# ===========================================================================
# DATA LOAD
# ===========================================================================
base = utils.read_df_pkl(path='../input/base_app*')
fname = 'app'
prefix = f'{feat_no}{fname}_'
df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')

train = df[~df[target].isnull()]
test = df[df[target].isnull()]

neighbor = '110_app_neighbor81@'
train[neighbor] = utils.read_pkl_gzip('../input/[email protected]')
test[neighbor] = utils.read_pkl_gzip('../input/[email protected]')
combi = [neighbor, cat]
cat_list = get_categorical_features(df=df, ignore_list=ignore_list)

#========================================================================
# TARGET ENCODING
#========================================================================
for cat in cat_list:
    combi = cat
    feat_train, feat_test = target_encoding(logger=logger,
                                            train=train,
                                            test=test,
                                            key=key,
                                            level=combi,
                                            target=target,
base_test = base[base[target].isnull()].reset_index(drop=True)
feature_list = utils.parallel_load_data(path_list=win_path_list)
df_feat = pd.concat(feature_list, axis=1)
train = pd.concat([base_train, df_feat.iloc[:len(base_train), :]], axis=1)
test = pd.concat([base_test, df_feat.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)
train_test = pd.concat([train, test], axis=0)

#========================================================================
# card_id list by first active month
try:
    sys.argv[5]  # raises IndexError unless a 5th CLI argument was given
    train_latest_id_list = np.load('../input/card_id_train_first_active_201712.npy')
    test_latest_id_list = np.load('../input/card_id_test_first_active_201712.npy')
    train = train.loc[train[key].isin(train_latest_id_list), :].reset_index(drop=True)
    test = test.loc[test[key].isin(test_latest_id_list), :].reset_index(drop=True)
    submit = []
except IndexError:
    pass
#========================================================================

model_list = utils.read_pkl_gzip('../model/201712/0122_elo_first_month201712_10seed_fold_model_list.gz')
use_cols = pd.read_csv('../model/201712/0122_elo_first_month201712_fold_model_use_cols.csv').values.reshape(-1,)

pred = np.zeros(len(train_test))
for model in model_list:
    pred += model.predict(train_test[use_cols])
pred /= len(model_list)

feature_name = '014_l02_elo_first_month201712_prediction'
utils.to_pkl_gzip(obj=pred, path=f'../features/5_tmp/{feature_name}')
#========================================================================
# Result
cv_score = np.mean(cv_list)
iter_avg = int(np.mean(iter_list))
#========================================================================

logger.info(f'''
#========================================================================
# {len(seed_list)}SEED CV SCORE AVG: {cv_score}
#========================================================================''')

#========================================================================
# Part of card_id Score
#  bench = pd.read_csv('../input/bench_LB3684_FAM_cv_score.csv')
bench = utils.read_pkl_gzip(
    '../stack/0206_125_stack_lgb_lr0.01_235feats_10seed_70leaves_iter1164_OUT29.8269_CV3-6215750935280235_LB.gz'
)[[key, 'pred_mean']].rename(columns={'pred_mean': 'bench_pred'})
df_pred = df_pred.merge(bench, on=key, how='inner')
part_score_list = []
part_N_list = []
fam_list = []
base_train['first_active_month'] = base_train['first_active_month'].map(
    lambda x: str(x)[:7])

for i in range(201501, 201713, 1):
    fam = str(i)[:4] + '-' + str(i)[-2:]
    df_part = base_train[base_train['first_active_month'] == fam]
    if len(df_part) < 1:
        continue
    part_id_list = df_part[key].values
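The loop is truncated here; a hedged completion, assuming df_pred carries 'target' and 'pred_mean' columns for the per-month score:

    part = df_pred[df_pred[key].isin(part_id_list)]
    part_score_list.append(rmse(part['target'], part['pred_mean']))
    part_N_list.append(len(part))
    fam_list.append(fam)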
Example #13
#========================================================================
# Data Load

win_path = f'../features/4_winner/*.gz'
model_path_list = [
    f'../model/LB3670_70leaves_colsam0322/*.gz', '../model/E2_lift_set/*.gz',
    '../model/E3_PCA_set/*.gz', '../model/E4_mix_set/*.gz'
]

model_path = model_path_list[model_no]

win_path_list = glob.glob(model_path)
#  win_path_list = glob.glob(model_path) + glob.glob(win_path) + glob.glob('../features/5_tmp/*.gz')

base = utils.read_pkl_gzip('../input/base_no_out_clf.gz')[[
    key, target, col_term, 'first_active_month', no_flg, 'clf_pred'
]]
#  base = utils.read_df_pkl('../input/base_term*')[[key, target, col_term, 'first_active_month']]
base[col_term] = base[col_term].map(
    lambda x: 24 if x >= 19
    else 18 if 16 <= x <= 18
    else 15 if 13 <= x <= 15
    else 12 if 9 <= x <= 12
    else 8 if 6 <= x <= 8
    else 5 if x == 5
    else 4)
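#  Equivalent, arguably clearer bucketing via pd.cut (a sketch, not in the
#  original; assumes integer col_term values):
#  base[col_term] = pd.cut(base[col_term],
#                          bins=[-np.inf, 4, 5, 8, 12, 15, 18, np.inf],
#                          labels=[4, 5, 8, 12, 15, 18, 24]).astype(int)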
#  nn_stack_plus = utils.read_pkl_gzip('../ensemble/NN_ensemble/0213_142_elo_NN_stack_E1_row99239_outpart-all_235feat_const1_lr0.001_batch128_epoch30_CV1.2724309982670599.gz')[[key, 'prediction']].set_index(key)
#  nn_stack_minus = utils.read_pkl_gzip('../ensemble/NN_ensemble/0213_145_elo_NN_stack_E1_row104308_outpart-all_235feat_const1_lr0.001_batch128_epoch30_CV4.864183650939903.gz')[[key, 'prediction']].set_index(key)
#  base.set_index(key, inplace=True)
#  base['nn_plus'] = nn_stack_plus['prediction']
#  base['nn_minus'] = nn_stack_minus['prediction']
#  base.reset_index(inplace=True)

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
HOME = os.path.expanduser("~")
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils

path_list = glob.glob('../features/1_first_valid/*.gz')

key = ''
old_key = '_pts_'
new_key = '_pst_'

for path in path_list:

    if (path.count(key)):
        #  if not(path.count(key)):
        feature = utils.read_pkl_gzip(path)

        # str.replace swaps every occurrence, so a single '.gz' strip suffices
        rename_path = path.replace(old_key, new_key).replace('.gz', '')
        utils.to_pkl_gzip(obj=feature, path=rename_path)
        os.system(f'rm {path}')

    else:
        feature = utils.read_pkl_gzip(path)
        rename_path = path.replace('.gz', '')
        utils.to_pkl_gzip(obj=feature, path=rename_path)
        os.system(f'rm {path}')
train = pd.concat([base_train, df_feat.iloc[:len(base_train), :]], axis=1)
test = pd.concat(
    [base_test, df_feat.iloc[len(base_train):, :].reset_index(drop=True)],
    axis=1)

self_predict = train.copy()

y = train[target].values

#========================================================================
# Outlier Setting
if out_part == 'part':
    # Exclude Difficult Outlier
    #  clf_result = utils.read_pkl_gzip('../stack/0111_145_outlier_classify_9seed_lgb_binary_CV0-9045939277654236_188features.gz')[[key, 'prediction']]
    clf_result = utils.read_pkl_gzip(
        '../stack/0130_214_outlier_classify_9seed_lgb_binary_CV0-9044513544501314_172features.gz'
    )[[key, 'pred_mean']]
    train = train.merge(clf_result, how='inner', on=key)
    #  tmp1 = train[train.prediction>0.01]
    #  tmp2 = train[train.prediction<0.01][train.target>-30]
    tmp1 = train[train.pred_mean > 0.01]
    tmp2 = train[(train.pred_mean < 0.01) & (train.target > -30)]
    train = pd.concat([tmp1, tmp2], axis=0, ignore_index=True)
    del tmp1, tmp2
    gc.collect()
    #  train.drop('prediction', axis=1, inplace=True)
    train.drop('pred_mean', axis=1, inplace=True)
elif out_part == 'all':
    #  Exclude Outlier
    train = train[train.target > -30]
Example #16
    params['subsample'] = 0.9
    params['colsample_bytree'] = 0.3
    params['min_child_samples'] = 30
try:
    colsample_bytree = float(sys.argv[8])
    params['colsample_bytree'] = colsample_bytree
except IndexError:
    colsample_bytree = params['colsample_bytree']

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

#========================================================================
# Data Load

base = utils.read_pkl_gzip('../input/base_type_group.gz')[[
    key, target, col_term, 'first_active_month', no_flg, 'clf_pred', 'group'
]]
gr_col = 'group'

#  tmp = utils.read_pkl_gzip('../stack/0223_222_stack_future_amount_pred_fold4_leaves16_AUC_CV0.2999066985403468.gz').set_index(key)
#  tmp2 = utils.read_pkl_gzip('../stack/0223_222_stack_future_amount_pred_fold4_leaves16_CV1897.3342481032632.gz').set_index(key)
#  base.set_index(key, inplace=True)
#  base['1_pred'] = tmp['prediction']
#  amount_pred_cols = [col for col in tmp2.columns if col.count('pred')]
#  base[amount_pred_cols] = tmp2[amount_pred_cols]
#  base.reset_index(inplace=True)

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)

feature_list = utils.parallel_load_data(path_list=win_path_list)
Example #17
# ===========================================================================
key = 'SK_ID_CURR'
target = 'TARGET'
ignore_list = [key, target, 'SK_ID_BUREAU', 'SK_ID_PREV']

# ===========================================================================
# DATA LOAD
# ===========================================================================
base = utils.read_df_pkl(path='../input/base_app*')
fname = 'app'
prefix = feat_no + f'{fname}_'
df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')[[
    key, target, 'EXT_SOURCE_2'
]]

train_ir = utils.read_pkl_gzip('../features/4_winner/[email protected]')
test_ir = utils.read_pkl_gzip('../features/4_winner/[email protected]')
ir_mean = np.hstack((train_ir, test_ir))
df['stan_ir_mean@'] = ir_mean
df['stan_ir_mean@'].fillna('ir_nan', inplace=True)

num_split = 9
df['EXT_bin'] = pd.qcut(x=df['EXT_SOURCE_2'], q=num_split)
df['ir_bin'] = pd.qcut(x=df['stan_ir_mean@'], q=num_split)

col = f'neighbor{num_split**2}@'
df[col] = df[['EXT_bin',
              'ir_bin']].apply(lambda x: str(x[0]) + '_' + str(x[1])
                               if str(x[0]) != str(np.nan) else 'ext_nan',
                               axis=1)
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

# Data Load
base = utils.read_df_pkl('../input/base_first*')
path_list = glob.glob('../ensemble/*.gz')
#  path = '../stack/0127_120_stack_lgb_lr0.01_349feats_1seed_31leaves_iter3915_OUT0_CV1-139620018388889_LB.gz'
path_1 = '../ensemble/0112_123_stack_lgb_lr0.01_200feats_10seed_iter1121_OUT30.2024_CV3-649256498211181_LB3.687.gz'
path_2 = '../ensemble/0112_084_stack_lgb_lr0.01_200feats_10seed_OUT30.2199_CV3-649046125233803_LB3.687.gz'

#========================================================================
# First Month Group Score
#  for ratio_1, ratio_2 in zip(np.arange(0.1, 1.0, 0.1), np.arange(0.9, 0.0, -0.1)):
base['prediction'] = 0
#  filename = re.search(r'/([^/.]*).gz', path.replace('.', '-')).group(1)
pred_1 = utils.read_pkl_gzip(path_1)
pred_2 = utils.read_pkl_gzip(path_2)

base.set_index('card_id', inplace=True)
pred_1.set_index('card_id', inplace=True)
pred_2.set_index('card_id', inplace=True)
base['pred_1'] = pred_1['prediction']
base['pred_2'] = pred_2['prediction']
base['prediction'] = (base['pred_1'] + base['pred_2']) / 2
base['prediction'] = base['pred_1']
#  base['prediction'] = base['pred_2']
base.reset_index(inplace=True)
base = base[~base[target].isnull()]

#========================================================================
# Part of card_id Score
part_score_list = []
#  win_path = f'../model/old_201712/*.gz'
try:
    if not logger:
        logger = logger_func()
except NameError:
    logger = logger_func()

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

#========================================================================
# Data Load
base = utils.read_df_pkl('../input/base_first*')
#  fm201712_all = utils.read_pkl_gzip('../model/201712/stack/0126_0933_elo_first_month201712_all_dist_all_03_stack_1seed_lr0-02_round75000_CV3-6547.gz')
#  fm201712_org = utils.read_pkl_gzip('../model/201712/stack/0126_0933_elo_first_month201712_org0_dist_all_03_stack_1seed_lr0-02_round75000_CV3-7252.gz')
fm201712 = utils.read_pkl_gzip(
    '../stack/0127_184_stack_no_lgb_lr0.02_128feats_10seed_31leaves_iter1107_FAM2017-12_FAMS1-16326_CV1-2217668492567508_LB.gz'
).set_index(key)
fm201711 = utils.read_pkl_gzip(
    '../stack/0127_184_stack_no_lgb_lr0.02_150feats_10seed_31leaves_iter994_FAM2017-11_FAMS1-32748_CV1-3777250084934298_LB.gz'
).set_index(key)
fm201710 = utils.read_pkl_gzip(
    '../stack/0127_185_stack_no_lgb_lr0.02_126feats_10seed_31leaves_iter493_FAM2017-10_FAMS1-74594_CV1-7751920449648786_LB.gz'
).set_index(key)
#========================================================================

base = base[base[target].isnull()]
base.set_index(key, inplace=True)
base['pred_17-12'] = fm201712['pred_mean']
base['pred_17-11'] = fm201711['pred_mean']
base['pred_17-10'] = fm201710['pred_mean']
        LGBM = lgb_ex(logger=logger,
                      metric=metric,
                      model_type=model_type,
                      ignore_list=ignore_list)
        LGBM.seed = seed
        cv_score_list = []
        no_update_cnt = 0

        if len(path) > 0:
            used_path.append(path)  # path is a single string; += list(path) would append its characters
            valid_feat = utils.get_filename(path=path, delimiter='gz')

            # Add the feature under validation to the dataset
            try:
                train[valid_feat] = utils.read_pkl_gzip(path)[:len(base_train)]
            except FileNotFoundError:
                continue
            except ValueError:
                continue
        else:
            valid_feat = 'base'
            path = 'base_path'

        # Narrow down the ids
        train.sort_index(axis=1, inplace=True)

        logger.info(f'''
#========================================================================
# No: {i}/{len(valid_feat_list)-1}
# Valid Feature: {valid_feat}
Example #21
pl_length = 0


train_latest_id_list = np.load(f'../input/card_id_train_first_active_2017{fm_feat_pl[:2]}.npy')
test_latest_id_list = np.load(f'../input/card_id_test_first_active_2017{fm_feat_pl[:2]}.npy')

#========================================================================
# card_id list by first active month
try:
    if int(fm_feat_pl[:2])>0:
        first_month = f'2017-{fm_feat_pl[:2]}'

        if fm_feat_pl[-2:]=='pl':
            pred_path = glob.glob(f'../model/2017{fm_feat_pl[:2]}/stack/*org0_*')[0]
            pred_col = 'pred'
            pred_feat = utils.read_pkl_gzip(pred_path)
            train[pred_col] = pred_feat[:len(train)]
            train.loc[~train[key].isin(train_latest_id_list), target] = train.loc[~train[key].isin(train_latest_id_list), pred_col]

            tmp_test = test.copy()
            tmp_test[target] = pred_feat[len(train):]

            # If first_active_month is before 2017-12, drop rows with a future first_active_month from the training set
            if int(fm_feat_pl[:2])<12:
                base = base[base['first_active_month'] <= f'2017-{fm_feat_pl[:2]}']
                train = train.merge(base[key].to_frame(), how='inner', on=key)
                test = test.merge(base[key].to_frame(), how='inner', on=key)
                tmp_test = tmp_test.merge(base[key].to_frame(), how='inner', on=key)


            train = pd.concat([train, tmp_test], axis=0, ignore_index=True).drop(pred_col, axis=1)
Example #22
path_list = glob.glob('../stack/*.gz')
import pickle
import datetime
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

blend_path = glob.glob('../ensemble/*.csv')
blend_list = []

for path in blend_path:
    elem = pd.read_csv(path)
    blend_list.append(elem.copy())

blending = np.zeros(len(blend_list[0]))  # all blend frames share the same length
for elem in blend_list:
    pred = elem['target']
    blending += pred
blending /= len(blend_list)

submit = pd.read_csv('../input/sample_submission.csv')
submit['target'] = blending

clf = utils.read_pkl_gzip(
    '../stack/0112_155_outlier_classify_9seed_lgb_binary_CV0-9047260065151934_200features.gz'
)
clf = clf.iloc[-len(submit):, ].reset_index(drop=True)
submit.loc[clf.prediction > 0.45, 'target'] = -33.1

submit.to_csv(
    f'../submit/{start_time[4:12]}_elo_{len(blend_list)}blender_outlier_clf0.45_postprocessing.csv',
    index=False)
Example #23
                                 key=key,
                                 target=target,
                                 fold_type=fold_type,
                                 fold=fold,
                                 group_col_name=group_col_name,
                                 params=params,
                                 num_boost_round=num_boost_round,
                                 early_stopping_rounds=early_stopping_rounds,
                                 oof_flg=oof_flg)
else:
    import lightgbm as lgb
    from sklearn.model_selection import StratifiedKFold, train_test_split
    from sklearn.metrics import log_loss, confusion_matrix, f1_score, accuracy_score

    # Dataset Setting
    train_ = utils.read_pkl_gzip('../py/train_tfidf.gz')
    test_ = utils.read_pkl_gzip('../py/test_tfidf.gz')
    from scipy.sparse import hstack, csr_matrix
    y = train[target]
    prediction = np.array([])
    train = hstack((csr_matrix(train.drop(['qid', target], axis=1)), train_))
    test = hstack((csr_matrix(test.drop(['qid', target], axis=1)), test_))

    ' KFold '
    if fold_type == 'stratified':
        folds = StratifiedKFold(n_splits=fold, shuffle=True,
                                random_state=seed)  # 1
        kfold = folds.split(train, y)
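    # note: when fold_type is not 'stratified', kfold is never assigned above;
    # the original script presumably handles other fold types elsewhere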

    for n_fold, (trn_idx, val_idx) in enumerate(kfold):
        x_train, x_val, y_train, y_val = train_test_split(train,