Example #1
def bq_parallel(query, prefix):
    bq_client = bigquery.Client.from_service_account_json(key_file)
    df = bq_client.query(query).to_dataframe()
    print(f"Result Shape: {df.shape}")

    # prefix = 'f008_big_ins-cur-l3-'
    # prefix = 'f008_big_ins-pre-f3-'
    #  prefix = 'f008_big_ins-pre-l3-'
    base = utils.read_df_pkl('../input/base0*').set_index(key)
    df.set_index(key, inplace=True)
    df = base.join(df)
    utils.save_feature(df_feat=df, ignore_list=ignore_list, is_train=2, prefix=prefix, target=target)
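A minimal usage sketch (not from the original source): the queries and dataset names below are hypothetical, and it assumes the module-level key_file, key, target, and ignore_list globals that bq_parallel relies on are already defined.

from multiprocessing import Pool

# Hypothetical query/prefix pairs mirroring the commented-out prefixes above;
# each worker builds its own BigQuery client inside bq_parallel.
jobs = [
    ("SELECT * FROM `my_project.my_dataset.ins_cur_l3`", "f008_big_ins-cur-l3-"),
    ("SELECT * FROM `my_project.my_dataset.ins_pre_f3`", "f008_big_ins-pre-f3-"),
]
with Pool(processes=len(jobs)) as pool:
    pool.starmap(bq_parallel, jobs)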
Example #2
def clean_prev(pre):
    logger.info(f'''
    #==============================================================================
    # PREV CLEANSING
    #=============================================================================='''
                )

    cash = 'Cash loans'
    revo = 'Revolving loans'
    pre = utils.read_df_pkl(path='../input/previous*.p')
    pre['AMT_CREDIT'] = pre['AMT_CREDIT'].where(pre['AMT_CREDIT'] > 0, np.nan)
    pre['AMT_ANNUITY'] = pre['AMT_ANNUITY'].where(pre['AMT_ANNUITY'] > 0,
                                                  np.nan)
    pre['AMT_APPLICATION'] = pre['AMT_APPLICATION'].where(
        pre['AMT_APPLICATION'] > 0, np.nan)
    pre['CNT_PAYMENT'] = pre['CNT_PAYMENT'].where(pre['CNT_PAYMENT'] > 0,
                                                  np.nan)
    pre['AMT_DOWN_PAYMENT'] = pre['AMT_DOWN_PAYMENT'].where(
        pre['AMT_DOWN_PAYMENT'] > 0, np.nan)
    pre['RATE_DOWN_PAYMENT'] = pre['RATE_DOWN_PAYMENT'].where(
        pre['RATE_DOWN_PAYMENT'] > 0, np.nan)

    pre['DAYS_FIRST_DRAWING'] = pre['DAYS_FIRST_DRAWING'].where(
        pre['DAYS_FIRST_DRAWING'] < 100000, np.nan)
    pre['DAYS_FIRST_DUE'] = pre['DAYS_FIRST_DUE'].where(
        pre['DAYS_FIRST_DUE'] < 100000, np.nan)
    pre['DAYS_LAST_DUE_1ST_VERSION'] = pre['DAYS_LAST_DUE_1ST_VERSION'].where(
        pre['DAYS_LAST_DUE_1ST_VERSION'] < 100000, np.nan)
    pre['DAYS_LAST_DUE'] = pre['DAYS_LAST_DUE'].where(
        pre['DAYS_LAST_DUE'] < 100000, np.nan)
    pre['DAYS_TERMINATION'] = pre['DAYS_TERMINATION'].where(
        pre['DAYS_TERMINATION'] < 100000, np.nan)
    #  pre['SELLERPLACE_AREA']          = pre['SELLERPLACE_AREA'].where(pre['SELLERPLACE_AREA']     <200, 200)

    ignore_list = [
        'SK_ID_CURR', 'SK_ID_PREV', 'NAME_CONTRACT_TYPE',
        'NAME_CONTRACT_STATUS'
    ]
    # revo
    # For Revolving loans, set CNT_PAYMENT and the AMT-type columns to NULL
    #  for col in pre.columns:
    #      if col in ignore_list:
    #          logger.info(f'CONTINUE: {col}')
    #          continue
    #      pre[f'revo_{col}'] = pre[col].where(pre[f'NAME_CONTRACT_TYPE']==revo, np.nan)
    #      pre[col] = pre[col].where(pre[f'NAME_CONTRACT_TYPE']!=revo, np.nan)

    pre['NAME_TYPE_SUITE'].fillna('XNA', inplace=True)
    pre['PRODUCT_COMBINATION'].fillna('XNA', inplace=True)

    pre = utils.to_df_pkl(df=pre, path='../input', fname='clean_prev')
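The cleansing above follows two patterns: non-positive amount/count columns are set to NaN, and the large sentinel in the DAYS columns is set to NaN. A behavior-equivalent sketch of the same block as loops, for reference only:

amt_cols = ['AMT_CREDIT', 'AMT_ANNUITY', 'AMT_APPLICATION', 'CNT_PAYMENT',
            'AMT_DOWN_PAYMENT', 'RATE_DOWN_PAYMENT']
day_cols = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
            'DAYS_LAST_DUE', 'DAYS_TERMINATION']
for col in amt_cols:
    pre[col] = pre[col].where(pre[col] > 0, np.nan)       # non-positive -> NaN
for col in day_cols:
    pre[col] = pre[col].where(pre[col] < 100000, np.nan)  # 365243 sentinel -> NaN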
Example #3
def clean_bureau(bur):
    logger.info(f'''
    #==============================================================================
    # BUREAU CLEANSING
    #=============================================================================='''
                )

    bur = utils.read_df_pkl(path='../input/bureau*.p')
    bur = bur[bur['CREDIT_CURRENCY'] == 'currency 1']
    bur['DAYS_CREDIT_ENDDATE'] = bur['DAYS_CREDIT_ENDDATE'].where(
        bur['DAYS_CREDIT_ENDDATE'] > -36000, np.nan)
    bur['DAYS_ENDDATE_FACT'] = bur['DAYS_ENDDATE_FACT'].where(
        bur['DAYS_ENDDATE_FACT'] > -36000, np.nan)
    bur['DAYS_CREDIT_UPDATE'] = bur['DAYS_CREDIT_UPDATE'].where(
        bur['DAYS_CREDIT_UPDATE'] > -36000, np.nan)
    bur = utils.to_df_pkl(df=bur, path='../input', fname='clean_bureau')
    params['colsample_bytree'] = 0.3
    params['min_child_samples'] = 50

else:
    params['subsample'] = 0.9
    params['colsample_bytree'] = 0.3
    params['min_child_samples'] = 30

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

#========================================================================
# Data Load

tmp_path_list = glob.glob(f'../features/5_tmp/*.gz')

base = utils.read_df_pkl('../input/base_term*0*')[[key, target, col_term]]
base[col_term] = base[col_term].map(
    lambda x: 4 if x <= 4 else
              5 if x <= 5 else
              6 if 6 <= x <= 8 else
              9 if 9 <= x <= 12 else
              15 if 13 <= x <= 15 else
              18 if 16 <= x <= 18 else
              24 if 19 <= x else x)
print(base[col_term].value_counts())
sys.exit()
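The nested conditional above is a fixed-edge binning of col_term; a hedged equivalent using pd.cut (assuming integer month terms, and that pandas/numpy are imported as pd/np elsewhere in this script):

# Same buckets as the lambda: <=4, 5, 6-8, 9-12, 13-15, 16-18, >=19; NaN stays NaN.
bins = [-np.inf, 4, 5, 8, 12, 15, 18, np.inf]
labels = [4, 5, 6, 9, 15, 18, 24]
base[col_term] = pd.cut(base[col_term], bins=bins, labels=labels).astype(float)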

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)

win_path = sys.argv[1]
win_path = f'../features/4_winner/*.gz'
win_path = f'../model/LB3670_70leaves_colsam0322/*.gz'
# win_path_list = glob.glob(win_path) + tmp_path_list
win_path_list = glob.glob(win_path)
Example #5
#          value = np.random.choice(a=value, size=val_len)
#      except ValueError:
#          pass
#      df_dict[uid] = " ".join(value)
#      return df_dict
#  tmp_dict = {}
#  p_list = pararell_process(pararell_val_join, tmp.items())
#  [tmp_dict.update(p) for p in p_list]
#  tmp_train = pd.Series(tmp_dict).to_frame()
#  tmp_train = tmp_train.join(train_df.set_index('qid'))
#  tmp_train.rename(columns={0:qt}, inplace=True)

#  utils.to_df_pkl(df=tmp_train, path='../input/', fname='wn_bagging_train')
#  sys.exit()

tmp_train = utils.read_df_pkl(path='../input/wn_bagging_train*.p')
tmp_train = tmp_train.head(50000)

## split to train and val
train_df, val_df = train_test_split(tmp_train, test_size=0.2, random_state=seed)

## some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

## fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
# test_X = test_df["question_text"].fillna("_na_").values
Example #6
argv[2]: feature_key
"""
# Basic Args
seed = 1208
set_type = 'all'
fold_n = 4
key, raw_target, ignore_list = MS_utils.get_basic_var()
ignore_list = [key, raw_target]
comment = sys.argv[1]

# Base
vi_col = 'f000_AvSigVersion'
base_path = '../input/base_exclude*'
base_path = '../input/base_Av*'
#  base_path = '../input/base_group*'
base = utils.read_df_pkl(base_path)

#========================================================================
# Make Validation
cv = KFold(n_splits=fold_n, shuffle=False)  # random_state has no effect (and newer sklearn rejects it) when shuffle=False
if is_debug:
    base_train = base[base[raw_target].isnull()].head(10000).sort_values(
        by=vi_col)
else:
    base_train = base[base[raw_target].isnull()].sort_values(by=vi_col)

kfold = list(cv.split(base_train[[key, 'country_group']], base_train[vi_col]))
del base_train
gc.collect()
base = base[[key, raw_target]]
#========================================================================
Example #7
import utils
from utils import logger_func, get_categorical_features, get_numeric_features, pararell_process, mkdir_func
from preprocessing import get_dummies
from feature_engineering import base_aggregation, diff_feature, division_feature, product_feature, cnt_encoding, select_category_value_agg, exclude_feature, target_encoding

logger = logger_func()
pd.set_option('max_columns', 200)
pd.set_option('max_rows', 200)

#========================================================================
# Global Variable
from info_home_credit import hcdr_key_cols
key, target, ignore_list = hcdr_key_cols()
#========================================================================

app = utils.read_df_pkl(path='../input/clean_app*.p')[[key, target]]

filekey='bureau'
filepath = f'../input/clean_{filekey}*.p'
df = utils.read_df_pkl(path=filepath)
df = df.merge(app, on=key, how='inner')

train = df[~df[target].isnull()]
test = df[df[target].isnull()]

categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

mkdir_func(f'../features/{filekey}')

#========================================================================
# Numeric Feature Save
def raddar_top_freq_merchant_agg(df, agg_term, new_max, new_min, old_max,
                                 old_min):
    #========================================================================
    if debug:
        df = auth1.head(100000)


#     new_max, new_min, old_max, old_min = 0,-1,-2,-3
#     agg_term = -20
# Args Setting
    level = [key, 'merchant_id', 'month_lag']
    # Restrict the month_lag window used to count each merchant's usage frequency
    new_month_lag_max = new_max
    new_month_lag_min = new_min
    old_month_lag_max = old_max
    old_month_lag_min = old_min
    agg_term = agg_term
    #========================================================================

    #========================================================================
    # Aggregation
    print("Aggregation Start!!")

    aggs = {}
    aggs['purchase_amount_new'] = ['sum']
    aggs['installments'] = ['mean', 'max', 'sum']
    df_agg = df.groupby(level).agg(aggs)

    new_cols = get_new_columns(name='', aggs=aggs)
    df_agg.columns = new_cols
    df_agg[f'purchase_amount_new_sum_per_installments_sum'] = df_agg[
        f'purchase_amount_new_sum'] / df_agg[f'installments_sum']

    mer_cnt = df.groupby([key, 'merchant_id'
                          ])['month_lag'].nunique().reset_index().rename(
                              columns={'month_lag': 'month_lag_cnt'})
    df_agg = df_agg.reset_index().merge(mer_cnt,
                                        how='inner',
                                        on=[key, 'merchant_id'])
    #========================================================================

    #========================================================================
    # Slice by month_lag and aggregate
    print("Aggregate Term Setting")
    df_merchant = df_agg[df_agg['month_lag_cnt'] > 1]
    del df_agg
    gc.collect()
    df_merchant.drop(['month_lag_cnt'], axis=1, inplace=True)

    # Restrict the time window
    new_term = new_month_lag_max - new_month_lag_min
    old_term = old_month_lag_max - old_month_lag_min
    new = df_merchant[(df_merchant['month_lag'] <= new_month_lag_max)
                      & (df_merchant['month_lag'] >= new_month_lag_min)]
    old = df_merchant[(df_merchant['month_lag'] <= old_month_lag_max)
                      & (df_merchant['month_lag'] >= old_month_lag_min)]

    feat_cols = [
        col for col in df_merchant.columns
        if col.count('amount') or col.count('install')
    ]
    aggs = {}
    for col in feat_cols:
        if col.count('install') and not (col.count('per')):
            aggs[col] = ['mean']
        else:
            aggs[col] = ['sum']

    # When the data spans multiple month_lag values, aggregate them
    if new_term > 0:
        new = new.groupby([key, 'merchant_id'])[feat_cols].agg(aggs)
        new_cols = get_new_columns(name='', aggs=aggs)
        new.columns = new_cols
    else:
        new.set_index([key, 'merchant_id'], inplace=True)

    if old_term > 0:
        old = old.groupby([key, 'merchant_id'])[feat_cols].agg(aggs)
        new_cols = get_new_columns(name='', aggs=aggs)
        old.columns = new_cols
    else:
        old.set_index([key, 'merchant_id'], inplace=True)
    #========================================================================

    new.reset_index(inplace=True)
    old.reset_index(inplace=True)
    #========================================================================
    # Carry card_id / merchant_id pairs that exist in old but not in new over to new.
    print("Get Lost Merchant and Card ID")
    new['flg'] = 1
    tmp_cols = [key, 'merchant_id']
    old_lost = old[tmp_cols].merge(new[tmp_cols + ['flg']],
                                   how='left',
                                   on=[key, 'merchant_id'])
    old_lost = old_lost[old_lost['flg'].isnull()]
    old_lost = old_lost[tmp_cols]
    new = pd.concat([new, old_lost], ignore_index=True)
    new.drop('flg', axis=1, inplace=True)
    #========================================================================

    #========================================================================
    # Make Ratio Feature
    print("Make Ratio Feature")
    feat_cols = [
        col for col in new.columns
        if col.count('amount') or col.count('install')
    ]
    fname = f'flag{new_month_lag_max}_{new_month_lag_min}-plag{old_month_lag_max}_{old_month_lag_min}'
    new = new.merge(old, how='left', on=[key, 'merchant_id'])
    for col in feat_cols:
        new[f"{fname}_{col}"] = new[col + '_x'] / new[col + '_y']
        new[f"{fname}_{col}"].fillna(0, inplace=True)
    #========================================================================

    #========================================================================
    # Top-frequency ranking per card_id * merchant_id
    # all-term version
    if debug:
        df = df.head(1000000)
    df_term = df[df['month_lag'] >= agg_term]
    mer_cnt = df_term.groupby([key, 'merchant_id'
                               ])['month_lag'].nunique().reset_index().rename(
                                   columns={'month_lag': 'month_lag_cnt'})
    mer_cnt.sort_values(by=[key, 'month_lag_cnt'],
                        ascending=False,
                        inplace=True)
    mer_cnt = utils.row_number(df=mer_cnt, level=key)
    mer_cnt.set_index([key, 'merchant_id'], inplace=True)

    df_merchant = new.set_index(tmp_cols).join(mer_cnt).reset_index()

    del new, old, mer_cnt
    gc.collect()

    use_cols = [key, 'merchant_id'] + [
        col for col in df_merchant.columns if col.count('flag')
    ] + ['month_lag_cnt', 'row_no']
    df_merchant = df_merchant[use_cols]
    #========================================================================

    #========================================================================
    # After aggregating per merchant_id:
    # 1. Aggregate those results further; also build a variant that aggregates only high-frequency merchants
    # 2. Build top1-top10 columns from the high-frequency merchants ------->>> count frequency both over the full history and over the last half year, and build features from each
    #========================================================================

    feat_cols = [col for col in df_merchant.columns if col.count('plag')]

    #========================================================================
    # Pivot the top-10 most frequent merchants into columns
    df_merchant = df_merchant[df_merchant['row_no'] <= 10]
    df_merchant = df_merchant.set_index([key, 'row_no'])[feat_cols].unstack()
    # Rename
    fname = 'auth1_all'
    df_merchant.columns = [
        f"{fname}_top-M{col[1]}_merchant_{col[0]}"
        for col in df_merchant.columns
    ]

    #========================================================================

    #========================================================================
    # Save Feature
    if agg_term < -13:
        prefix = '244_rad'
    else:
        prefix = f'245_rad_min_Mlag{agg_term}'

    print(f"{prefix} Feature Saving...")

    base = utils.read_df_pkl('../input/base_no_out*')
    base = base[[key, target]].set_index(key)
    base = base.join(df_merchant)
    base.fillna(-1, inplace=True)
    del df_merchant
    gc.collect()

    # elo_save_feature(df_feat=base, prefix=prefix)
    print('Complete!')
Example #9
pd.set_option('max_rows', 200)
from feature_engineering import base_aggregation, diff_feature, division_feature, product_feature, cnt_encoding, select_category_value_agg, exclude_feature, target_encoding

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

# ===========================================================================
# global variables
# ===========================================================================
key = 'SK_ID_CURR'
target = 'TARGET'
ignore_list = [key, target, 'SK_ID_BUREAU', 'SK_ID_PREV']

# ===========================================================================
# DATA LOAD
# ===========================================================================
base = utils.read_df_pkl(path='../input/base_app*')
fname = 'app'
prefix = feat_no + f'{fname}_'
df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')[[
    key, target, 'EXT_SOURCE_2'
]]

train_ir = utils.read_pkl_gzip('../features/4_winner/[email protected]')
test_ir = utils.read_pkl_gzip('../features/4_winner/[email protected]')
ir_mean = np.hstack((train_ir, test_ir))
df['stan_ir_mean@'] = ir_mean
df['stan_ir_mean@'].fillna('ir_nan', inplace=True)

num_split = 9
df['EXT_bin'] = pd.qcut(x=df['EXT_SOURCE_2'], q=num_split)
df['ir_bin'] = pd.qcut(x=df['stan_ir_mean@'], q=num_split)
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
from preprocessing import get_ordinal_mapping
from utils import logger_func
logger = logger_func()

from dimensionality_reduction import go_bhtsne, UMAP

#========================================================================
# Data Load

win_path = f'../features/4_winner/*.gz'
#  win_path_list = glob.glob(win_path)
win_path_list = glob.glob(f'../features/0_exp/*.gz')

base = utils.read_df_pkl('../input/base_term*')
feature_list = utils.parallel_load_data(path_list=win_path_list)
df_feat = pd.concat(feature_list, axis=1)

#========================================================================

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
prefix = f"224_emb_{start_time[4:12]}"
feat_list = [col for col in df_feat.columns]
emb_list = []
feat_num = 10
ignore_list = [key, target, 'merchant_id', 'first_active_month', 'index']

for i in range(10):
    np.random.seed(i)
    tmp_list = np.random.choice(feat_list, feat_num)
if num_leaves > 40:
    params['subsample'] = 0.8757099996397999
    #  params['colsample_bytree'] = 0.7401342964627846
    params['colsample_bytree'] = 0.3
    params['min_child_samples'] = 50
else:
    params['subsample'] = 0.9
    params['colsample_bytree'] = 0.3
    params['min_child_samples'] = 30

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

#========================================================================
# Data Load
win_path = '../features/4_winner/*.gz'
base = utils.read_df_pkl('../input/base_first*0*')
win_path_list = glob.glob(win_path)
# tmp_path_list holds features that are still being validated
tmp_path_list = glob.glob('../features/5_tmp/*.gz')
win_path_list += tmp_path_list

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
feature_list = utils.parallel_load_data(path_list=win_path_list)
df_feat = pd.concat(feature_list, axis=1)
train = pd.concat([base_train, df_feat.iloc[:len(base_train), :]], axis=1)
#  test = pd.concat([base_test, df_feat.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)
test = []

#========================================================================
# LGBM Setting
Example #12
def quara_load_data():
    # read pickle
    train = utils.read_df_pkl(path='../input/train*.p')
    test = utils.read_df_pkl(path='../input/test*.p')
    return train, test
Example #13
def main():

    #========================================================================
    # Data Load
    #========================================================================
    win_path_list = glob.glob(win_path)
    train_path_list = []
    test_path_list = []
    for path in win_path_list:
        if path.count('train'):
            train_path_list.append(path)
        elif path.count('test'):
            test_path_list.append(path)

    #  train_feature_list = utils.pararell_load_data(path_list=train_path_list, delimiter='gz')
    #  test_feature_list = utils.pararell_load_data(path_list=test_path_list, delimiter='gz')
    #  train = pd.concat(train_feature_list, axis=1)
    #  test = pd.concat(test_feature_list, axis=1)
    df = utils.read_df_pkl('../input/appli*')
    train = df[df[target] >= 0]
    test = df[df[target] == -1]

    metric = 'auc'
    fold = 5
    fold_type = 'stratified'
    group_col_name = ''
    dummie = 1
    oof_flg = True
    LGBM = lgb_ex(logger=logger,
                  metric=metric,
                  model_type=model_type,
                  ignore_list=ignore_list)

    train, _ = LGBM.data_check(df=train)
    test, drop_list = LGBM.data_check(df=test, test_flg=True)
    if len(drop_list):
        train.drop(drop_list, axis=1, inplace=True)
        test.drop(drop_list, axis=1, inplace=True)

    #========================================================================
    # Train & Prediction Start
    #========================================================================
    LGBM = LGBM.cross_prediction(train=train,
                                 test=test,
                                 key=key,
                                 target=target,
                                 fold_type=fold_type,
                                 fold=fold,
                                 group_col_name=group_col_name,
                                 params=params,
                                 num_boost_round=num_boost_round,
                                 early_stopping_rounds=early_stopping_rounds,
                                 oof_flg=oof_flg)

    #========================================================================
    # Result
    #========================================================================
    cv_score = LGBM.cv_score
    result = LGBM.prediction
    cv_feim = LGBM.cv_feim
    feature_num = len(LGBM.use_cols)

    cv_feim.to_csv(
        f'../valid/{start_time[4:12]}_{model_type}_{fname}_feat{feature_num}_CV{cv_score}_lr{learning_rate}.csv'
    )

    #========================================================================
    # X-RAY calculation and output
    # Args:
    #     model    : trained model
    #     train    : dataset used to train the model
    #     col_list : list of columns to compute X-RAY for. If omitted,
    #                X-RAY is computed for every column in the dataset,
    #                but given the computation time, at most ~30 columns is recommended.
    #========================================================================
    xray = False
    if xray:
        train.reset_index(inplace=True)
        train = train[LGBM.use_cols]
        result_xray = pd.DataFrame()
        N_sample = 150000
        max_point = 30
        for fold_num in range(fold):
            model = LGBM.fold_model_list[fold_num]
            if fold_num == 0:
                xray_obj = Xray_Cal(logger=logger,
                                    ignore_list=ignore_list,
                                    model=model)
            xray_obj, tmp_xray = xray_obj.get_xray(base_xray=train,
                                                   col_list=train.columns,
                                                   fold_num=fold_num,
                                                   N_sample=N_sample,
                                                   max_point=max_point)
            tmp_xray.rename(columns={'xray': f'xray_{fold_num}'}, inplace=True)

            if len(result_xray):
                result_xray = result_xray.merge(tmp_xray.drop('N', axis=1),
                                                on=['feature', 'value'],
                                                how='inner')
            else:
                result_xray = tmp_xray.copy()
            del tmp_xray
            gc.collect()

        xray_col = [col for col in result_xray.columns if col.count('xray')]
        result_xray['xray_avg'] = result_xray[xray_col].mean(axis=1)
        result_xray.to_csv(
            f'../output/{start_time[4:10]}_xray_{model_type}_CV{LGBM.cv_score}.csv'
        )
        sys.exit()

    submit = pd.read_csv('../input/sample_submission.csv')
    #  submit = []

    #========================================================================
    # STACKING
    #========================================================================
    if len(stack_name) > 0:
        logger.info(f'result_stack shape: {LGBM.result_stack.shape}')
        utils.to_pkl(
            path=
            f"../stack/{start_time[4:12]}_{stack_name}_{model_type}_CV{str(cv_score).replace('.', '-')}_{feature_num}features.fp",
            obj=LGBM.result_stack)
    logger.info(
        f'FEATURE IMPORTANCE PATH: {HOME}/kaggle/home-credit-default-risk/output/cv_feature{feature_num}_importances_auc_{cv_score}.csv'
    )

    #========================================================================
    # Submission
    #========================================================================
    if len(submit) > 0:
        if stack_name == 'add_nest':
            test[target] = result
            test = test.reset_index()[[
                key, target
            ]].groupby(key)[target].mean().reset_index()
            submit = submit[key].to_frame().merge(test, on=key, how='left')
            submit[target].fillna(0, inplace=True)
            submit.to_csv(
                f'../submit/{start_time[4:12]}_submit_{fname}_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
                index=False)
        else:
            submit[target] = result
            submit.to_csv(
                f'../submit/{start_time[4:12]}_submit_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
                index=False)
Example #14
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

# ===========================================================================
# global variables
# ===========================================================================
key = 'SK_ID_CURR'
target = 'TARGET'
ignore_list = [key, target, 'SK_ID_BUREAU', 'SK_ID_PREV']

# ===========================================================================
# DATA LOAD
# ===========================================================================
fname = 'app'
prefix = feat_no + f'{fname}_'
df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')


# Null Count
df['Null_Cnt@'] = df.isnull().sum(axis=1)
# Document
doc_list = [col for col in df.columns if col.count('DOCUMENT')]
df['DOCUMENT_SUM@'] = df[doc_list].sum(axis=1)
# Document 3,5,6,8,11,18
doc_list2 = [
    col for col in doc_list
    if col.count('DOCUMENT') and (col.count('_3') or col.count('_5')
                                  or col.count('_6') or col.count('_8')
                                  or col.count('_11') or col.count('_18'))
]
df['DOCUMENT_SUM2@'] = df[doc_list2].sum(axis=1)

#  cat_list = get_categorical_features(df=df, ignore_list=ignore_list)

# Scrapped (not used)
#  df['NEW_REGION_POPULATION_RELATIVE@'] = (df['REGION_POPULATION_RELATIVE']*10000).astype('int')
Example #15
prev_key = 'SK_ID_PREV'
target = 'CNT_PAYMENT'
ignore_list.remove('TARGET')
ignore_list.append(target)

acr = 'AMT_CREDIT'
aan = 'AMT_ANNUITY'
adp = 'AMT_DOWN_PAYMENT'
cpy = 'CNT_PAYMENT'
co_type = 'NAME_CONTRACT_TYPE'
dd = 'DAYS_DECISION'

#========================================================================
# Data Load
#========================================================================
app = utils.read_df_pkl('../input/clean_app*')
amt_list = [col for col in app.columns if col.count('AMT_')]

prev = utils.read_df_pkl('../input/clean_prev*')[[key, dd, acr, aan, cpy]]
df = prev.merge(app.drop(amt_list, axis=1), on=key, how='inner')

days_list = [
    col for col in df.columns
    if col.count('DAYS') and not (col.count('DECISION'))
]
for col in days_list:
    df[col] = df[col].values - df[dd].values
df.drop([dd, 'TARGET'], axis=1, inplace=True)

train = df[~df[cpy].isnull()]
test = app
Example #16
    select_type = 'rank'
    select_num = 50000
#========================================================================

#========================================================================
# Global Variable
COMPETITION_NAME = 'home-credit-default-risk'
sys.path.append(f"../py")
from info_home_credit import hcdr_key_cols
key, target, ignore_list = hcdr_key_cols()
#========================================================================

#========================================================================
# Data Load
feim_path = glob.glob('../valid/use_feim/*.csv')[0]
base = utils.read_df_pkl('../input/base0*')[[key, target]].set_index(key)
manage = FeatureManage(key, target)
manage.set_base(base)
if select_type == 'rank':
    train, test = manage.feature_matrix(feim_path=feim_path, rank=select_num)
if select_type == 'gain':
    train, test = manage.feature_matrix(feim_path=feim_path, gain=select_num)

if is_debug:
    train = train.head(10000)
    test = test.head(500)
Y = train[target]
print(train.shape, test.shape)
#========================================================================

# Basic Args
Example #17
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import datetime
import sys
import re
import gc
import glob

import os
HOME = os.path.expanduser('~')
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
from utils import logger_func, get_categorical_features, get_numeric_features, pararell_process
logger = logger_func()
pd.set_option('max_columns', 200)
pd.set_option('max_rows', 200)

train = utils.read_df_pkl('../input/train0*.p')

print(train)
sys.exit()
Example #18
    model = Model(inputs=inp, outputs=predictions)
    adam = optimizers.SGD(lr=learning_rate)
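    # Note: despite the variable name "adam", this builds an SGD optimizer, not Adam.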

    if is_multi:
        model = multi_gpu_model(model, gpus=gpu_count)
    model.compile(optimizer=adam,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


#========================================================================

#========================================================================
# Load Feature Matrix
base = utils.read_df_pkl('../input/base_Av*')
tx_train = utils.read_df_pkl('../input/0306_MS_NLP_train*').values
x_test = utils.read_df_pkl('../input/0306_MS_NLP_test*').values
#========================================================================

#========================================================================
# Make Validation
seed = 1208
fold_n = 4
vi_col = 'f000_AvSigVersion'
base_train = base[~base[target].isnull()]
base_test = base[base[target].isnull()]
# grab y before sorting
y = base_train[target].values
base_train.sort_values(vi_col, inplace=True)
base = base[[key, target]]
Example #19
from sklearn.metrics import mean_squared_error, roc_auc_score
#========================================================================
# Keras
# Corporación Favorita Grocery Sales Forecasting
sys.path.append(f'{HOME}/kaggle/data_analysis/model')
from nn_keras import MS_NN
from keras import callbacks
from keras import optimizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
#========================================================================

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

base = utils.read_df_pkl('../input/base_group*')[[
    key, target, 'country_group'
]]

# feat_path_list = glob.glob('../features/4_winner/*.gz')
train, test = MS_utils.get_dataset(base=base,
                                   feat_path='../features/4_winner/*.gz',
                                   is_debug=False)
del base
gc.collect()

if is_debug:
    train = train.head(10000)
    test = test.head(500)

#========================================================================
# Categorical Encode
#========================================================================
# Data Load

win_path = f'../features/4_winner/*.gz'
#  win_path = f'../features/1_first_valid/*.gz'
#  win_path = f'../model/LB3670_70leaves_colsam0322/*.gz'
#  win_path = f'../model/LB3679_48leaves_colsam03/*.gz'
#  win_path = f'../model/LB3684_48leaves_colsam03/*.gz'
tmp_path_list = glob.glob(f'../features/5_tmp/*.gz') + glob.glob(
    f'../features/0_exp/*.gz')
#  tmp_path_list = glob.glob(f'../features/5_tmp/*.gz')
win_path_list = glob.glob(win_path) + tmp_path_list
#  win_path_list = glob.glob(win_path)

base = utils.read_df_pkl('../input/base_term*0*')[[
    key, target, col_term, 'first_active_month'
]]
fam_base = utils.read_df_pkl('../input/fam_165_176*0*')[[key, target]]

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)

feature_list = utils.parallel_load_data(path_list=win_path_list)

df_feat = pd.concat(feature_list, axis=1)

train = pd.concat([base_train, df_feat.iloc[:len(base_train), :]], axis=1)
test = pd.concat(
    [base_test, df_feat.iloc[len(base_train):, :].reset_index(drop=True)],
    axis=1)
sys.path.append(f'{HOME}/kaggle/data_analysis/library')
import utils
from utils import get_categorical_features, get_numeric_features, reduce_mem_usage, elo_save_feature
from preprocessing import get_dummies
import datetime

from tqdm import tqdm
import time
import sys
from joblib import Parallel, delayed

key = 'card_id'
target = 'target'
ignore_list = [key, target, 'merchant_id', 'first_active_month']

df_train = utils.read_df_pkl('../input/train0*')
df_train.set_index(key, inplace=True)
df_test = utils.read_df_pkl('../input/test0*')
df_test.set_index(key, inplace=True)

df_hist = pd.read_csv('../input/historical_transactions.csv')

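# Undo the anonymized scaling of purchase_amount; the widely shared constants below
# map it back to rounded, roughly money-like values.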
df_hist['purchase_amount_new'] = np.round(
    df_hist['purchase_amount'] / 0.00150265118 + 497.06, 2)

df_hist['installments'] = df_hist['installments'].map(
    lambda x: 1 if x < 1 else 1 if x > 100 else x)

#========================================================================
# Dataset Load
use_cols = [
# Ensemble 1
set1 = f'../model/E1_set/*.gz'
# Ensemble 2
set2 = f'../model/E2_set/*.gz'
# Ensemble 3
set3 = f'../model/E3_set/*.gz'
# Ensemble 4
set4 = f'../model/E4_set/*.gz'

set_list = [set1, set2, set3, set4]
win_path = set_list[int(sys.argv[2])]

win_path_list = glob.glob(win_path)

base = utils.read_df_pkl('../input/base_term*0*')[[
    key, target, 'first_active_month'
]]
base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
feature_list = utils.parallel_load_data(path_list=win_path_list)
df = pd.concat(feature_list, axis=1)
train = pd.concat([base_train, df.iloc[:len(base_train), :]], axis=1)
test = pd.concat(
    [base_test, df.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)

train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

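# 'no_out' drops the extreme negative targets (the ~-33 outliers) by keeping only rows with target > -30.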
if out_part == 'no_out':
    train = train[train[target] > -30]
#========================================================================
Example #23
    utils.to_df_pkl(df=ccb, path='../input', fname='clean_ccb')


if __name__ == "__main__":

    #  with utils.timer("To Pickle"):
    #      to_pkl()

    with utils.timer("Cleansing"):

        #  app = utils.read_df_pkl(path='../input/application_train_test*.p')
        #  clean_app(app)
        #  del app
        #  gc.collect()

        bur = utils.read_df_pkl(path='../input/bureau*.p')
        clean_bureau(bur)
        del bur
        gc.collect()

        #  pre = utils.read_df_pkl(path='../input/prev*.p')
        #  clean_prev(pre)
        #  del pre
        #  gc.collect()
        #  pos = utils.read_df_pkl(path='../input/POS*.p')
        #  clean_pos(pos)
        #  del pos
        #  gc.collect()
        #  ins = utils.read_df_pkl(path='../input/install*.p')
        #  clean_ins(ins)
        #  del ins
Example #24
pd.set_option('max_rows', 200)
from feature_engineering import base_aggregation, diff_feature, division_feature, product_feature, cnt_encoding, select_category_value_agg, exclude_feature, target_encoding

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

# ===========================================================================
# global variables
# ===========================================================================
key = 'SK_ID_CURR'
target = 'TARGET'
ignore_list = [key, target, 'SK_ID_BUREAU', 'SK_ID_PREV']

# ===========================================================================
# DATA LOAD
# ===========================================================================
base = utils.read_df_pkl(path='../input/base_app*')
fname = 'app'
prefix = f'{feat_no}{fname}_'
df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')

train = df[~df[target].isnull()]
test = df[df[target].isnull()]

neighbor = '110_app_neighbor81@'
train[neighbor] = utils.read_pkl_gzip('../input/[email protected]')
test[neighbor] = utils.read_pkl_gzip('../input/[email protected]')
combi = [neighbor, cat]
cat_list = get_categorical_features(df=df, ignore_list=ignore_list)

#========================================================================
# TARGET ENCODING
Example #25
# Global Variable
from info_home_credit import hcdr_key_cols
key, target, ignore_list = hcdr_key_cols()
#========================================================================

prev_key = 'SK_ID_PREV'
acr = 'AMT_CREDIT'
aan = 'AMT_ANNUITY'
adp = 'AMT_DOWN_PAYMENT'
cpy = 'CNT_PAYMENT'
co_type = 'NAME_CONTRACT_TYPE'
dd = 'DAYS_DECISION'


# Predicted CNT_PAYMENT for the current application
df = utils.read_df_pkl('../input/clean_cpy*').reset_index()[[key, 'AMT_CREDIT', 'AMT_ANNUITY', target]]


#========================================================================
# Compute the interest rate of the current application
#========================================================================
#========================================================================
# Dima's Interest_Rate
#========================================================================
df['3'] = df['AMT_ANNUITY'] * 3 - df['AMT_CREDIT']
df['3'] = df['3'] / df['AMT_CREDIT']
df['dima_ir_3@'] = df['3'].apply(lambda x: x if 0 <= x <= 0.5 else np.nan)
df['6'] = df['AMT_ANNUITY'] * 6 - df['AMT_CREDIT']
df['6'] = df['6'] / df['AMT_CREDIT']
df['dima_ir_6@'] = df['6'].apply(lambda x: x if 0 <= x <= 0.5 else np.nan)
df['9'] = df['AMT_ANNUITY'] * 9 - df['AMT_CREDIT']
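# Quick sanity check of the kept ratio (made-up numbers, not from the source):
# AMT_ANNUITY = 30,000 and AMT_CREDIT = 160,000 over 6 payments repay 180,000,
# so (180,000 - 160,000) / 160,000 = 0.125, inside the [0, 0.5] band kept as dima_ir_6@.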
Example #26
#========================================================================

prev_key = 'SK_ID_PREV'
acr = 'AMT_CREDIT'
aan = 'AMT_ANNUITY'
adp = 'AMT_DOWN_PAYMENT'
cpy = 'CNT_PAYMENT'
co_type = 'NAME_CONTRACT_TYPE'
dd = 'DAYS_DECISION'

#========================================================================
# Compute interest rates for previous applications
#========================================================================
prev_ir = False
if prev_ir:
    app = utils.read_df_pkl('../input/clean_app*')[[key, target]]
    df = utils.read_df_pkl('../input/clean_prev*')
    df = df[[key, prev_key, dd, acr, aan, cpy, adp, co_type]].merge(app, on=key, how='inner')
    df = df[~df[cpy].isnull()]

    for cnt in range(3, 64, 3):
        if cnt<=60:
            ir = ( (df[aan].values * cnt) / df[acr].values ) - 1.0
            df[f'ir_{cnt}@'] = ir
            df[f'ir_{cnt}@'] = df[f'ir_{cnt}@'].map(lambda x: x if (0.08<x) and (x<0.5) else np.nan)
            print(f"{cnt} :", len(df[f'ir_{cnt}@'].dropna()))
            if len(df[f'ir_{cnt}@'].dropna())<len(df)*0.001:
                df.drop(f'ir_{cnt}@', axis=1, inplace=True)
                continue
        else:
            ir = ( (df[aan].values * df[cpy].values) / df[acr].values ) - 1.0
Example #27
                                 verbose=verbose,
                                 callbacks=callbacks,
                                 validation_data=te_gen,
                                 nb_val_samples=nb_val_samples,
                                 max_q_size=10)

    def predict(self, X, batch_size=128):
        y_preds = predict_batch(self.model, X, batch_size=batch_size)
        return y_preds


logger.info("Keras Setup Complete!!")

#========================================================================
# Data Load
base = utils.read_df_pkl('../input/base*')
win_path_list = glob.glob(win_path)
train_path_list = []
test_path_list = []
for path in win_path_list:
    if path.count('train'):
        train_path_list.append(path)
    elif path.count('test'):
        test_path_list.append(path)

# train_path_list = sorted(train_path_list)[:20]
# test_path_list  = sorted(test_path_list)[:20]

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
train_feature_list = utils.parallel_load_data(path_list=train_path_list)
Example #28
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam, SGD
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from keras import initializers
from keras.engine import InputSpec, Layer
from keras import backend as K


base = utils.read_df_pkl(path='../input/base_Av*')
if is_make:
    #========================================================================
    # Dataset Load
    with utils.timer('Download Train and Test Data.\n'):
        train, test = MS_utils.get_dataset(base=base, feat_path='../features/4_winner/*.gz', is_cat_encode=False)


        nlp_cols = [
            'Engine'
            ,'OSVersion'
            ,'AppVersion'
            ,'AvSigVersion'
            ,'SkuEdition'
            ,'SmartScreen'
            ,'Census_OSArchitecture'
Example #29
        logger = logger_func()
except NameError:
    logger = logger_func()
#========================================================================
"""
argv[1]: comment
argv[2]: feature_key
argv[3]: group
"""

# Columns
base_path = '../input/base_exclude*'
base_path = '../input/base_Av*'
key, target, ignore_list = MS_utils.get_basic_var()
ignore_list = [key, target, 'country_group', 'down_flg']
base = utils.read_df_pkl(base_path)[[key, target, 'country_group']]

# Basic Args
seed = 1208
set_type = 'all'

comment = sys.argv[1]

if sys.argv[2].count('f'):
    train, test = MS_utils.get_feature_set(feat_key=sys.argv[2],
                                           base_path=base_path)
else:
    train, test = MS_utils.get_dataset(base=base)
print(train.shape, test.shape)

if is_debug:
def make_fam_dist(base_fam, multi, is_drop=False):

    # ========================================================================
    # Args
    key = 'card_id'
    target = 'target'
    is_viz = False
    base_year = int(base_fam[:4])
    base_month = int(base_fam[-2:])
    max_fam = '2017-12'
    min_fam = '2011-11'
    result_id = []
    # ========================================================================

    # ========================================================================
    # Data Load
    base = utils.read_df_pkl('../input/base_first*')
    base[target] = base[target].map(lambda x: np.round(x, 1))
    # ========================================================================
    df_list = []

    val_cnt = base[target].value_counts()
    val_cnt.name = 'all'
    df_list.append(val_cnt.copy())

    base_1712 = base[base['first_active_month'] == base_fam]
    val_cnt = base_1712[target].value_counts()

    # Probably no longer used?
    is_max = False
    if is_max:
        val_cnt_max = val_cnt.max()
        val_cnt /= val_cnt_max
    val_cnt.name = base_fam
    df_list.append(val_cnt)
    df = pd.concat(df_list, axis=1)


    def arange_ratio(df, multi, is_viz=False):
        df[base_fam] *= multi
        df['diff'] = df['all'] - df[base_fam]
        diff_len = len(df[df['diff'] < 0])
        if is_viz:
            display(df[df['diff'] < 0])
        if diff_len > limit_diff_num:
            return -1
        return 0


    df = pd.concat(df_list, axis=1)

    # Probably no longer used?
    target_max = np.max(df.dropna().index.tolist())
    target_min = np.min(df.dropna().index.tolist())

    cnt_0_fam = df.loc[0.0, :][base_fam]
    cnt_0_all = df.loc[0.0, :]['all']
    #  multi = int(cnt_0_all / cnt_0_fam)+1

    #  while True:
    #      tmp = df.copy().dropna()
    #      is_minus = arange_ratio(tmp, multi)
    #      if is_minus:
    #          multi -= 1
    #          continue
    #      break

    print(f"multi: {multi}")
    df[base_fam] *= multi
    if is_drop:
        df_loy = df.dropna()
        loy_list = list(df_loy.index)
    else:
        loy_list = list(np.arange(target_min, target_max, 0.1))

    # ========================================================================
    # Sampling
    # ========================================================================
    before = 0
    for i in loy_list:
        loy = np.round(i, 1)
        df_id = base[base[target] == loy]
        if len(df_id) == 0:
            continue
        sample = df.loc[loy, base_fam]
        if sample == sample:
            before = sample
        else:
            sample = before
        sample = int(sample)  # np.int was removed from newer NumPy
        remain = sample
        sampling_id = []

        if remain==0:
            continue

        if is_viz:
            print('''
    #========================================================================
    # Sampling Start!!
    ''')

        for i in range(100):

            is_add = True
            if i == 0:
                yyyymm = base_fam
                tmp_id = df_id[df_id['first_active_month'] == yyyymm]
            else:
                year = base_year
                month = base_month + i

                if month > 12:
                    num_year = month//12
                    year = year + num_year
                    month = month - 12 * num_year
                elif month < 1:
                    num_year = month//12
                    if num_year == 0:
                        year = year - 1
                        month = month + 12
                    else:
                        num_year *= -1
                        year = year - num_year
                        month = month + 12*num_year
                if month < 10:
                    month = f'0{month}'

                yyyymm = f"{year}-{month}"

                if yyyymm < min_fam or yyyymm > max_fam:
                    is_add = False
                else:
                    tmp_id = df_id[df_id['first_active_month'] == yyyymm]
                    if i > 0 and yyyymm == base_fam:
                        is_add = False
    
            # ========================================================================
            # Sampling
            if is_add:
                if is_viz:
                    print(f'future yyyymm: {yyyymm}')
                id_list = list(tmp_id[key].values)
                if len(id_list) <= remain:
                    sampling_id += id_list
                else:
                    sampling_id += list(np.random.choice(id_list,
                                                         remain, replace=False))
    
                if is_viz:
                    print(f"sampling_id: {len(sampling_id)} / {sample}")
            # ========================================================================
    
            remain = sample - len(sampling_id)
            if remain <= 0:
                break
    
            is_add = True
            if i > 0:
                year = base_year
                month = base_month - i
    
                if month > 12:
                    num_year = month//12
                    year = year + num_year
                    month = month - 12 * num_year
                elif month < 1:
                    num_year = month//12
                    if num_year == 0:
                        year = year - 1
                        month = month + 12
                    else:
                        num_year *= -1
                        year = year - num_year
                        month = month + 12*num_year
                if month < 10:
                    month = f'0{month}'
    
                yyyymm = f"{year}-{month}"
    
                if yyyymm < min_fam or yyyymm > max_fam:
                    is_add = False
                else:
                    tmp_id = df_id[df_id['first_active_month'] == yyyymm]
    
                # ========================================================================
                # Sampling
                if is_add:
                    if is_viz:
                        print(f'past yyyymm: {yyyymm}')
                    id_list = list(tmp_id[key].values)
                    if len(id_list) <= remain:
                        sampling_id += id_list
                    else:
                        sampling_id += list(np.random.choice(id_list,
                                                             remain, replace=False))
    
                    if is_viz:
                        print(f"sampling_id: {len(sampling_id)} / {sample}")
                # ========================================================================
    
                remain = sample - len(sampling_id)
                if remain <= 0:
                    break

        result_id += sampling_id
        if is_viz:
            print(f"loy:{loy} | {len(sampling_id)}/{sample} | All: {len(result_id)}")
            print('''
    # Sampling Complete!!
    #========================================================================
    ''')
    print(f"All: {len(result_id)} | Unique: {len(np.unique(result_id))}")
    print(base[base[key].isin(result_id)]
            ['first_active_month'].value_counts().head())
    print(base[base[key].isin(result_id)]['target'].value_counts().head())

    return result_id
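The year/month wrap-around above is written out twice (once for future offsets, once for past offsets); a small helper such as the hypothetical shift_month below captures the same arithmetic and could replace both blocks:

def shift_month(base_year, base_month, offset):
    """Return 'YYYY-MM' shifted by `offset` months from base_year/base_month."""
    idx = base_year * 12 + (base_month - 1) + offset
    year, month = divmod(idx, 12)
    return f"{year}-{month + 1:02d}"

# e.g. shift_month(2017, 12, 1) -> '2018-01', shift_month(2017, 12, -2) -> '2017-10'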