Example #1
def make_raw_feature(data,
                     prefix='',
                     select_list=None,
                     ignore_list=None,
                     extension='pkl',
                     path='../features/1_first_valid',
                     word='',
                     logger=False):
    # Avoid shared mutable default arguments
    select_list = select_list if select_list is not None else []
    ignore_list = ignore_list if ignore_list is not None else []

    for tmp_col in data.columns:
        if tmp_col in ignore_list: continue
        if len(select_list) > 0:
            if f'{prefix}{tmp_col}' not in select_list: continue
        if len(word) > 0:
            if not (tmp_col.count(word)): continue

        new_col = tmp_col.replace('/', '_').replace(':', '_').replace(
            ' ', '_').replace('.', '_').replace('"', '')
        data.rename(columns={tmp_col: new_col}, inplace=True)

        if extension.count('npy'):
            np.save(f'{path}/{prefix}{new_col}.npy', data[new_col].values)
        elif extension.count('csv'):
            data[new_col].to_csv(f'{path}/{prefix}{new_col}.csv')
        elif extension.count('pkl'):
            utils.to_pkl_gzip(path=f'{path}/{prefix}{new_col}.fp',
                              obj=data[new_col].values)
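A note on utils.to_pkl_gzip: it is called in every example but never defined in them. A minimal sketch of what such a helper plausibly does, assuming a gzip-compressed pickle that appends a '.gz' suffix (the submission block at the end of Example #7 appends '.gz' to the path after saving, which supports that assumption):

import gzip
import pickle

def to_pkl_gzip(obj, path):
    # Hypothetical sketch, not the project's actual implementation:
    # dump obj as a gzip-compressed pickle at path + '.gz'.
    with gzip.open(f'{path}.gz', 'wb') as f:
        pickle.dump(obj, f)

def read_pkl_gzip(path):
    # Counterpart loader; path here includes the '.gz' suffix.
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)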
Example #2
def get_tfidf(text_list):
    '''
    Explain:
        Build the TF-IDF matrix for a list of texts.
    Args:
        text_list(list): list of raw (not yet split) texts
    Return:
        sparse csr_matrix: sparse matrix of TF-IDF values
    '''

    # Get the tfidf
    logger.info("Calculate TFIDF...")
    tfidf_vec = TfidfVectorizer(
        max_features=100000,
        min_df=3,
        max_df=0.8,
        stop_words="english",
        analyzer='word',
        #  analyzer='char',
        strip_accents='unicode',
        ngram_range=(1,3),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True
    ).fit(text_list)

    df_tfidf = tfidf_vec.transform(text_list)
    utils.to_pkl_gzip(obj=df_tfidf, path='./df_tfidf')
    return df_tfidf
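A hypothetical usage sketch of get_tfidf. It needs a reasonably large corpus: with min_df=3 and max_df=0.8 a tiny toy list would raise an empty-vocabulary error, so the 20 newsgroups dataset is used purely for illustration:

from sklearn.datasets import fetch_20newsgroups

texts = fetch_20newsgroups(subset='train',
                           remove=('headers', 'footers', 'quotes')).data
tfidf_matrix = get_tfidf(texts)  # fit, transform, save, and return
print(tfidf_matrix.shape)        # (n_documents, n_kept_terms), sparse CSR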
Example #3
def make_cat_features(df, filekey):
    mkdir_func(f'../features/{filekey}')
    train = df[~df[target].isnull()]
    test = df[df[target].isnull()]
    categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

    #========================================================================
    # Categorical Feature Encode
    #========================================================================
    # Factorize
    logger.info("Factorize Start!!")
    for col in categorical_features:
        train[f"lbl_{col}@"], indexer = pd.factorize(train[col])
        test[f"lbl_{col}@"] = indexer.get_indexer(test[col])

    # Count Encoding
    logger.info("Count Encoding Start!!")
    for col in categorical_features:
        train = cnt_encoding(train, col, ignore_list=ignore_list)
        test = cnt_encoding(test, col, ignore_list=ignore_list)

    #========================================================================
    # Categorical Feature Save
    #========================================================================
    logger.info("Saving Features...")
    for col in train.columns:
        if col.count('@'):
            result_train = train[col].values
            result_test = test[col].values
            logger.info(f"COL: {col} | LENGTH: {len(result_train)}")
            utils.to_pkl_gzip(obj=result_train, path=f'../features/{filekey}/train_{col}')
            utils.to_pkl_gzip(obj=result_test, path=f'../features/{filekey}/test_{col}')
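cnt_encoding is an external helper. A minimal sketch of a count-encoding step with the same call shape (a hypothetical stand-in; the '@' suffix matches the save filter above, but the real implementation may differ):

def cnt_encoding(df, col, ignore_list=None):
    # Map each category to its frequency within df; the '@' suffix marks
    # the new column as a feature for the save loop above.
    if ignore_list and col in ignore_list:
        return df
    df[f'cnt_{col}@'] = df[col].map(df[col].value_counts())
    return df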
Example #4
def make_num_features(df, filekey):
    mkdir_func(f'../features/{filekey}')

    #  if filekey.count('bur'):
    df = interact_feature(df, filekey)

    #========================================================================
    # Slice out numeric features per categorical value
    #========================================================================
    num_list = get_numeric_features(df=df, ignore_list=ignore_list)
    cat_list = get_categorical_features(df=df, ignore_list=[])

    #  few_list = []
    #  for cat in tqdm(cat_list):
    #      for val in tqdm(df[cat].drop_duplicates()):
    #          length = len(df[df[cat]==val])
    #          if length < len(df)*0.002:
    #              few_list.append(val)
    #              continue
    #          for num in num_list:
    #          #  pararell_process(, num_list)
    #              df[f'{num}_{cat}-{val}@'] = df[num].where(df[cat]==val, np.nan)
    #              df[f'{num}_{cat}-fewlist@'] = df[num].where(df[cat].isin(few_list), np.nan)

    logger.info(f'{fname} SET SHAPE : {df.shape}')

    #========================================================================
    # Categorical Encoding & Feature Save
    #========================================================================
    train = df[~df[target].isnull()]
    test = df[df[target].isnull()]

    categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

    #========================================================================
    # Numeric Feature Save
    #========================================================================
    for col in train.columns:
        if col in categorical_features: continue
        result_train = train[col].values
        result_test = test[col].values
        logger.info(f"COL: {col} | LENGTH: {len(result_train)}")
        utils.to_pkl_gzip(obj=train[col].values, path=f'../features/{filekey}/train_{col}')
        if col != target:
            utils.to_pkl_gzip(obj=test[col].values, path=f'../features/{filekey}/test_{col}')
Example #5
def one_base_agg(df, prefix):
    # =======================================================================
    # Prepare the list of columns to aggregate
    # =======================================================================
    num_list = get_numeric_features(df=df, ignore_list=ignore_list)

    # Parallel version: may not finish if the DataFrame is heavy
    #  arg_list = []
    #  for num in num_list:
    #      for method in method_list:
    #          tmp = df[[key, num]]
    #          arg_list.append([tmp, key, num, method, prefix, '', base])

    #  ' Encode per value of each categorical column in the dataset '
    #  call_list = pararell_process(base_agg_wrapper, arg_list)
    #  result = pd.concat(call_list, axis=1)

    #  for col in result.columns:
    #      if not(col.count('@')) or col in ignore_list:
    #          continue
    #      #  utils.to_pickle(path=f"{dir}/{col}.fp", obj=result[col].values)
    #  sys.exit()

    # Serial processing
    for num in num_list:
        for method in method_list:
            tmp = df[[key, num]]
            tmp_result = base_aggregation(df=tmp,
                                          level=key,
                                          method=method,
                                          prefix=prefix,
                                          feature=num)
            result = base.merge(tmp_result, on=key, how='left')
            renu = result[result[target].isnull()]
            for col in result.columns:
                if not (col.count('@')) or col in ignore_list:
                    continue
                if exclude_feature(col, result[col].values): continue
                if exclude_feature(col, renu[col].values): continue

                file_path = f"{dir}/{col}.fp"
                #  utils.to_pickle(path=file_path, obj=result[col].values)
                utils.to_pkl_gzip(obj=result[col].values, path=file_path)
            del result, renu, tmp_result
            gc.collect()
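base_aggregation is also an external helper. A sketch of a groupby-aggregate function consistent with how it is called here and in the next two examples, where level may be a single column or a list (the column-naming convention is an assumption):

def base_aggregation(df, level, method, prefix, feature):
    # Aggregate `feature` by `level` using `method` ('mean', 'var', ...),
    # returning the grouping keys plus one renamed feature column.
    agg = df.groupby(level)[feature].agg(method).reset_index()
    level_cols = level if isinstance(level, list) else [level]
    agg.columns = level_cols + [f'{prefix}{feature}_{method}@']
    return agg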
Example #6
def multi_level_agg(df, prefix):
    # =======================================================================
    # Replace combinations of multiple categories with aggregated values
    # =======================================================================
    method_list = ['mean']
    num_list = ['EXT_SOURCE_2']
    cat_list = get_categorical_features(df=df, ignore_list=ignore_list)
    cat_combi = combinations(cat_list, 2)
    #  amt_list = [col for col in num_list if col.count('AMT_')]
    #  days_list = [col for col in num_list if col.count('DAYS_')]

    # Serial processing
    for com in cat_combi:
        for num in num_list:
            for method in method_list:
                base = df[[key, target] + list(com)].drop_duplicates()
                tmp = df[list(com)+[num]]
                tmp_result = base_aggregation(
                    df=tmp, level=list(com), method=method, prefix=prefix, feature=num)
                result = base.merge(tmp_result, on=list(com), how='left')

                for col in result.columns:
                    if not(col.count('@')) or col in ignore_list:
                        continue

                    train_feat = result[result[target]>=0][col].values
                    test_feat = result[result[target].isnull()][col].values
                    col = col.replace('[', '_').replace(']', '_').replace(' ', '').replace(',', '_')
                    train_file_path = f"../features/1_first_valid/train_{col}"
                    test_file_path = f"../features/1_first_valid/test_{col}"

                    utils.to_pkl_gzip(obj=train_feat, path=train_file_path)
                    utils.to_pkl_gzip(obj=test_feat, path=test_file_path)

                    logger.info(f'''
                    #========================================================================
                    # COMPLETE MAKE FEATURE : {train_file_path}
                    #========================================================================''')
                del result, tmp_result
                gc.collect()
Example #7
def single_level_agg(df, prefix):
    # =======================================================================
    # Replace a single category with its aggregated values
    # =======================================================================
    method_list = ['mean', 'var']
    num_list = ['EXT_SOURCE_2']
    cat_list = get_categorical_features(df=df, ignore_list=ignore_list)
    #  amt_list = [col for col in num_list if col.count('AMT_')]
    #  days_list = [col for col in num_list if col.count('DAYS_')]

    # Serial processing
    for cat in cat_list:
        if len(df[cat].unique())<=3:
            continue
        for num in num_list:
            for method in method_list:
                base = df[[key, cat, target]].drop_duplicates()
                tmp = df[[cat, num]]
                tmp_result = base_aggregation(
                    df=tmp, level=cat, method=method, prefix=prefix, feature=num)
                result = base.merge(tmp_result, on=cat, how='left')

                for col in result.columns:
                    if not(col.count('@')) or col in ignore_list:
                        continue

                    train_file_path = f"../features/1_first_valid/train_{col}"
                    test_file_path = f"../features/1_first_valid/test_{col}"

                    utils.to_pkl_gzip(obj=result[result[target]>=0][col].values, path=train_file_path)
                    utils.to_pkl_gzip(obj=result[result[target].isnull()][col].values, path=test_file_path)

                    logger.info(f'''
                    #========================================================================
                    # COMPLETE MAKE FEATURE : {train_file_path}
                    #========================================================================''')
                del result, tmp_result
                gc.collect()
print("""
# =============================================================================
""")

# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')
    print('submit')
    utils.submit(SUBMIT_FILE_PATH, COMMENT)
    os.system(f'cp LOG/log_{__file__}.txt LOG/log_{__file__}_{SEED}.txt')
    os.system(f'gsutil cp LOG/log_{__file__}_{SEED}.txt gs://malware_onodera/')
else:
    SUBMIT_FILE_PATH = SUBMIT_FILE_PATH.replace('.csv.gz', f'_{SEED}.pkl')
    utils.to_pkl_gzip(sub[['HasDetections']], SUBMIT_FILE_PATH)
    SUBMIT_FILE_PATH += '.gz'
    os.system(f'gsutil cp {SUBMIT_FILE_PATH} gs://malware_onodera/')
    os.system(f'cp LOG/log_{__file__}.txt LOG/log_{__file__}_{SEED}.txt')
    os.system(f'gsutil cp LOG/log_{__file__}_{SEED}.txt gs://malware_onodera/')
"""

gsutil cp gs://malware_onodera/*.gz ../output/
gsutil cp gs://malware_onodera/*.txt LOG/

"""

#==============================================================================
utils.end(__file__)
#utils.stop_instance()
Example #9
    prediction += test_pred

    y_pred = model.predict(X=x_val, batch_size=batch_size)
    stack_prediction[val_idx] = y_pred

    sc_score = roc_auc_score(y_val, y_pred)
    logger.info(f'''
    #========================================================================
    # FOLD {n_fold} SCORE: {sc_score}
    #========================================================================'''
                )
    cv_list.append(sc_score)

prediction /= len(kfold)
cv_score = np.mean(cv_list)
logger.info(f'''
#========================================================================
# CV SCORE: {cv_score}
#========================================================================''')

train_pred = pd.Series(stack_prediction, name='prediction').to_frame()
test_pred = pd.Series(prediction, name='prediction').to_frame()
train_pred[key] = list(train.index)
test_pred[key] = list(test.index)
df_pred = pd.concat([train_pred, test_pred], axis=0)

utils.to_pkl_gzip(
    path=
    f"../stack/{start_time[4:12]}_stack_{model_type}_lr{learning_rate}_{len(num_list)}feats_{len(seed_list)}seed_{batch_size/gpu_count}batch_OUT_CV{str(cv_score).replace('.', '-')}_LB",
    obj=df_pred)
    # Scoring
    err = (y_val - y_pred)
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'RMSE: {score} | SUM ERROR: {err.sum()}')
    score_list.append(score)
    #========================================================================

cv_score = np.mean(score_list)
logger.info(f'''
#========================================================================
# CV SCORE AVG: {cv_score}
#========================================================================''')

#========================================================================
# Stacking
test_pred /= fold
test['prediction'] = test_pred
stack_test = test[[key, 'prediction']]
result_list.append(stack_test)
df_pred = pd.concat(result_list, axis=0, ignore_index=True).drop(target,
                                                                 axis=1)
df_pred = base.merge(df_pred, how='inner', on=key)
print(f"Stacking Shape: {df_pred.shape}")

utils.to_pkl_gzip(
    obj=df_pred,
    path=
    f'../stack/{start_time[4:12]}_elo_NN_stack_linear{is_linear*1}_{len(use_cols)}feat_lr{learning_rate}_batch{batch_size}_epoch{N_EPOCHS}_CV{cv_score}'
)
#========================================================================
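Both fragments above follow the standard out-of-fold stacking pattern: each fold's validation predictions fill the train-level stack feature, while test predictions are averaged across folds. A self-contained sketch of the pattern with hypothetical model and data:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X, y = rng.random((100, 5)), rng.random(100)
X_test = rng.random((20, 5))

oof_pred = np.zeros(len(X))        # train-level stack feature
test_pred = np.zeros(len(X_test))  # test prediction, averaged over folds
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for trn_idx, val_idx in kfold.split(X):
    model = Ridge().fit(X[trn_idx], y[trn_idx])
    oof_pred[val_idx] = model.predict(X[val_idx])
    test_pred += model.predict(X_test) / kfold.get_n_splits()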
Example #11
    if path.count('year'):
        print(np.unique(df['year'].values))
        sys.exit()
        for year in np.unique(df['year'].values):

            base = utils.read_df_pkl('../input/base0*')
            base = base.merge(df.query(f"year=={year}"),
                              how='left',
                              on='card_id')
            train = base[~base['target'].isnull()]
            test = base[base['target'].isnull()]

            for col in df.columns:
                if col.count('__'):
                    utils.to_pkl_gzip(
                        path=
                        f"../features/1_first_valid/{feat_no}train_{col.replace('__', '@').replace('his_', '')}_year{year}",
                        obj=train[col].values)
                    utils.to_pkl_gzip(
                        path=
                        f"../features/1_first_valid/{feat_no}test_{col.replace('__', '@').replace('his_', '')}_year{year}",
                        obj=test[col].values)

    else:
        if path.count('dow') and path.count('timezone'):

            for month in np.unique(df['latest_month_no'].values):
                for dow in np.unique(df['dow'].values):
                    for timezone in np.unique(df['timezone'].values):

                        base = utils.read_df_pkl('../input/base0*')
                        base = base.merge(
Example #12
    
    base_train = base[~base[target].isnull()]
    base_test = base[base[target].isnull()]
    
    scaler = StandardScaler()
    scaler.fit(train_test[use_cols])
    x_test = scaler.transform(test[use_cols])
    
    #========================================================================
    # 
    df = scaler.transform(train_test[use_cols])
    del train_test
    gc.collect()
    for num, col in enumerate(tqdm(use_cols)):
        feature = df[:, num]
        utils.to_pkl_gzip(obj=feature, path=f'../features/2_second_valid/stan_{col}')
    sys.exit()
    #========================================================================
    
    del train_test
    gc.collect()
else:
    base = pd.concat([train[[key, target, 'country_group']], test[[key, target, 'country_group']] ], axis=0)
    base_train = base[~base[target].isnull()]
    use_cols = [col for col in train.columns if col not in ignore_list]
    x_test = test[use_cols]

Y = train[target]

print(f"Train: {train.shape} | Test: {test.shape}") 
# ========================================================================
#          train.reset_index(inplace=True)
#          out_ids = train.loc[train.target<-30, key].values
#          out_val = train.loc[train.target<-30, target].values
#          if len(seed_list)==1:
#              out_pred = df_pred[df_pred[key].isin(out_ids)]['prediction'].values
#          else:
#              out_pred = df_pred[df_pred[key].isin(out_ids)]['pred_mean'].values
#          out_score = np.sqrt(mean_squared_error(out_val, out_pred))
#      else:
#          out_score = 0
#  else:
#      out_score = 0

# Save
utils.to_pkl_gzip(
    path=
    f"../stack/{start_time[4:12]}_stack_{model_type}_lr{learning_rate}_{feature_num}feats_multi{multi}_val{sys.argv[4]}_{len(seed_list)}seed_{num_leaves}leaves_iter{iter_avg}_TERM{base_term}_CV{str(cv_score).replace('.', '-')}_LB",
    obj=df_pred)

# Drop unneeded columns
drop_feim_cols = [
    col for col in cv_feim.columns
    if col.count('importance_') or col.count('rank_')
]
cv_feim.drop(drop_feim_cols, axis=1, inplace=True)
drop_feim_cols = [
    col for col in cv_feim.columns
    if col.count('importance') and not (col.count('avg'))
]
cv_feim.drop(drop_feim_cols, axis=1, inplace=True)
cv_feim.to_csv(
    f'../valid/{start_time[4:12]}_valid_{model_type}_lr{learning_rate}_{feature_num}feats_multi{multi}_val{sys.argv[4]}_{len(seed_list)}seed_{num_leaves}leaves_iter{iter_avg}_TERM{base_term}_CV{cv_score}_LB.csv',
Example #14
        #  utils.to_pkl_gzip(obj=base_test[[key, 'prediction']], path=save_path)

        #========================================================================

cv_score = np.mean(score_list)
logger.info(f'''
#========================================================================
# CV SCORE AVG: {cv_score}
#========================================================================''')

test_pred /= fold_no + 1

base_train['prediction'] = oof_pred
base_test['prediction'] = test_pred

#========================================================================
# Stacking
if is_oof:
    df_stack = pd.concat([base_train, base_test], axis=0, ignore_index=True)
    print(f"DF Stack Shape: {df_stack.shape}")
#========================================================================

if is_debug:
    sys.exit()

utils.to_pkl_gzip(
    obj=df_stack,
    path=
    f'../stack/{start_time}_NN_NLP_{comment}_feat{X_train.shape[1]}_fold{fold_n}_CV{cv_score}_LB'
)
Example #15
    score, tmp_oof, tmp_pred, feim, _ = ml_utils.Classifier(
        model_type=model_type,
        x_train=x_train,
        y_train=y_train,
        x_val=x_val,
        y_val=y_val,
        x_test=x_test,
        params=params,
        seed=seed,
        get_score=metric,
        get_feim=get_feim)

    logger.info(f"Fold{num_fold} CV: {score}")
    score_list.append(score)
    oof_pred[val_idx] = tmp_oof
    y_test += tmp_pred

y_test /= len(get_fold_list)
pred_col = 'prediction'
base[pred_col] = np.hstack((oof_pred, y_test))
base = base[[key, pred_col]]
#========================================================================
# Saving
utils.to_pkl_gzip(
    obj=base,
    path=
    f'../output/{start_time[4:12]}_stack_{model_type}_FOLD-{get_fold}_feat{n_features}_seed{seed}_{comment}'
)
#========================================================================
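ml_utils.Classifier is not shown. A minimal sketch of a fold-level wrapper with a compatible return shape, assuming a LightGBM backend (the real wrapper, its params handling, and its importance output may differ):

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import roc_auc_score

def classifier_sketch(x_train, y_train, x_val, y_val, x_test, params, seed):
    # Assumes params does not already carry a random_state.
    model = lgb.LGBMClassifier(**params, random_state=seed)
    model.fit(x_train, y_train, eval_set=[(x_val, y_val)])
    tmp_oof = model.predict_proba(x_val)[:, 1]
    tmp_pred = model.predict_proba(x_test)[:, 1]
    score = roc_auc_score(y_val, tmp_oof)
    feim = pd.DataFrame({'feature': x_train.columns,
                         'importance': model.feature_importances_})
    return score, tmp_oof, tmp_pred, feim, model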
Example #16
    #     return pararell_write_lda_feat(*args)
    # def pararell_write_lda_feat(i, bow):
    #     tmp = np.zeros(topics+1)
    # ===
    topic = lda.get_document_topics(bow)
    for (tp_no, prob) in topic:
        mx[i][tp_no] = prob
# Parallel
#         tmp[tp_no] = prob
#     tmp[topics+1] = i
#     return tmp
# p_list = pararell_process(pararell_wrapper, arg_list)

cols = [f"{feat_no}_topic{i}@" for i in range(20)]
df_lda = pd.DataFrame(mx, columns=cols)

train_idx = train.index
test_idx = test.index
lda_train = df_lda.loc[train_idx, :]
lda_test = df_lda.loc[test_idx, :]

#========================================================================
# Save Feature
#========================================================================
logger.info("Save Features...")
for col in lda_train.columns:
    utils.to_pkl_gzip(obj=lda_train[col].values,
                      path=f'../features/1_first_valid/train_{col}')
    utils.to_pkl_gzip(obj=lda_test[col].values,
                      path=f'../features/1_first_valid/test_{col}')
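The fragment above assumes a trained gensim LDA model, a bag-of-words corpus, and a pre-allocated matrix mx. A sketch of that setup (the tokenization and the toy text_list are assumptions; names follow the fragment):

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel

text_list = ["alpha beta gamma", "beta gamma delta", "gamma delta alpha"] * 10
docs = [text.split() for text in text_list]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

topics = 20
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=topics)

mx = np.zeros((len(corpus), topics))
for i, bow in enumerate(corpus):
    for tp_no, prob in lda.get_document_topics(bow):
        mx[i][tp_no] = prob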
Example #17
        #========================================================================

    cv_score = np.mean(score_list)

    #========================================================================
    # Stacking
    test_pred /= fold_no + 1
    test['prediction'] = test_pred
    stack_test = test[[key, 'prediction']]

    result_list.append(stack_test)
    df_pred = pd.concat(result_list, axis=0, ignore_index=True).drop(target,
                                                                     axis=1)
    if key not in base:
        base.reset_index(inplace=True)
    df_pred = base[[key, target]].merge(df_pred, how='inner', on=key)

    print(f'''
    # =====================================================================
    #  SCORE AVG: {cv_score}
    # =====================================================================''')

    #========================================================================
    # Save Stack
    feature = df_pred['prediction'].values
    utils.to_pkl_gzip(
        path=
        f"../features/1_first_valid/{start_time[4:12]}_stack_{model_type}_set-{set_type}_valid-{valid_type}_seed{fold_seed}_feat{len(use_cols)}_CV{cv_score}_LB",
        obj=feature)
    #========================================================================
#========================================================================
cv_score = LGBM.cv_score
test_pred = LGBM.prediction
cv_feim = LGBM.cv_feim
feature_num = len(LGBM.use_cols)

cv_feim.to_csv(
    f'../valid/{start_time[4:12]}_{model_type}_{fname}_feat{feature_num}_CV{cv_score}_lr{learning_rate}.csv',
    index=False)

#========================================================================
# STACKING
if len(stack_name) > 0:
    logger.info(f'result_stack shape: {LGBM.result_stack.shape}')
    utils.to_pkl_gzip(
        path=
        f"../stack/{start_time[4:12]}_{stack_name}_{model_type}_CV{str(cv_score).replace('.', '-')}_{feature_num}features",
        obj=LGBM.result_stack)
logger.info(
    f'FEATURE IMPORTANCE PATH: {HOME}/kaggle/home-credit-default-risk/output/cv_feature{feature_num}_importances_{metric}_{cv_score}.csv'
)
#========================================================================

#========================================================================
# Submission
if len(submit) > 0:
    submit[target] = test_pred
    submit.to_csv(
        f'../submit/{start_time[4:12]}_submit_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
        index=False)
#========================================================================
Example #19
if len(stack_name) > 0:
    logger.info(f'result_stack shape: {df_pred.shape}')
    if len(seed_list) > 1:
        pred_cols = [col for col in df_pred.columns if col.count('predict')]
        df_pred['pred_mean'] = df_pred[pred_cols].mean(axis=1)
        df_pred['pred_std'] = df_pred[pred_cols].std(axis=1)
#========================================================================

#========================================================================
# Result
cv_score = np.mean(cv_list)
iter_avg = int(np.mean(iter_list))
#========================================================================

logger.info(f'''
#========================================================================
# {len(seed_list)}SEED CV SCORE AVG: {cv_score}
#========================================================================''')

# Save
try:
    if int(sys.argv[2]) == 0:
        utils.to_pkl_gzip(
            path=
            f"../stack/{start_time[4:12]}_stack_pred_{stack_name}_lr{learning_rate}_{feature_num}feats_{len(seed_list)}seed_{num_leaves}leaves_iter{iter_avg}_CV{str(cv_score).replace('.', '-')}",
            obj=df_pred)
except ValueError:
    pass
except TypeError:
    pass
logger.info(f'''
#========================================================================
# CV SCORE AVG: {cv_score}
#========================================================================''')

#========================================================================
# Stacking
test_pred /= fold
test['prediction'] = test_pred
stack_test = test[[key, 'prediction']]
result_list.append(stack_test)
df_pred = pd.concat(result_list, axis=0, ignore_index=True).drop(target,
                                                                 axis=1)
df_pred = base.merge(df_pred, how='inner', on=key)
print(f"Stacking Shape: {df_pred.shape}")

utils.to_pkl_gzip(obj=df_pred,
                  path=f'../output/{start_time[4:11]}_elo_NN_stack_CV{score}')
#========================================================================

sys.exit()

#========================================================================
# Part of card_id Score
bench = pd.read_csv('../input/bench_LB3684_FAM_cv_score.csv')
part_score_list = []
part_N_list = []
fam_list = []
#  for i in range(201101, 201713, 1):
for i in range(201501, 201713, 1):
    fam = str(i)[:4] + '-' + str(i)[-2:]
    df_part = base_train[base_train['first_active_month'] == fam]
    if len(df_part) < 1:
Example #21
    test_pred += np.squeeze(model.predict(x_test))
    #========================================================================

    #========================================================================
    # Scoring
    score = roc_auc_score(y_val, y_pred)
    print(f'AUC: {score}')
    score_list.append(score)
    #========================================================================

cv_score = np.mean(score_list)
logger.info(f'''
#========================================================================
# CV SCORE AVG: {cv_score}
#========================================================================''')

test_pred /= num_ + 1

base_train['prediction'] = oof_pred
base_test['prediction'] = test_pred

#========================================================================
# Stacking
df_stack = pd.concat([base_train, base_test], axis=0, ignore_index=True)
print(f"DF Stack Shape: {df_stack.shape}")
utils.to_pkl_gzip(
    obj=df_stack[[key, 'prediction']],
    path=
    f'../stack/{start_time[4:12]}_MS_stack_NN_E{set_no+1}_batch{batch_size}_epoch{N_EPOCHS}_CV{cv_score}'
)
#========================================================================
Example #22
# Build the interest-rate column list before using it
ir_cols = [col for col in df.columns if col.count('dima') and col.count('ir')]
df['dima_ir_max@'] = df[ir_cols].max(axis=1)
df['dima_ir_min@'] = df[ir_cols].min(axis=1)

# CNT_PAYMENT features -> they seem to overfit?
#  df['dima_Pred_CPY_diff_lengthX@'] = df['CNT_PAYMENT'].values - df['dima_lengthX@'].values
#  df['dima_Cal_CPY_diff_lengthX@'] = df['dima_lengthX@'].values - (df['AMT_CREDIT'].values / df['AMT_ANNUITY'].values)
#  train_file_path = f"../features/1_first_valid/train_{cpy}"
#  test_file_path = f"../features/1_first_valid/test_{cpy}"

# Feature Save
for col in ir_cols:
    if not(col.count('@')) or col in ignore_list:
        continue
    if not(col.count('ir_3@')) and not(col.count('ir_6@')) and not(col.count('ir_9@')):
        continue
    train_feat = df[df[target]>=0][col].values
    test_feat = df[df[target].isnull()][col].values
    col = col.replace('[', '_').replace(']', '_').replace(' ', '').replace(',', '_')
    train_file_path = f"../features/1_first_valid/train_{col}"
    test_file_path = f"../features/1_first_valid/test_{col}"

    utils.to_pkl_gzip(obj=train_feat, path=train_file_path)
    utils.to_pkl_gzip(obj=test_feat, path=test_file_path)

    logger.info(f'''
    #========================================================================
    # COMPLETE MAKE FEATURE : {train_file_path}
    #========================================================================''')

Example #23
print('with mybest:', sub['HasDetections'].corr(sub_best['HasDetections'],
                                                method='spearman'))

print("""
# =============================================================================
# write down these info to benchmark.xlsx
# =============================================================================
""")

for k in RESULT_DICT:
    print(f'{k:<25}: {RESULT_DICT[k]}')

print("""
# =============================================================================
""")

# save
#sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')
utils.to_pkl_gzip(sub[['HasDetections']],
                  SUBMIT_FILE_PATH.replace('.csv.gz', f'_{SEED}.pkl'))

# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    print('submit')
    utils.submit(SUBMIT_FILE_PATH, COMMENT)

#==============================================================================
utils.end(__file__)
#utils.stop_instance()
Example #24
        # Convert to lists of card_ids
        trn_list = []
        val_list = []
        for trn, val in kfold:
            trn_ids = train.iloc[trn][key].values
            val_ids = train.iloc[val][key].values
            trn_list.append(trn_ids)
            val_list.append(val_ids)
        kfold = [trn_list, val_list]

    #  else:
    #      kfold = False
    #      fold_type = 'kfold'
    #========================================================================
    if not (os.path.exists(kfold_path)):
        utils.to_pkl_gzip(obj=kfold, path=kfold_path)

    train.sort_index(axis=1, inplace=True)
    test.sort_index(axis=1, inplace=True)

    #========================================================================
    # Train & Prediction Start
    #========================================================================
    LGBM = LGBM.cross_prediction(train=train,
                                 test=test,
                                 key=key,
                                 target=target,
                                 fold_type=fold_type,
                                 fold=fold,
                                 group_col_name=group_col_name,
                                 params=params,
Example #25
# =============================================================================
# main
# =============================================================================
if __name__ == "__main__":
    utils.start(__file__)
    
    # train
    tr = utils.load_train(['object_id'])
    
    df = pd.read_pickle('../FROM_MYTEAM/LCfit_feature_allSN_r_train_v3_20181215.pkl.gz')
    df = pd.merge(tr, df, on='object_id', how='left')
    df.reset_index(drop=True, inplace=True)
    get_feature(df)
    
    del df['object_id']
    df.add_prefix(PREF+'_').to_pickle(f'../data/train_{PREF}.pkl')
    
    # test
    te = utils.load_test(['object_id'])
    df = pd.read_pickle('../FROM_MYTEAM/LCfit_feature_allSN_r_test_v3_20181215.pkl.gz')
    df = pd.merge(te, df, on='object_id', how='left')
    df.reset_index(drop=True, inplace=True)
    get_feature(df)
    
    del df['object_id']
    df = df.add_prefix(PREF+'_')
    utils.to_pkl_gzip(df, f'../data/test_{PREF}.pkl')
    
    utils.end(__file__)

Example #26
for path in path_list:

    #  fname = 'his_' + re.search(r'his_([^/.]*).csv', path).group(1)
    fname = re.search(r'feat([^/.]*)_auth', path).group(1)
    feat_no = f"{fname}_au1_"
    df = pd.read_csv(path)

    base = utils.read_df_pkl('../input/base0*')
    base = base.merge(df, how='left', on='card_id')
    train = base[~base['target'].isnull()]
    test = base[base['target'].isnull()]

    for col in df.columns:
        if col.count('__'):
            utils.to_pkl_gzip(
                path=
                f"../features/1_first_valid/{feat_no}train_{col.replace('__', '@').replace('his_', '')}@",
                obj=train[col].values)
            utils.to_pkl_gzip(
                path=
                f"../features/1_first_valid/{feat_no}test_{col.replace('__', '@').replace('his_', '')}@",
                obj=test[col].values)
        else:
            utils.to_pkl_gzip(
                path=f"../features/1_first_valid/{feat_no}train_{col}@",
                obj=train[col].values)
            utils.to_pkl_gzip(
                path=f"../features/1_first_valid/{feat_no}test_{col}@",
                obj=test[col].values)
Example #27
        #  score, tmp_oof, tmp_pred, feim = ml_utils.Classifier(
        score, tmp_oof, tmp_pred, feim, model = ml_utils.Regressor(
            model_type=model_type,
            x_train=x_train,
            y_train=y_train,
            x_val=x_val,
            y_val=y_val,
            x_test=x_test,
            params=params,
            seed=seed,
            get_score=metric,
            get_model=get_model)
        if get_model:
            utils.to_pkl_gzip(
                obj=model,
                path=
                f'../model/{start_time[4:11]}_{comment}_{target}_{model_type}_fold{num_fold}_feat{len(use_cols)}_{metric}-{score}'
            )
            del model
            gc.collect()

        feim_list.append(
            feim.set_index('feature').rename(
                columns={'importance': f'imp_{num_fold}'}))

        logger.info(f"Fold{num_fold} CV: {score}")
        score_list.append(score)
        oof_pred[val_idx] = tmp_oof
        y_test += tmp_pred

    feim = pd.concat(feim_list, axis=1)
Example #28
        score, tmp_oof, tmp_pred, feim, _ = ml_utils.Classifier(
            model_type=model_type,
            x_train=x_train,
            y_train=y_train,
            x_val=x_val,
            y_val=y_val,
            x_test=x_test,
            params=params,
            seed=seed,
            get_score=metric)
        feim_list.append(
            feim.set_index('feature').rename(
                columns={'importance': f'imp_{num_fold}'}))

        logger.info(f"Fold{num_fold} CV: {score}")
        score_list.append(score)
        oof_pred[val_idx] = tmp_oof
        y_test += tmp_pred

    pred_col = 'prediction'
    base[pred_col] = np.hstack((oof_pred, y_test))
    base = base[[key, pred_col]]
    #========================================================================
    # Saving
    utils.to_pkl_gzip(
        obj=base,
        path=
        f'../output/{start_time[4:12]}_stack_{model_type}_FOLD-{get_fold}_feat{len(x_train.columns)}_{comment}'
    )
    #========================================================================
Example #29
                                    how='inner')
df_stack = pd.concat([train, test], ignore_index=True, axis=0)
print(f"After Stack Shape: {df_stack.shape}")

y_train = train[target].values
y_pred = train[pred_col].values
from sklearn.metrics import roc_auc_score
cv_score = roc_auc_score(y_train, y_pred)
logger.info(f'''
#========================================================================
# CV: {cv_score}
#========================================================================''')

#========================================================================
# Saving
feim.to_csv(
    f'../valid/{start_time[4:12]}_{model_type}_SET-{set_type}_feat{len(x_train.columns)}_{comment}_CV{str(cv_score)[:7]}_LB.csv',
    index=True)
utils.to_pkl_gzip(
    obj=df_stack,
    path=
    f'../stack/{start_time[4:12]}_{model_type}_SET-{set_type}_feat{len(x_train.columns)}_{comment}_CV{str(cv_score)[:7]}_LB'
)

submit = pd.read_csv('../input/sample_submission.csv').set_index(key)
submit[target] = test[pred_col].values
submit.to_csv(
    f'../submit/{start_time[4:12]}_{model_type}_SET-{set_type}_feat{len(x_train.columns)}_{comment}_CV{str(cv_score)[:7]}_LB.csv',
    index=True)
#========================================================================
Example #30
    # Standardize within Test
    test = df[df[target].isnull()]
    test['bur_bin'] = 'test'
    df = pd.concat([train, test], axis=0).sort_index()

#========================================================================
# Calculate the interest rate of the Current Application
#========================================================================

# CNT_PAYMENT
file_path = "../features/1_first_valid/"

#  Current Application CNT_PAYMENT Save as Feature
#  utils.to_pkl_gzip(obj=df[~df[target].isnull()][cpy].values, path=file_path+f'train_{cpy}@')
#  utils.to_pkl_gzip(obj=df[df[target].isnull()][cpy].values, path=file_path +f'test_{cpy}@')
#  utils.to_pkl_gzip(obj=df[~df[target].isnull()][ 'Pred_CPY_diff_Cal_CPY@' ].values, path=file_path+'train_Pred_CPY_diff_Cal_CPY@')
#  utils.to_pkl_gzip(obj=df[df[target].isnull()][ 'Pred_CPY_diff_Cal_CPY@' ].values, path=file_path+'test_Pred_CPY_diff_Cal_CPY@')
#  sys.exit()

# Unclear how many payments the interest accrues over, so build a range of features in steps of 3
for cnt in range(9, 40, 3):
#  for cnt in range(27, 46, 3):