Example #1
def eval_check_feature(df_train, df_test, is_corr=False):
    # Drop features that carry no information or duplicate another feature
    print("* Check Unique Feature.")
    list_unique_drop = drop_unique_feature(df_train, df_test)
    
    if len(list_unique_drop):
        print(f"  * {len(list_unique_drop)}feature unique drop and move trush")
        print(list_unique_drop)
        for col in list(set(list_unique_drop)):
            from_dir = 'valid'
            to_dir = 'valid_trush'
#             if col.count('raw'):
#                 from_dir = 'raw_use'
#                 to_dir = 'raw_trush'
#             else:
#                 from_dir = 'org_use'
#                 to_dir = 'org_trush'
            try:
                move_feature([col], from_dir, to_dir)
            except FileNotFoundError:
                from_dir = 'valid'
                to_dir = 'valid_trush'
                move_feature([col], from_dir, to_dir)
                
    return list_unique_drop
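Example #1 relies on two helpers that are not shown here: drop_unique_feature and move_feature. Below is a minimal sketch of what they might look like, assuming the feature store keeps each feature as a pair of gzip files named <feature>_train.gz / <feature>_test.gz under ../feature/<dir>/ (the file layout, suffixes, and the exact drop criteria are assumptions, not taken from the source).

import shutil
from pathlib import Path

FEATURE_ROOT = Path('../feature')  # assumed root of the feature store


def drop_unique_feature(df_train, df_test):
    """Return feature names that carry no information: constant in train or
    test, or an exact duplicate of another train column (assumed criteria)."""
    drop_cols = [
        col for col in df_train.columns
        if df_train[col].nunique(dropna=False) <= 1
        or (col in df_test.columns and df_test[col].nunique(dropna=False) <= 1)
    ]
    dup_mask = df_train.T.duplicated()          # duplicate columns in train
    dup_cols = dup_mask[dup_mask].index.tolist()
    return sorted(set(drop_cols) | set(dup_cols))


def move_feature(feature_list, from_dir, to_dir):
    """Move the train/test files of each feature between store directories.
    shutil.move raises FileNotFoundError when the feature is not in from_dir,
    which is what the caller's try/except relies on."""
    for feature in feature_list:
        for suffix in ('_train.gz', '_test.gz'):
            src = FEATURE_ROOT / from_dir / f'{feature}{suffix}'
            dst = FEATURE_ROOT / to_dir / f'{feature}{suffix}'
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(src), str(dst))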
Example #2
         
if cnt==3:
    with open(check_score_path, 'a') as f:
        line = f'{feature_name},{cv}\n'
        f.write(line)

    df_score = pd.read_csv(check_score_path, header=None)
    if len(df_score)>2:
        from_dir = 'valid'
        to_dir = 'sub_use'
        df_score.columns = ['feature', 'score']
        df_score.sort_values(by='score', ascending=False, inplace=True)
        best_feature = df_score['feature'].values[0]
        if best_feature.count('_train'):
            best_feature = best_feature.replace('_train', '')
        move_feature([best_feature], from_dir, to_dir)
        os.system(f'rm {check_score_path}')
        os.system(f'touch {check_score_path}')


#========================================================================
# PostProcess
#========================================================================
to_dir = '../feature/check_trush/'
with timer("  * PostProcess"):
    for path in valid_path:
        try:
            shutil.move(path, to_dir)
            shutil.move(path.replace('_train', '_test'), to_dir)
        except FileNotFoundError:
            print(feature_name)
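Example #2 wraps the post-processing step in a timer context manager that is also not defined in the snippet. A common implementation, given here only as a sketch of the assumed helper:

import time
from contextlib import contextmanager


@contextmanager
def timer(name):
    """Print the elapsed time of the wrapped block (assumed helper)."""
    start = time.time()
    yield
    print(f'{name}: {time.time() - start:.1f} s')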
Example #3
tmp_train = df_train.join(df_feat_train)
tmp_test = df_test.join(df_feat_test)

#========================================================================
# Drop features that exist in only one of train / test
#========================================================================
diff_cols = list(set(tmp_train.columns) - set(tmp_test.columns))
for col in list(set(diff_cols)):
    if col.count('raw'):
        from_dir = 'raw_use'
        to_dir = 'raw_trush'
    else:
        from_dir = 'org_use'
        to_dir = 'org_trush'
    move_feature([col], from_dir, to_dir)
tmp_train.drop(diff_cols, axis=1, inplace=True)

#========================================================================
# GroupKFold
#========================================================================
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
tmp_train[COLUMN_GROUP] = group
# same_user_path = '../output/same_user_pattern/20190901_user_ids_share.csv'
#  same_user_path = '../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv'

model_type = "lgb"
params = {
    'n_jobs': 64,
    #  'n_jobs': 48,
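Examples #3 and #4 load the fold grouping with read_pkl_gzip, which is not defined in the snippets. Assuming the .gz input is a gzip-compressed pickle, a minimal version would be:

import gzip
import pickle


def read_pkl_gzip(path):
    """Load a gzip-compressed pickle (assumed format of the .gz inputs)."""
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)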
Example #4
    tmp_train = df_train.join(df_feat_train)
    tmp_test = df_test.join(df_feat_test)
else:
    tmp_train = df_train
    tmp_test = df_test

#========================================================================
# Drop features that exist in only one of train / test
#========================================================================
diff_cols = list(set(tmp_train.columns) - set(tmp_test.columns))

for col in list(set(diff_cols)):
    from_dir = 'valid'
    to_dir = 'valid_trush'
    move_feature([col], from_dir, to_dir)
tmp_train.drop(diff_cols, axis=1, inplace=True)
print(f"  * Diff Features: {len(diff_cols)}")

# same_user_path = '../output/same_user_pattern/20190901_user_ids_share.csv'
# same_user_path = '../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv'
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
tmp_train[COLUMN_GROUP] = group

# Try excluding 2017-12
# if not has_dec:
#     tmp_train = tmp_train[tmp_train[COLUMN_GROUP]!='2017-12']
#     Y = Y.loc[tmp_train.index]
#     n_splits = 5
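The COLUMN_GROUP column added above holds the DT-M month label, so grouped cross-validation keeps every row of one month in the same fold. The split itself is outside the snippet; a sketch of how the grouping would typically be consumed with scikit-learn's GroupKFold (the function name iter_group_folds is hypothetical):

from sklearn.model_selection import GroupKFold


def iter_group_folds(tmp_train, Y, n_splits):
    """Yield train/valid indices without splitting a DT-M month across folds."""
    groups = tmp_train[COLUMN_GROUP]
    features = tmp_train.drop(COLUMN_GROUP, axis=1)
    gkf = GroupKFold(n_splits=n_splits)
    for trn_idx, val_idx in gkf.split(features, Y, groups=groups):
        yield trn_idx, val_idx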