Beispiel #1
0
def get_train_test(feat_path_list, base=None, target='target'):
    """Load feature files, join them column-wise and split into train/test.

    Rows whose ``target`` value is null become the test set; every other row
    becomes the train set.

    Args:
        feat_path_list: Paths handed to ``utils.parallel_load_data``. The
            loaded objects are concatenated along ``axis=1``.
        base: Optional pandas object placed to the left of the loaded
            features. ``None`` (or a legacy empty list) means there is no
            base frame to prepend.
        target: Name of the column whose null-ness separates test rows from
            train rows.

    Returns:
        A ``(train, test)`` tuple; both parts have their index reset.
    """
    # The former default (``base=[]``) was a shared mutable object and failed
    # on ``base.shape``; treat it, like ``None``, as "nothing to prepend".
    has_base = not (base is None or (isinstance(base, list) and not base))
    if has_base:
        print(base.shape)

    feature_list = utils.parallel_load_data(path_list=feat_path_list)
    df_feat = pd.concat(feature_list, axis=1)
    if has_base:
        df_feat = pd.concat([base, df_feat], axis=1)

    # Compute the null mask once and reuse it for both halves of the split.
    is_test = df_feat[target].isnull()
    train = df_feat[~is_test].reset_index(drop=True)
    test = df_feat[is_test].reset_index(drop=True)

    return train, test
Beispiel #2
0
# File paths defined for later use (not referenced within this section).
save_file_path = '../output/valid_single_feature.csv'
check_score_path = 'check_score.csv'

# Column-name constants.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
# NOTE(review): presumably the columns to exclude from the model features -- confirm.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

# glob() gives no ordering guarantee, so every listing is sorted to keep the
# order of the loaded feature files reproducible across runs and machines.
paths_train = sorted(glob('../feature/raw_use/*_train.gz'))
paths_train += sorted(glob('../feature/org_use/*_train.gz'))
# Optional feature directories, currently disabled:
#  paths_train += sorted(glob('../feature/sub_use/*_train.gz'))
#  paths_train += sorted(glob('../feature/valid_use/*_train.gz'))

df_train = parallel_load_data(paths_train)

# Attach the pre-computed grouping values stored in the pickle as a new column.
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group


#========================================================================
# Negative Down Sampling
#========================================================================
# `frac` is not used in this section; presumably the fraction of negative
# rows kept by the sampling code that follows -- confirm.
frac = 0.2
# `seed` is defined outside this section.
np.random.seed(seed)
df_pos = df_train[df_train.isFraud==1]
df_neg = df_train[df_train.isFraud!=1]
# Drop the full frame now that it has been split, to lower peak memory.
del df_train
gc.collect()
Beispiel #3
0
    if path.count(''):
        return True
    else:
        return False


# glob() gives no ordering guarantee; sort both listings so the train and
# test files are loaded in a reproducible order (and in matching order,
# assuming the two sets share file-name prefixes).
paths_train = sorted(glob('../submit/re_sub/*_train.gz'))
paths_test = sorted(glob('../submit/re_sub/*_test.gz'))
# Alternative, narrower selections (currently disabled):
# paths_train += glob('../submit/re_sub/Tran*_train.gz')
# paths_test  += glob('../submit/re_sub/Tran*_test.gz')
# paths_train += glob('../submit/re_sub/is*_train.gz')
# paths_test  += glob('../submit/re_sub/is*_test.gz')

# Quick sanity check on how many train files were found.
print(len(paths_train))

df_train = parallel_load_data(paths_train)
df_test = parallel_load_data(paths_test)

### DT-M
# Attach the pre-computed grouping values stored in the pickle as a new
# column on the train frame.
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

Y = df_train[COLUMN_TARGET]

# Manual toggle: the trailing index picks the value (0 -> True, 1 -> False).
is_submit = [True, False][0]
n_splits = 6
set_type = 'new_set'

# These are aliases of the loaded frames, not copies.
tmp_train = df_train
tmp_test = df_test