# Ejemplo n.º 1 (Example 1) — scraped-snippet marker; score: 0
def get_dataset(base, model_no):
    """Load feature files, join them onto *base*, and split into train/test.

    Parameters
    ----------
    base : pd.DataFrame
        Base frame; must contain the module-level ``target`` column.
        Rows whose target is null become the test set.
    model_no : int
        Index (0-4) selecting one of the candidate model feature paths.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(train, test)`` with per-column imputation applied.
    """
    win_path = '../features/4_winner/*.gz'
    #  win_path = '../features/1_first_valid/*.gz'
    # Candidate model feature directories, selected by model_no.
    # BUG FIX: the original applied [model_no] twice — once on the list
    # literal and again on the result — so model_path ended up being a
    # single character of a path string. Index exactly once, below.
    model_path_list = [
        '../model/LB3670_70leaves_colsam0322/*.gz',
        '../model/E2_lift_set/*.gz', '../model/E3_PCA_set/*.gz',
        '../model/E4_mix_set/*.gz', '../model/LB3669LB_70leaves/*.gz'
    ]
    model_path = model_path_list[model_no]
    tmp_path_list = glob.glob('../features/5_tmp/*.gz') + glob.glob(
        '../features/0_exp/*.gz')
    #  tmp_path_list = glob.glob('../features/5_tmp/*.gz')

    # Experiment toggles: the live choice excludes model_path features.
    # (A redundant earlier assignment that was immediately overwritten
    # has been removed — glob.glob has no side effects.)
    win_path_list = glob.glob(win_path) + tmp_path_list
    #  win_path_list = glob.glob(model_path) + glob.glob(win_path) + tmp_path_list
    #  win_path_list = glob.glob(model_path) + tmp_path_list
    #  win_path_list = glob.glob(model_path) + glob.glob(win_path)
    #========================================================================

    # Load all feature files in parallel and join them column-wise.
    feature_list = utils.parallel_load_data(path_list=win_path_list)
    df_feat = pd.concat(feature_list, axis=1)
    base = pd.concat([base, df_feat], axis=1)

    # Null target marks test rows (Kaggle-style combined train/test frame).
    train = base[~base[target].isnull()]
    test = base[base[target].isnull()]

    if debug:
        train = train.head(10000)
        test = test.head(1000)

    # Per-column imputation; identifier/meta columns are skipped.
    for col in train.columns:
        if col in ignore_list:
            continue
        train[col] = utils.impute_feature(df=train, col=col)
        test[col] = utils.impute_feature(df=test, col=col)

    return train, test
# Rebuild the test frame: column-join base_test with the tail rows of df
# beyond the train length. Assumes df rows are ordered train-then-test —
# TODO confirm against the (unseen) code that built df.
test = pd.concat(
    [base_test, df.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)

train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

# Optionally drop extreme-target outliers (target <= -30) from training only.
if out_part == 'no_out':
    train = train[train[target] > -30]
#========================================================================

#========================================================================
# Pre-normalization cleanup (fill nulls, handle inf/-inf)
# NOTE(review): bare `impute_feature` here vs `utils.impute_feature`
# elsewhere in this file — confirm it was imported directly into scope.
for col in train.columns:
    if col in ignore_list: continue

    train[col] = impute_feature(train, col)
    test[col] = impute_feature(test, col)
#========================================================================

# #========================================================================
# # inf check
# length = len(train)
# for col in train.columns:
#     tmp = train[col].dropna().shape[0]
#     if length - tmp>0:
#         print(col)

#     inf_max = train[col].max()
#     inf_min = train[col].min()
#     if inf_max==np.inf or inf_min==-np.inf:
#         print(col, inf_max, inf_min)
# Ejemplo n.º 3 (Example 3) — scraped-snippet marker; score: 0
    # Interior of a categorical-encoding loop; the loop header is not
    # visible in this chunk (presumably `for col in <categorical cols>:`
    # — verify against the full file).
    # Fill missing values with the column's most frequent category.
    max_freq = list(train[col].value_counts().index)[0]
    train[col].fillna(max_freq, inplace=True)
    test[col].fillna(max_freq, inplace=True)
    # Fit one LabelEncoder on the union of train/test categories so both
    # frames share a single consistent integer mapping.
    le = LabelEncoder().fit(
        pd.concat([train[col], test[col]],
                  axis=0).value_counts().index.tolist())
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
#========================================================================

#========================================================================
# Pre-normalization cleanup (fill nulls, handle inf/-inf)
for col in train.columns:
    if col in ignore_list: continue

    train[col] = utils.impute_feature(train, col)
    test[col] = utils.impute_feature(test, col)
#========================================================================

#========================================================================
# Normalization (standard scaling)
from sklearn.preprocessing import StandardScaler

# Stack train and test, then re-split by target nullness (test rows have
# a null target).
train_test = pd.concat([train, test], axis=0)
base_train = train_test[~train_test[target].isnull()]
base_test = train_test[train_test[target].isnull()]

use_cols = [col for col in train.columns if col not in ignore_list]
scaler = StandardScaler()
# NOTE(review): fitting on train+test leaks test statistics into the
# scaler — common in competition pipelines, but confirm it is intended.
# The matching transform() call is presumably below this chunk.
scaler.fit(train_test[use_cols])