Ejemplo n.º 1
0
def preprocess_dfs(use_features, is_local=False, logger=None, debug=True):
    # read dataframes
    with timer("read datasets"):
        if debug:
            nrows = 200000
        else:
            nrows = None

        sub = pd.read_csv(base_path + '/sample_submission.csv')

        if is_local:
            org_train = pickle_load("../input/train.pkl")
            org_test = pickle_load("../input/test.pkl")
        else:
            org_train = pd.read_csv(base_path + "/train.csv", nrows=nrows)
            org_test = pd.read_csv(base_path + "/test.csv", nrows=nrows)

        org_train = memory_reducer(org_train, verbose=True)
        org_test = org_test[org_test.installation_id.isin(sub.installation_id)]
        org_test.sort_values(['installation_id', 'timestamp'], inplace=True)
        org_test.reset_index(inplace=True)
        org_test = memory_reducer(org_test, verbose=True)

        train_labels = pd.read_csv(base_path + "/train_labels.csv",
                                   nrows=nrows)
        specs = pd.read_csv(base_path + "/specs.csv", nrows=nrows)

    # basic preprocess
    org_train["timestamp"] = pd.to_datetime(org_train["timestamp"])
    org_test["timestamp"] = pd.to_datetime(org_test["timestamp"])

    with timer("merging features"):
        train_df = add_features(use_features,
                                org_train,
                                org_test,
                                train_labels,
                                specs,
                                datatype="train",
                                is_local=is_local,
                                logger=None)
        train_df = train_df.reset_index(drop=True)
        test_df = add_features(use_features,
                               org_train,
                               org_test,
                               train_labels,
                               specs,
                               datatype="test",
                               is_local=is_local,
                               logger=None)
        test_df = test_df.reset_index(drop=True)


#     df = pd.concat([df, feat_df], axis=1)
    print("preprocess done!!")
    # train_df.to_csv('./scratch_generated_train1.csv', index=False)
    # train_df.to_csv('./generated_train1.csv', index=False)

    return train_df, test_df
Ejemplo n.º 2
0
def feature_maker(feat_cls, is_overwrite, org_train, org_test,
                  train_labels, params, logger, is_local):
    feat_ = feat_cls(train_labels, params, logger)
    feat_name = feat_.name
    datatype = feat_.datatype
    feature_dir = './mnt/inputs/features'
    # feature_dir = os.path.join(os.path.dirname("__file__"), "../feature")
    feature_path = Path(feature_dir) / f"{datatype}" / f"{feat_name}.pkl"

    print(f'feature_path: {feature_path}')
    print(f'os.path.exists(feature_path): {os.path.exists(feature_path)}')
    print(f'is_overwrite: {is_overwrite}')
    if os.path.exists(feature_path) and is_overwrite is False:
        f_df = pickle_load(feature_path)
    else:
        f_df = feat_.feature_extract(org_train, org_test)

    return f_df
Ejemplo n.º 3
0
def add_features(use_features,
                 org_train,
                 org_test,
                 train_labels,
                 specs,
                 datatype,
                 is_local=False,
                 logger=None):
    # 都度計算する
    feat_params = {
        "datatype": datatype,
        "debug": True,
        "is_overwrite": True,
    }

    # base feature
    base_feat = KernelBasics2(train_labels, feat_params, logger)
    feature_dir = './mnt/inputs/features'
    # feature_dir = os.path.join(os.path.dirname("__file__"), "../feature")
    feature_path = Path(feature_dir) / f"{datatype}" / f"{base_feat.name}.pkl"

    if os.path.exists(feature_path):
        feat_df = pickle_load(feature_path)
    else:
        feat_df = base_feat.feature_extract(org_train, org_test)

    feat_df = feat_df.sort_values(['game_session',
                                   'installation_id']).reset_index(drop=True)

    # add event_counts
    for name, feat_condition in use_features.items():
        feat_cls = feat_condition[0]
        is_overwrite = feat_condition[1]

        f_df = feature_maker(feat_cls, is_overwrite, org_train, org_test,
                             train_labels, feat_params, logger, is_local)
        feat_df = pd.merge(feat_df,
                           f_df,
                           how="left",
                           on=["installation_id", "game_session"])
        del f_df

    return feat_df