Esempio n. 1
0
def post_process(df):
    """Add combined credit/annuity features and per-row NaN/inf counts.

    ``df`` is modified in place and also returned.

    Args:
        df: DataFrame holding at least ``AMT_ANNUITY``, ``AMT_CREDIT``,
            ``AMT_INCOME_TOTAL``, ``PREV_AMT_ANNUITY_SUM``,
            ``PREV_AMT_CREDIT_SUM`` and ``BURE_ACT_AMT_CREDIT_SUM_SUM``.

    Returns:
        The same ``df`` object with the derived columns added.
    """
    # The helper lives elsewhere in the project; its return value is ignored
    # here, so it is expected to work on ``df`` in place.
    factorize(df)

    # Missing aggregates count as zero in the sums below.  The result is
    # assigned back rather than using ``df[col].fillna(0, inplace=True)``:
    # that chained in-place form acts on a temporary under pandas
    # Copy-on-Write and would leave ``df`` unchanged.
    for col in ('PREV_AMT_ANNUITY_SUM',
                'PREV_AMT_CREDIT_SUM',
                'BURE_ACT_AMT_CREDIT_SUM_SUM'):
        df[col] = df[col].fillna(0)

    # Current application + previous applications ("AP").
    # The *_LENGTH ratio is credit / annuity; a zero annuity yields +/-inf,
    # which is what COUNT_INF below picks up.
    df['ANNUITY_SUM_AP'] = df['AMT_ANNUITY'] + df['PREV_AMT_ANNUITY_SUM']
    df['CREDIT_SUM_AP'] = df['AMT_CREDIT'] + df['PREV_AMT_CREDIT_SUM']
    df['ANNUITY_SUM_LENGTH_AP'] = df['CREDIT_SUM_AP'] / df['ANNUITY_SUM_AP']
    df['DIFF_ANNUITY_AND_INCOME_SUM_AP'] = (
        df['AMT_INCOME_TOTAL'] - df['ANNUITY_SUM_AP'])

    # Current application + active bureau credit ("AB").
    df['CREDIT_SUM_AB'] = df['AMT_CREDIT'] + df['BURE_ACT_AMT_CREDIT_SUM_SUM']

    # Current application + previous applications + active bureau credit.
    df['ANNUITY_SUM'] = df['AMT_ANNUITY'] + df['PREV_AMT_ANNUITY_SUM']
    df['CREDIT_SUM'] = (df['AMT_CREDIT'] + df['PREV_AMT_CREDIT_SUM'] +
                        df['BURE_ACT_AMT_CREDIT_SUM_SUM'])
    df['ANNUITY_SUM_LENGTH'] = df['CREDIT_SUM'] / df['ANNUITY_SUM']
    df['DIFF_ANNUITY_AND_INCOME_SUM'] = (
        df['AMT_INCOME_TOTAL'] - df['ANNUITY_SUM'])

    # Per-row count of missing values (taken before COUNT_INF is added).
    df['COUNT_NAN'] = df.isnull().sum(axis=1)

    # Per-row count of +/-inf values.  Counted by direct comparison instead
    # of the ``mode.use_inf_as_na`` option, which newer pandas releases
    # deprecate and remove.  Only numeric columns are inspected.
    numeric = df.select_dtypes(include='number')
    pos_inf = float('inf')
    df['COUNT_INF'] = (
        numeric.eq(pos_inf) | numeric.eq(-pos_inf)).sum(axis=1)
    # TODO: port the remaining R step
    #   mutate_all(funs(ifelse(is.nan(.), NA, .))) %>%

    return df
Esempio n. 2
0
def load(idx):
    """Assemble train/valid/test frames from the credit feature files.

    The credit feature tables are indexed by ``SK_ID_CURR`` and concatenated
    column-wise, then merged (pandas' default inner merge) together with
    ``app.agg`` onto the preprocessed application tables.  Train and test
    are stacked for a single ``factorize`` call and separated again by
    whether ``TARGET`` is set.

    Args:
        idx: Fold identifier substituted into ``./data/fold.{}.feather``.
            Training rows whose ``SK_ID_CURR`` is listed in that file become
            the validation set.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames with reset indexes.
    """
    key = 'SK_ID_CURR'

    print('merge...')
    credit_files = (
        './data/credit.agg.feather',
        './data/credit.last.feather',
        './data/credit.prev.last.feather',
        './data/credit.diff.feather',
        './data/credit.tail.feather',
    )
    # Index each table by applicant id so the concat lines rows up by id.
    cred = pd.concat(
        [pd.read_feather(path).set_index(key) for path in credit_files],
        axis=1)
    app = pd.read_feather('./data/app.agg.feather')

    test = pd.read_feather('./data/application_test.preprocessed.feather')
    test = test.merge(cred, on=key)
    test = test.merge(app, on=key)
    # Test rows carry no label; NaN is the marker used to split them later.
    test['TARGET'] = np.nan

    train = pd.read_feather('./data/application_train.preprocessed.feather')
    train = train.merge(cred, on=key)
    train = train.merge(app, on=key)

    # Release the feature tables before the stacked frame is built.
    del cred, app
    gc.collect()

    print('post process...')
    df = pd.concat([train, test])
    factorize(df)
    labelled = df['TARGET'].notnull()
    train = df[labelled].reset_index(drop=True)
    test = df[~labelled].reset_index(drop=True)
    del df
    gc.collect()

    print('split...')
    fold_path = './data/fold.{}.feather'.format(idx)
    print('load', fold_path)
    fold_ids = pd.read_feather(fold_path)[key]
    in_fold = train[key].isin(fold_ids)
    valid = train[in_fold].reset_index(drop=True)
    train = train[~in_fold].reset_index(drop=True)

    print('train:', train.shape)
    print('valid:', valid.shape)
    print('test:', test.shape)
    return train, valid, test
def load(idx):
    """Assemble train/valid/test frames from the POS feature files.

    ``pos.agg`` is the base table; the other POS feature tables are
    left-merged onto it on ``SK_ID_CURR``.  The combined table is then
    merged (pandas' default inner merge) onto the preprocessed application
    tables.  Train and test are stacked for a single ``factorize`` call and
    separated again by whether ``TARGET`` is set.

    Args:
        idx: Fold identifier substituted into ``./data/fold.{}.feather``.
            Training rows whose ``SK_ID_CURR`` is listed in that file become
            the validation set.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames with reset indexes.
    """
    key = 'SK_ID_CURR'

    pos = pd.read_feather('./data/pos.agg.feather')
    extra_files = (
        './data/pos.diff.feather',
        './data/pos.tail.feather',
        './data/pos.trend.feather',
        './data/pos.last.feather',
        './data/pos.grp.feather',
    )
    for path in extra_files:
        print('merge {}...'.format(path))
        # Collect before each merge to keep peak memory down.
        gc.collect()
        extra = pd.read_feather(path)
        pos = pos.merge(extra, on=key, how='left')
        del extra
    print(pos.shape)

    test = pd.read_feather('./data/application_test.preprocessed.feather')
    test = test.merge(pos, on=key)
    # Test rows carry no label; NaN is the marker used to split them later.
    test['TARGET'] = np.nan

    train = pd.read_feather('./data/application_train.preprocessed.feather')
    train = train.merge(pos, on=key)

    del pos
    gc.collect()

    print('post process...')
    df = pd.concat([train, test])
    factorize(df)
    labelled = df['TARGET'].notnull()
    train = df[labelled].reset_index(drop=True)
    test = df[~labelled].reset_index(drop=True)
    del df
    gc.collect()

    print('split...')
    fold_path = './data/fold.{}.feather'.format(idx)
    print('load', fold_path)
    fold_ids = pd.read_feather(fold_path)[key]
    in_fold = train[key].isin(fold_ids)
    valid = train[in_fold].reset_index(drop=True)
    train = train[~in_fold].reset_index(drop=True)

    print('train:', train.shape)
    print('valid:', valid.shape)
    print('test:', test.shape)
    return train, valid, test
Esempio n. 4
0
def load(idx):
    """Assemble train/valid/test frames from the bureau feature files.

    ``bureau.agg`` is merged with the other bureau tables and ``app.agg`` on
    ``SK_ID_CURR`` (pandas' default inner merge), passed through
    ``add_bure_features``, and merged onto the preprocessed application
    tables.  Train and test are stacked for a single ``factorize`` call and
    for the ``CREDIT_SUM_AB`` feature, then separated again by whether
    ``TARGET`` is set.

    Args:
        idx: Fold identifier substituted into ``./data/fold.{}.feather``.
            Training rows whose ``SK_ID_CURR`` is listed in that file become
            the validation set.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames with reset indexes.
    """
    print('merge...')
    bure = pd.read_feather('./data/bureau.agg.feather')
    for fname in [
            './data/bureau.active.feather', './data/bureau.closed.feather',
            './data/bureau.grp.feather', './data/app.agg.feather'
    ]:
        bure = bure.merge(pd.read_feather(fname), on='SK_ID_CURR')
    bure = add_bure_features(bure)
    test = pd.read_feather('./data/application_test.preprocessed.feather')
    test = test.merge(bure, on='SK_ID_CURR')
    # Test rows carry no label; NaN is the marker used to split them later.
    test['TARGET'] = np.nan
    train = pd.read_feather('./data/application_train.preprocessed.feather')
    train = train.merge(bure, on='SK_ID_CURR')
    del bure

    gc.collect()

    print('post process...')
    df = pd.concat([train, test])
    factorize(df)
    # Assign the filled column back instead of calling
    # ``df[col].fillna(0, inplace=True)``: the chained in-place form acts on
    # a temporary under pandas Copy-on-Write and would leave the NaNs (and
    # therefore NaN sums below) in place.
    df['BURE_ACT_AMT_CREDIT_SUM_SUM'] = (
        df['BURE_ACT_AMT_CREDIT_SUM_SUM'].fillna(0))
    df['CREDIT_SUM_AB'] = df['AMT_CREDIT'] + df['BURE_ACT_AMT_CREDIT_SUM_SUM']
    train = df[pd.notnull(df['TARGET'])].reset_index(drop=True)
    test = df[pd.isnull(df['TARGET'])].reset_index(drop=True)
    del df
    gc.collect()

    print('split...')
    fold_path = './data/fold.{}.feather'.format(idx)
    print('load {}'.format(fold_path))
    fold = pd.read_feather(fold_path)
    # Rows listed in the fold file are held out for validation.
    in_fold = train['SK_ID_CURR'].isin(fold['SK_ID_CURR'])
    valid = train[in_fold].reset_index(drop=True)
    train = train[~in_fold].reset_index(drop=True)
    print('train: {}'.format(train.shape))
    print('valid: {}'.format(valid.shape))
    print('test: {}'.format(test.shape))
    return train, valid, test