def post_process(df):
    """Add combined credit/annuity features and NaN/inf counters to ``df``.

    Runs the module's ``factorize`` helper on ``df``, zero-fills the
    previous-application and active-bureau sum columns, then derives the
    ``*_AP`` (application + previous), ``*_AB`` (application + bureau) and
    combined feature columns. Finally records, per row, how many cells are
    NaN and how many are +/-inf.

    Args:
        df: DataFrame that contains at least ``AMT_ANNUITY``, ``AMT_CREDIT``,
            ``AMT_INCOME_TOTAL``, ``PREV_AMT_ANNUITY_SUM``,
            ``PREV_AMT_CREDIT_SUM`` and ``BURE_ACT_AMT_CREDIT_SUM_SUM``.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    factorize(df)

    # fillna
    # Assign the filled column back instead of calling
    # ``df[col].fillna(0, inplace=True)``: the chained in-place form acts on
    # a temporary under pandas Copy-on-Write and would leave ``df`` untouched.
    for col in (
            'PREV_AMT_ANNUITY_SUM',
            'PREV_AMT_CREDIT_SUM',
            'BURE_ACT_AMT_CREDIT_SUM_SUM',
    ):
        df[col] = df[col].fillna(0)

    # calculate length
    df['ANNUITY_SUM_AP'] = df['AMT_ANNUITY'] + df['PREV_AMT_ANNUITY_SUM']
    df['CREDIT_SUM_AP'] = df['AMT_CREDIT'] + df['PREV_AMT_CREDIT_SUM']
    df['ANNUITY_SUM_LENGTH_AP'] = df['CREDIT_SUM_AP'] / df['ANNUITY_SUM_AP']
    df['DIFF_ANNUITY_AND_INCOME_SUM_AP'] = (
        df['AMT_INCOME_TOTAL'] - df['ANNUITY_SUM_AP'])

    df['CREDIT_SUM_AB'] = df['AMT_CREDIT'] + df['BURE_ACT_AMT_CREDIT_SUM_SUM']

    df['ANNUITY_SUM'] = df['AMT_ANNUITY'] + df['PREV_AMT_ANNUITY_SUM']
    df['CREDIT_SUM'] = (df['AMT_CREDIT'] + df['PREV_AMT_CREDIT_SUM'] +
                        df['BURE_ACT_AMT_CREDIT_SUM_SUM'])
    df['ANNUITY_SUM_LENGTH'] = df['CREDIT_SUM'] / df['ANNUITY_SUM']
    df['DIFF_ANNUITY_AND_INCOME_SUM'] = (
        df['AMT_INCOME_TOTAL'] - df['ANNUITY_SUM'])

    df['COUNT_NAN'] = df.isnull().sum(axis=1)
    # Count +/-inf cells directly. The previous implementation toggled the
    # 'mode.use_inf_as_na' option and subtracted COUNT_NAN; that option is
    # deprecated and no longer available in recent pandas releases. The
    # per-row result is the same: the number of infinite cells.
    df['COUNT_INF'] = df.isin([float('inf'), float('-inf')]).sum(axis=1)
    # TODO: mutate_all(funs(ifelse(is.nan(.), NA, .))) %>%
    return df
def load(idx):
    """Build train/valid/test frames from the credit.* and app.agg features.

    The five ``credit.*`` feather files are indexed by ``SK_ID_CURR`` and
    concatenated column-wise, then merged (pandas' default join type) together
    with ``app.agg`` onto the preprocessed application train and test tables.
    The combined frame is passed through ``factorize`` and split back by
    whether ``TARGET`` is present; finally the rows whose ``SK_ID_CURR``
    appears in ``./data/fold.<idx>.feather`` are moved out of train into the
    validation frame.

    Args:
        idx: Value formatted into the fold file name.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames with reset indexes.
    """
    print('merge...')
    credit_paths = (
        './data/credit.agg.feather',
        './data/credit.last.feather',
        './data/credit.prev.last.feather',
        './data/credit.diff.feather',
        './data/credit.tail.feather',
    )
    cred = pd.concat(
        [pd.read_feather(path).set_index('SK_ID_CURR')
         for path in credit_paths],
        axis=1,
    )
    app = pd.read_feather('./data/app.agg.feather')

    test = (
        pd.read_feather('./data/application_test.preprocessed.feather')
        .merge(cred, on='SK_ID_CURR')
        .merge(app, on='SK_ID_CURR')
    )
    # Test rows carry no label; NaN marks them for the split below.
    test['TARGET'] = np.nan
    train = (
        pd.read_feather('./data/application_train.preprocessed.feather')
        .merge(cred, on='SK_ID_CURR')
        .merge(app, on='SK_ID_CURR')
    )
    del cred, app
    gc.collect()

    print('post process...')
    df = pd.concat([train, test])
    factorize(df)
    labelled = df['TARGET'].notnull()
    train = df[labelled].reset_index(drop=True)
    test = df[~labelled].reset_index(drop=True)
    del df
    gc.collect()

    print('split...')
    fold_path = './data/fold.{}.feather'.format(idx)
    print('load {}'.format(fold_path))
    fold = pd.read_feather(fold_path)
    in_fold = train['SK_ID_CURR'].isin(fold['SK_ID_CURR'])
    valid = train[in_fold].reset_index(drop=True)
    train = train[~in_fold].reset_index(drop=True)

    for label, frame in (('train', train), ('valid', valid), ('test', test)):
        print('{}: {}'.format(label, frame.shape))
    return train, valid, test
def load(idx):
    """Build train/valid/test frames from the pos.* feature files.

    Starts from ``pos.agg`` and left-merges the remaining ``pos.*`` feather
    files on ``SK_ID_CURR``. The result is merged (pandas' default join type)
    onto the preprocessed application train and test tables, passed through
    ``factorize``, split back by whether ``TARGET`` is present, and the rows
    whose ``SK_ID_CURR`` appears in ``./data/fold.<idx>.feather`` are moved
    out of train into the validation frame.

    Args:
        idx: Value formatted into the fold file name.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames with reset indexes.
    """
    pos = pd.read_feather('./data/pos.agg.feather')
    extra_paths = (
        './data/pos.diff.feather',
        './data/pos.tail.feather',
        './data/pos.trend.feather',
        './data/pos.last.feather',
        './data/pos.grp.feather',
    )
    for path in extra_paths:
        print('merge {}...'.format(path))
        gc.collect()
        pos = pos.merge(pd.read_feather(path), on='SK_ID_CURR', how='left')
        # NOTE(review): the flattened original does not show whether this
        # shape print ran per merge or once after the loop - confirm.
        print(pos.shape)

    test = pd.read_feather(
        './data/application_test.preprocessed.feather',
    ).merge(pos, on='SK_ID_CURR')
    # Test rows carry no label; NaN marks them for the split below.
    test['TARGET'] = np.nan
    train = pd.read_feather(
        './data/application_train.preprocessed.feather',
    ).merge(pos, on='SK_ID_CURR')
    del pos
    gc.collect()

    print('post process...')
    df = pd.concat([train, test])
    factorize(df)
    labelled = df['TARGET'].notnull()
    train = df[labelled].reset_index(drop=True)
    test = df[~labelled].reset_index(drop=True)
    del df
    gc.collect()

    print('split...')
    fold_path = './data/fold.{}.feather'.format(idx)
    print('load {}'.format(fold_path))
    fold = pd.read_feather(fold_path)
    in_fold = train['SK_ID_CURR'].isin(fold['SK_ID_CURR'])
    valid = train[in_fold].reset_index(drop=True)
    train = train[~in_fold].reset_index(drop=True)

    for label, frame in (('train', train), ('valid', valid), ('test', test)):
        print('{}: {}'.format(label, frame.shape))
    return train, valid, test
def load(idx):
    """Build train/valid/test frames from the bureau.* and app.agg features.

    Starts from ``bureau.agg``, merges the other ``bureau.*`` files and
    ``app.agg`` on ``SK_ID_CURR`` (pandas' default join type), and passes the
    result through ``add_bure_features``. That table is merged onto the
    preprocessed application train and test tables. The combined frame goes
    through ``factorize``, gets ``BURE_ACT_AMT_CREDIT_SUM_SUM`` zero-filled
    and a ``CREDIT_SUM_AB`` column added, and is split back by whether
    ``TARGET`` is present. Rows whose ``SK_ID_CURR`` appears in
    ``./data/fold.<idx>.feather`` are moved out of train into validation.

    Args:
        idx: Value formatted into the fold file name.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames with reset indexes.
    """
    print('merge...')
    bure = pd.read_feather('./data/bureau.agg.feather')
    for fname in [
            './data/bureau.active.feather',
            './data/bureau.closed.feather',
            './data/bureau.grp.feather',
            './data/app.agg.feather',
    ]:
        bure = bure.merge(pd.read_feather(fname), on='SK_ID_CURR')
    bure = add_bure_features(bure)

    test = pd.read_feather('./data/application_test.preprocessed.feather')
    test = test.merge(bure, on='SK_ID_CURR')
    # Test rows carry no label; NaN marks them for the split below.
    test['TARGET'] = np.nan
    train = pd.read_feather('./data/application_train.preprocessed.feather')
    train = train.merge(bure, on='SK_ID_CURR')
    del bure
    gc.collect()

    print('post process...')
    df = pd.concat([train, test])
    factorize(df)
    # Assign the filled column back instead of calling
    # ``df[col].fillna(0, inplace=True)``: the chained in-place form acts on
    # a temporary under pandas Copy-on-Write and would leave ``df`` untouched,
    # letting NaN propagate into CREDIT_SUM_AB.
    df['BURE_ACT_AMT_CREDIT_SUM_SUM'] = (
        df['BURE_ACT_AMT_CREDIT_SUM_SUM'].fillna(0))
    df['CREDIT_SUM_AB'] = df['AMT_CREDIT'] + df['BURE_ACT_AMT_CREDIT_SUM_SUM']
    has_target = df['TARGET'].notnull()
    train = df[has_target].reset_index(drop=True)
    test = df[~has_target].reset_index(drop=True)
    del df
    gc.collect()

    print('split...')
    fold_path = './data/fold.{}.feather'.format(idx)
    print('load {}'.format(fold_path))
    fold = pd.read_feather(fold_path)
    # Compute the membership mask once and reuse it for both halves.
    in_fold = train['SK_ID_CURR'].isin(fold['SK_ID_CURR'])
    valid = train[in_fold].reset_index(drop=True)
    train = train[~in_fold].reset_index(drop=True)

    print('train: {}'.format(train.shape))
    print('valid: {}'.format(valid.shape))
    print('test: {}'.format(test.shape))
    return train, valid, test