def _create_feature(cls, conf) -> pd.DataFrame:
    """Aggregate POS/cash balance rows into one feature row per SK_ID_CURR.

    The frame is loaded with ``PosCash.get_df(conf)``; values of
    ``CNT_INSTALMENT_FUTURE`` above 60 are blanked to NaN, one comparison
    flag is added, and the frame is passed through ``one_hot_encoder``.
    Every column is then aggregated per ``SK_ID_CURR``: columns reported
    as categorical by the encoder get their mean, all others get
    min/max/size/mean/var/sum.

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to ``PosCash.get_df``.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` with ``POS_<column>_<STAT>``
        columns plus a ``POS_COUNT`` column holding the group size.
    """
    pos = PosCash.get_df(conf)

    # Outlier handling: implausibly large remaining-instalment counts.
    future_col = 'CNT_INSTALMENT_FUTURE'
    too_large = pos[future_col] > 60
    pos.loc[too_large, future_col] = np.nan

    # Flag rows where the term exceeds the remaining instalments.
    term_exceeds_future = pos['CNT_INSTALMENT'] > pos[future_col]
    pos['pos CNT_INSTALMENT more CNT_INSTALMENT_FUTURE'] = \
        term_exceeds_future.astype(int)

    pos, encoded_cols = one_hot_encoder(pos)

    # Build the per-column aggregation spec in column order.
    numeric_stats = ['min', 'max', 'size', 'mean', 'var', 'sum']
    agg_spec = {
        name: ['mean'] if name in encoded_cols else list(numeric_stats)
        for name in pos.columns
    }

    per_client = pos.groupby('SK_ID_CURR')
    result = per_client.agg(agg_spec)
    # Flatten the (column, statistic) MultiIndex into single names.
    result.columns = pd.Index([
        'POS_' + column + "_" + stat.upper()
        for column, stat in result.columns.tolist()
    ])

    # Number of POS rows per client.
    result['POS_COUNT'] = per_client.size()

    del per_client, pos
    gc.collect()
    return result
def _create_feature(cls, conf) -> pd.DataFrame:
    """Aggregate installment payments into one feature row per SK_ID_CURR.

    The frame is loaded with ``InstallmentsPayments.get_df(conf)`` and
    passed through ``one_hot_encoder(..., nan_as_category=True)``. Payment
    ratio/difference and days-past-due / days-before-due columns are
    derived, then a fixed set of statistics is computed per ``SK_ID_CURR``.

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to ``InstallmentsPayments.get_df``.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` with ``INSTAL_<column>_<STAT>``
        columns plus ``INSTAL_COUNT`` holding the group size.
    """
    ins = InstallmentsPayments.get_df(conf)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category=True)

    # Percentage and difference paid in each installment
    # (amount paid and installment value).
    # NOTE(review): a zero AMT_INSTALMENT yields inf/NaN in PAYMENT_PERC;
    # left as-is so the produced feature values do not change.
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

    # Days past due and days before due (no negative values).
    # Vectorised clamp instead of a per-row Python lambda: `where` keeps
    # values whose comparison is True and writes 0 everywhere else, so
    # non-positive values *and* NaN both become 0, as before.
    days_past_due = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    days_before_due = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = days_past_due.where(days_past_due > 0, 0)
    ins['DBD'] = days_before_due.where(days_before_due > 0, 0)

    # Features: perform aggregations.
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['mean', 'var'],
        'PAYMENT_DIFF': ['mean', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum'],
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    # Flatten the (column, statistic) MultiIndex into single names.
    ins_agg.columns = pd.Index([
        'INSTAL_' + e[0] + "_" + e[1].upper()
        for e in ins_agg.columns.tolist()
    ])

    # Count installment rows per client.
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()

    del ins
    gc.collect()
    return ins_agg
def _create_feature(cls, conf) -> pd.DataFrame:
    """Aggregate credit-card balance rows into one row per SK_ID_CURR.

    The frame comes from ``CreditCardBalance.get_df(conf)`` and is passed
    through ``one_hot_encoder(..., nan_as_category=True)``. ``SK_ID_PREV``
    is dropped and every remaining column is summarised with
    max/mean/sum/var per ``SK_ID_CURR``.

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to ``CreditCardBalance.get_df``.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` with ``CC_<column>_<STAT>``
        columns plus ``CC_COUNT`` holding the group size.
    """
    balance = CreditCardBalance.get_df(conf)
    # The list of encoded column names is not needed here.
    balance, _encoded_cols = one_hot_encoder(balance, nan_as_category=True)

    # General aggregations over everything except the previous-loan id.
    balance.drop(columns=['SK_ID_PREV'], inplace=True)
    per_client = balance.groupby('SK_ID_CURR')
    summary = per_client.agg(['max', 'mean', 'sum', 'var'])
    flat_names = [
        'CC_' + column + "_" + stat.upper()
        for column, stat in summary.columns.tolist()
    ]
    summary.columns = pd.Index(flat_names)

    # Number of credit-card rows per client.
    summary['CC_COUNT'] = per_client.size()

    del per_client, balance
    gc.collect()
    return summary
def _create_feature(cls, conf) -> pd.DataFrame:
    """Aggregate previous applications into one row per SK_ID_CURR.

    The frame comes from ``PreviousApplication.get_df(conf)`` and is
    passed through ``one_hot_encoder(..., nan_as_category=True)``. A fixed
    numeric aggregation spec plus the mean of every encoded column is
    computed over all rows (``PREV_`` prefix); the numeric spec alone is
    recomputed over rows flagged approved (``APPROVED_``) and refused
    (``REFUSED_``) and left-joined onto the overall result.

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to ``PreviousApplication.get_df``.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` containing the ``PREV_``,
        ``APPROVED_`` and ``REFUSED_`` feature columns.
    """

    def flatten(frame, prefix):
        # Collapse the (column, statistic) MultiIndex into flat names.
        frame.columns = pd.Index([
            prefix + column + "_" + stat.upper()
            for column, stat in frame.columns.tolist()
        ])
        return frame

    applications = PreviousApplication.get_df(conf)
    applications, encoded_cols = one_hot_encoder(
        applications, nan_as_category=True)

    # Ratio of the amount asked for to the amount actually granted.
    applications['APP_CREDIT_PERC'] = (
        applications['AMT_APPLICATION'] / applications['AMT_CREDIT'])

    # Statistics for the numeric columns.
    numeric_spec = {
        'AMT_ANNUITY': ['max', 'mean'],
        'AMT_APPLICATION': ['max', 'mean'],
        'AMT_CREDIT': ['max', 'mean'],
        'APP_CREDIT_PERC': ['max', 'mean'],
        'AMT_DOWN_PAYMENT': ['max', 'mean'],
        'AMT_GOODS_PRICE': ['max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['max', 'mean'],
        'RATE_DOWN_PAYMENT': ['max', 'mean'],
        'DAYS_DECISION': ['max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Encoded (categorical) columns only get their mean.
    categorical_spec = {name: ['mean'] for name in encoded_cols}

    combined_spec = {**numeric_spec, **categorical_spec}
    result = flatten(
        applications.groupby('SK_ID_CURR').agg(combined_spec), 'PREV_')

    # Approved first, then refused: numeric statistics only, joined in
    # that order so the output column order is unchanged.
    status_subsets = (
        ('NAME_CONTRACT_STATUS_Approved', 'APPROVED_'),
        ('NAME_CONTRACT_STATUS_Refused', 'REFUSED_'),
    )
    for flag_column, prefix in status_subsets:
        subset = applications[applications[flag_column] == 1]
        subset_agg = flatten(
            subset.groupby('SK_ID_CURR').agg(numeric_spec), prefix)
        result = result.join(subset_agg, how='left', on='SK_ID_CURR')
        del subset, subset_agg

    del applications
    gc.collect()
    return result
def _create_feature(cls, conf) -> pd.DataFrame:
    """Aggregate installment payments (outlier-cleaned) per SK_ID_CURR.

    The frame comes from ``InstallmentsPayments.get_df(conf)``. Two
    columns have out-of-range values blanked to NaN, five derived columns
    are added, and the frame is passed through ``one_hot_encoder``. Every
    column is then aggregated per ``SK_ID_CURR``: columns reported as
    categorical by the encoder get their mean, all others get
    min/max/size/mean/var/sum.

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to ``InstallmentsPayments.get_df``.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` with ``INS_<column>_<STAT>``
        columns plus ``INSTAL_COUNT`` holding the group size.
    """
    payments = InstallmentsPayments.get_df(conf)

    # Outlier handling.
    version_too_high = payments['NUM_INSTALMENT_VERSION'] > 70
    payments.loc[version_too_high, 'NUM_INSTALMENT_VERSION'] = np.nan
    entry_too_old = payments['DAYS_ENTRY_PAYMENT'] < -4000
    payments.loc[entry_too_old, 'DAYS_ENTRY_PAYMENT'] = np.nan

    # Derived columns (created in this order so the column order of the
    # aggregated output stays the same).
    payments['ins DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT'] = (
        payments['DAYS_ENTRY_PAYMENT'] - payments['DAYS_INSTALMENT'])

    is_number_100 = payments['NUM_INSTALMENT_NUMBER'] == 100
    payments['ins NUM_INSTALMENT_NUMBER_100'] = is_number_100.astype(int)

    # Linear boundary in instalment number that DAYS_INSTALMENT is
    # compared against.
    boundary = payments['NUM_INSTALMENT_NUMBER'] * 50 / 3 - 11500 / 3
    payments['ins DAYS_INSTALMENT more NUM_INSTALMENT_NUMBER'] = (
        payments['DAYS_INSTALMENT'] > boundary).astype(int)

    payments['ins AMT_INSTALMENT - AMT_PAYMENT'] = (
        payments['AMT_INSTALMENT'] - payments['AMT_PAYMENT'])
    payments['ins AMT_PAYMENT / AMT_INSTALMENT'] = (
        payments['AMT_PAYMENT'] / payments['AMT_INSTALMENT'])

    payments, encoded_cols = one_hot_encoder(payments)

    # Per-column aggregation spec, in column order.
    numeric_stats = ['min', 'max', 'size', 'mean', 'var', 'sum']
    agg_spec = {
        name: ['mean'] if name in encoded_cols else list(numeric_stats)
        for name in payments.columns
    }

    per_client = payments.groupby('SK_ID_CURR')
    result = per_client.agg(agg_spec)
    # Flatten the (column, statistic) MultiIndex into single names.
    result.columns = pd.Index([
        'INS_' + column + "_" + stat.upper()
        for column, stat in result.columns.tolist()
    ])

    # Number of installment rows per client.
    result['INSTAL_COUNT'] = per_client.size()

    del per_client, payments
    gc.collect()
    return result
def _create_feature(cls, conf) -> pd.DataFrame:
    """Aggregate credit-card balances (outlier-cleaned) per SK_ID_CURR.

    The frame comes from ``CreditCardBalance.get_df(conf)``. Two amount
    columns have values above a threshold blanked to NaN, a per-row
    missing-value count and twelve pairwise difference columns are added,
    and the frame is passed through ``one_hot_encoder``. Every column is
    then aggregated per ``SK_ID_CURR``: columns reported as categorical by
    the encoder get their mean, all others get min/max/size/mean/var/sum.

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to ``CreditCardBalance.get_df``.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` with ``CARD_<column>_<STAT>``
        columns plus ``CARD_COUNT`` holding the group size.
    """
    card = CreditCardBalance.get_df(conf)

    # Outlier handling.
    payment_too_high = card['AMT_PAYMENT_CURRENT'] > 4000000
    card.loc[payment_too_high, 'AMT_PAYMENT_CURRENT'] = np.nan
    limit_too_high = card['AMT_CREDIT_LIMIT_ACTUAL'] > 1000000
    card.loc[limit_too_high, 'AMT_CREDIT_LIMIT_ACTUAL'] = np.nan

    # Missing-value count per row, taken after the outlier blanking and
    # before any derived column exists.
    card['card missing'] = card.isnull().sum(axis=1).values

    # (new column, minuend, subtrahend); every operand is a source
    # column, so the entries are independent of one another. The order is
    # kept because it determines the output column order.
    difference_features = (
        ('card SK_DPD - MONTHS_BALANCE',
         'SK_DPD', 'MONTHS_BALANCE'),
        ('card SK_DPD_DEF - MONTHS_BALANCE',
         'SK_DPD_DEF', 'MONTHS_BALANCE'),
        ('card SK_DPD - SK_DPD_DEF',
         'SK_DPD', 'SK_DPD_DEF'),
        ('card AMT_TOTAL_RECEIVABLE - AMT_RECIVABLE',
         'AMT_TOTAL_RECEIVABLE', 'AMT_RECIVABLE'),
        ('card AMT_TOTAL_RECEIVABLE - AMT_RECEIVABLE_PRINCIPAL',
         'AMT_TOTAL_RECEIVABLE', 'AMT_RECEIVABLE_PRINCIPAL'),
        ('card AMT_RECIVABLE - AMT_RECEIVABLE_PRINCIPAL',
         'AMT_RECIVABLE', 'AMT_RECEIVABLE_PRINCIPAL'),
        ('card AMT_BALANCE - AMT_RECIVABLE',
         'AMT_BALANCE', 'AMT_RECIVABLE'),
        ('card AMT_BALANCE - AMT_RECEIVABLE_PRINCIPAL',
         'AMT_BALANCE', 'AMT_RECEIVABLE_PRINCIPAL'),
        ('card AMT_BALANCE - AMT_TOTAL_RECEIVABLE',
         'AMT_BALANCE', 'AMT_TOTAL_RECEIVABLE'),
        ('card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_ATM_CURRENT',
         'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_ATM_CURRENT'),
        ('card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_OTHER_CURRENT',
         'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT'),
        ('card AMT_DRAWINGS_CURRENT - AMT_DRAWINGS_POS_CURRENT',
         'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_POS_CURRENT'),
    )
    for feature, minuend, subtrahend in difference_features:
        card[feature] = card[minuend] - card[subtrahend]

    card, encoded_cols = one_hot_encoder(card)

    # Per-column aggregation spec, in column order.
    numeric_stats = ['min', 'max', 'size', 'mean', 'var', 'sum']
    agg_spec = {
        name: ['mean'] if name in encoded_cols else list(numeric_stats)
        for name in card.columns
    }

    per_client = card.groupby('SK_ID_CURR')
    result = per_client.agg(agg_spec)
    # Flatten the (column, statistic) MultiIndex into single names.
    result.columns = pd.Index([
        'CARD_' + column + "_" + stat.upper()
        for column, stat in result.columns.tolist()
    ])

    # Number of credit-card rows per client.
    result['CARD_COUNT'] = per_client.size()

    del per_client, card
    gc.collect()
    return result
def _create_feature(cls, conf) -> pd.DataFrame:
    """Aggregate POS/cash balance rows into one row per SK_ID_CURR.

    The frame comes from ``PosCash.get_df(conf)`` and is passed through
    ``one_hot_encoder(..., nan_as_category=True)``. A fixed set of
    statistics is computed for three numeric columns, and the mean is
    taken for every column the encoder reports.

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to ``PosCash.get_df``.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` with ``POS_<column>_<STAT>``
        columns plus ``POS_COUNT`` holding the group size.
    """
    balance = PosCash.get_df(conf)
    balance, encoded_cols = one_hot_encoder(balance, nan_as_category=True)

    # Numeric statistics first, then the mean of each encoded column.
    agg_spec = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean'],
    }
    agg_spec.update({name: ['mean'] for name in encoded_cols})

    per_client = balance.groupby('SK_ID_CURR')
    summary = per_client.agg(agg_spec)
    flat_names = [
        'POS_' + column + "_" + stat.upper()
        for column, stat in summary.columns.tolist()
    ]
    summary.columns = pd.Index(flat_names)

    # Number of POS/cash rows per client.
    summary['POS_COUNT'] = per_client.size()

    del per_client, balance
    gc.collect()
    return summary
def _create_feature(cls, conf) -> pd.DataFrame:
    """Build per-SK_ID_CURR features from bureau and bureau_balance data.

    Steps, in order:

    1. Load ``BureauBalance.get_df(conf)`` and attach per-``SK_ID_BUREAU``
       columns: first/last status, absolute month of the last row, month
       of the last ``'C'`` row, per-status counts for statuses '0'..'5'
       and their ratio to ``Month``, and the sum of the non-zero counts.
    2. One-hot encode and aggregate per ``SK_ID_BUREAU``.
    3. Load ``Bureau.get_df(conf)``, blank out-of-range values, drop rows
       whose ``DAYS_ENDDATE_FACT`` precedes ``DAYS_CREDIT``, add eight
       difference columns and one-hot encode.
    4. Left-join the bureau_balance aggregate, drop ``SK_ID_BUREAU`` and
       aggregate per ``SK_ID_CURR`` over all rows (``BURO_``), active
       rows (``ACTIVE_``) and closed rows (``CLOSED_``).

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to both ``get_df`` calls.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` containing the ``BURO_``,
        ``ACTIVE_`` and ``CLOSED_`` feature columns.
    """
    bureau_key = 'SK_ID_BUREAU'
    client_key = 'SK_ID_CURR'

    def flatten(frame, prefix=''):
        # Collapse the (column, statistic) MultiIndex into flat names.
        frame.columns = pd.Index([
            prefix + column + "_" + stat.upper()
            for column, stat in frame.columns.tolist()
        ])
        return frame

    def last_abs(two_columns, label):
        # Last row per bureau id, absolute value, as a one-column frame.
        picked = two_columns.groupby(bureau_key).last()
        picked = picked.apply(abs)
        picked.columns = [label]
        return picked

    # ------------------------------------------------------------------
    # bureau_balance: features per SK_ID_BUREAU
    # ------------------------------------------------------------------
    balance = BureauBalance.get_df(conf)

    # NOTE(review): the *last* row is stored as 'First_status' and the
    # *first* row as 'Last_status'; presumably rows arrive most-recent
    # first within each bureau id - verify against the data loader.
    status_groups = balance[[bureau_key, 'STATUS']].groupby(bureau_key)
    status_of_last_row = status_groups.last()
    status_of_last_row.columns = ['First_status']
    balance = balance.join(status_of_last_row, how='left', on=bureau_key)
    status_of_first_row = status_groups.first()
    status_of_first_row.columns = ['Last_status']
    balance = balance.join(status_of_first_row, how='left', on=bureau_key)
    del status_groups, status_of_first_row, status_of_last_row
    gc.collect()

    month = last_abs(balance[[bureau_key, 'MONTHS_BALANCE']], 'Month')
    balance = balance.join(month, how='left', on=bureau_key)
    del month
    gc.collect()

    closed_rows = balance.loc[balance['STATUS'] == 'C',
                              [bureau_key, 'MONTHS_BALANCE']]
    when_closed = last_abs(closed_rows, 'When_closed')
    balance = balance.join(when_closed, how='left', on=bureau_key)
    del closed_rows, when_closed
    gc.collect()

    balance['Month_closed_to_end'] = balance['Month'] - balance['When_closed']

    # Row counts for statuses '0'..'5', plus their share of 'Month'.
    for status in map(str, range(6)):
        count_col = 'DPD_' + status + '_cnt'
        counts = balance.loc[balance['STATUS'] == status,
                             [bureau_key, 'MONTHS_BALANCE']] \
            .groupby(bureau_key).count()
        counts.columns = [count_col]
        balance = balance.join(counts, how='left', on=bureau_key)
        balance['DPD_' + status + ' / Month'] = (
            balance[count_col] / balance['Month'])
        del counts
        gc.collect()

    balance['Non_zero_DPD_cnt'] = balance[[
        'DPD_1_cnt', 'DPD_2_cnt', 'DPD_3_cnt', 'DPD_4_cnt', 'DPD_5_cnt'
    ]].sum(axis=1)

    balance, bureau_b_cat = one_hot_encoder(balance)

    # Bureau balance: perform aggregations.
    balance_spec = {
        name: ['mean'] if name in bureau_b_cat else ['min', 'max', 'size']
        for name in balance.columns
    }
    balance_agg = flatten(balance.groupby(bureau_key).agg(balance_spec))
    del balance
    gc.collect()

    # ------------------------------------------------------------------
    # bureau: cleaning and row-level features
    # ------------------------------------------------------------------
    bureau = Bureau.get_df(conf)

    # Blank values above a per-column ceiling.
    ceilings = (
        ('AMT_ANNUITY', .8e8),
        ('AMT_CREDIT_SUM', 3e8),
        ('AMT_CREDIT_SUM_DEBT', 1e8),
        ('AMT_CREDIT_MAX_OVERDUE', .8e8),
    )
    for column, ceiling in ceilings:
        bureau.loc[bureau[column] > ceiling, column] = np.nan

    # Blank day offsets outside the accepted range.
    bureau.loc[bureau['DAYS_ENDDATE_FACT'] < -10000,
               'DAYS_ENDDATE_FACT'] = np.nan
    update_days = bureau['DAYS_CREDIT_UPDATE']
    bureau.loc[(update_days > 0) | (update_days < -40000),
               'DAYS_CREDIT_UPDATE'] = np.nan
    del update_days
    bureau.loc[bureau['DAYS_CREDIT_ENDDATE'] < -10000,
               'DAYS_CREDIT_ENDDATE'] = np.nan

    # Remove rows that end before they start.
    ends_before_start = bureau[
        bureau['DAYS_ENDDATE_FACT'] < bureau['DAYS_CREDIT']].index
    bureau.drop(ends_before_start, inplace=True)

    # (new column, minuend, subtrahend); operands are source columns
    # only, and the order fixes the output column order.
    difference_features = (
        ('bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_DEBT',
         'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT'),
        ('bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_LIMIT',
         'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_LIMIT'),
        ('bureau AMT_CREDIT_SUM - AMT_CREDIT_SUM_OVERDUE',
         'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_OVERDUE'),
        ('bureau DAYS_CREDIT - CREDIT_DAY_OVERDUE',
         'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE'),
        ('bureau DAYS_CREDIT - DAYS_CREDIT_ENDDATE',
         'DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE'),
        ('bureau DAYS_CREDIT - DAYS_ENDDATE_FACT',
         'DAYS_CREDIT', 'DAYS_ENDDATE_FACT'),
        ('bureau DAYS_CREDIT_ENDDATE - DAYS_ENDDATE_FACT',
         'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT'),
        ('bureau DAYS_CREDIT_UPDATE - DAYS_CREDIT_ENDDATE',
         'DAYS_CREDIT_UPDATE', 'DAYS_CREDIT_ENDDATE'),
    )
    for feature, minuend, subtrahend in difference_features:
        bureau[feature] = bureau[minuend] - bureau[subtrahend]

    bureau, bureau_cat = one_hot_encoder(bureau)

    # Bureau balance: merge with bureau.
    bureau = bureau.join(balance_agg, how='left', on=bureau_key)
    bureau.drop(bureau_key, axis=1, inplace=True)
    del balance_agg
    gc.collect()

    # ------------------------------------------------------------------
    # aggregate per SK_ID_CURR
    # ------------------------------------------------------------------
    # NOTE(review): the joined bureau_balance columns were renamed with a
    # statistic suffix above, so names from bureau_b_cat presumably never
    # match here and those columns receive the full numeric statistic
    # set. Kept as-is to leave the produced columns unchanged.
    categorical = bureau_cat + bureau_b_cat
    numeric_stats = ['min', 'max', 'size', 'mean', 'var', 'sum']
    client_spec = {
        name: ['mean'] if name in categorical else list(numeric_stats)
        for name in bureau.columns
    }
    result = flatten(bureau.groupby(client_key).agg(client_spec), 'BURO_')

    # Same spec over active credits, then closed credits, joined in that
    # order.
    credit_subsets = (
        ('CREDIT_ACTIVE_Active', 'ACTIVE_'),
        ('CREDIT_ACTIVE_Closed', 'CLOSED_'),
    )
    for flag_column, prefix in credit_subsets:
        subset_agg = flatten(
            bureau[bureau[flag_column] == 1]
            .groupby(client_key).agg(client_spec),
            prefix)
        result = result.join(subset_agg, how='left')
        del subset_agg
        gc.collect()

    del bureau
    gc.collect()
    return result
def _create_feature(cls, conf) -> pd.DataFrame:
    """Build per-SK_ID_CURR features from bureau and bureau_balance data.

    ``Bureau.get_df(conf)`` and ``BureauBalance.get_df(conf)`` are each
    passed through ``one_hot_encoder(..., True)``. bureau_balance is
    aggregated per ``SK_ID_BUREAU`` and left-joined onto bureau, after
    which ``SK_ID_BUREAU`` is dropped. A fixed numeric spec plus the mean
    of every encoded column is aggregated per ``SK_ID_CURR`` over all
    rows (``BURO_``); the numeric spec alone is recomputed over active
    (``ACTIVE_``) and closed (``CLOSED_``) credits and left-joined.

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to both ``get_df`` calls.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` containing the ``BURO_``,
        ``ACTIVE_`` and ``CLOSED_`` feature columns.
    """

    def flatten(frame, prefix=''):
        # Collapse the (column, statistic) MultiIndex into flat names.
        frame.columns = pd.Index([
            prefix + column + "_" + stat.upper()
            for column, stat in frame.columns.tolist()
        ])
        return frame

    credits = Bureau.get_df(conf)
    credits, credit_cat_cols = one_hot_encoder(credits, True)
    monthly = BureauBalance.get_df(conf)
    monthly, monthly_cat_cols = one_hot_encoder(monthly, True)

    # bureau_balance: aggregate per bureau id and merge into bureau.
    monthly_spec = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    monthly_spec.update({name: ['mean'] for name in monthly_cat_cols})
    monthly_agg = flatten(monthly.groupby('SK_ID_BUREAU').agg(monthly_spec))
    credits = credits.join(monthly_agg, how='left', on='SK_ID_BUREAU')
    credits.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    del monthly, monthly_agg
    gc.collect()

    # Numeric statistics for bureau and merged bureau_balance columns.
    numeric_spec = {
        'DAYS_CREDIT': ['mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum'],
    }

    # Encoded columns get their mean. The bureau_balance ones are looked
    # up under the "_MEAN" suffix they received in the per-bureau step.
    categorical_spec = {name: ['mean'] for name in credit_cat_cols}
    categorical_spec.update(
        {name + "_MEAN": ['mean'] for name in monthly_cat_cols})

    combined_spec = {**numeric_spec, **categorical_spec}
    result = flatten(
        credits.groupby('SK_ID_CURR').agg(combined_spec), 'BURO_')

    # Active first, then closed: numeric statistics only, joined in that
    # order so the output column order is unchanged.
    credit_subsets = (
        ('CREDIT_ACTIVE_Active', 'ACTIVE_'),
        ('CREDIT_ACTIVE_Closed', 'CLOSED_'),
    )
    for flag_column, prefix in credit_subsets:
        subset = credits[credits[flag_column] == 1]
        subset_agg = flatten(
            subset.groupby('SK_ID_CURR').agg(numeric_spec), prefix)
        result = result.join(subset_agg, how='left', on='SK_ID_CURR')
        del subset, subset_agg
        gc.collect()

    del credits
    gc.collect()
    return result
def _create_feature(cls, conf) -> pd.DataFrame:
    """Aggregate previous applications (outlier-cleaned) per SK_ID_CURR.

    The frame comes from ``PreviousApplication.get_df(conf)``. Values
    above a threshold in two columns and the value 365243 in five
    day-offset columns are replaced with NaN; a per-row missing-value
    count and six derived columns are added; the frame is passed through
    ``one_hot_encoder``. Every column is then aggregated per
    ``SK_ID_CURR`` (mean for columns the encoder reports as categorical,
    min/max/size/mean/var/sum otherwise) over all rows (``PREV_``),
    approved rows (``APPROVED_``) and refused rows (``REFUSED_``).

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to ``PreviousApplication.get_df``.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` containing the ``PREV_``,
        ``APPROVED_`` and ``REFUSED_`` feature columns.
    """
    df_prev = PreviousApplication.get_df(conf)

    # Replace some outliers.
    df_prev.loc[df_prev['AMT_CREDIT'] > 6000000, 'AMT_CREDIT'] = np.nan
    df_prev.loc[df_prev['SELLERPLACE_AREA'] > 3500000,
                'SELLERPLACE_AREA'] = np.nan
    # 365243 is presumably a "no date" placeholder in these columns.
    # The result must be assigned back: selecting with a list of columns
    # returns a new frame, so an in-place replace on that selection never
    # reaches df_prev and the placeholder would survive into the features.
    day_columns = [
        'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE', 'DAYS_TERMINATION'
    ]
    df_prev[day_columns] = df_prev[day_columns].replace(365243, np.nan)

    # Some new features. The missing-value count is taken after the
    # replacements above, so it includes the values just blanked.
    df_prev['prev missing'] = df_prev.isnull().sum(axis=1).values
    df_prev['prev AMT_APPLICATION / AMT_CREDIT'] = (
        df_prev['AMT_APPLICATION'] / df_prev['AMT_CREDIT'])
    df_prev['prev AMT_APPLICATION - AMT_CREDIT'] = (
        df_prev['AMT_APPLICATION'] - df_prev['AMT_CREDIT'])
    df_prev['prev AMT_APPLICATION - AMT_GOODS_PRICE'] = (
        df_prev['AMT_APPLICATION'] - df_prev['AMT_GOODS_PRICE'])
    df_prev['prev AMT_GOODS_PRICE - AMT_CREDIT'] = (
        df_prev['AMT_GOODS_PRICE'] - df_prev['AMT_CREDIT'])
    df_prev['prev DAYS_FIRST_DRAWING - DAYS_FIRST_DUE'] = (
        df_prev['DAYS_FIRST_DRAWING'] - df_prev['DAYS_FIRST_DUE'])
    df_prev['prev DAYS_TERMINATION less -500'] = (
        df_prev['DAYS_TERMINATION'] < -500).astype(int)

    # Categorical features with one-hot encoding.
    df_prev, categorical = one_hot_encoder(df_prev)

    # Aggregations for the application set: one spec entry per column.
    aggregations = {}
    for col in df_prev.columns:
        aggregations[col] = ['mean'] if col in categorical else [
            'min', 'max', 'size', 'mean', 'var', 'sum'
        ]

    df_prev_agg = df_prev.groupby('SK_ID_CURR').agg(aggregations)
    # Flatten the (column, statistic) MultiIndex into single names.
    df_prev_agg.columns = pd.Index([
        'PREV_' + e[0] + "_" + e[1].upper()
        for e in df_prev_agg.columns.tolist()
    ])

    # Previous applications: approved applications.
    approved_agg = df_prev[
        df_prev['NAME_CONTRACT_STATUS_Approved'] == 1
    ].groupby('SK_ID_CURR').agg(aggregations)
    approved_agg.columns = pd.Index([
        'APPROVED_' + e[0] + "_" + e[1].upper()
        for e in approved_agg.columns.tolist()
    ])
    df_prev_agg = df_prev_agg.join(approved_agg, how='left')
    del approved_agg
    gc.collect()

    # Previous applications: refused applications.
    refused_agg = df_prev[
        df_prev['NAME_CONTRACT_STATUS_Refused'] == 1
    ].groupby('SK_ID_CURR').agg(aggregations)
    refused_agg.columns = pd.Index([
        'REFUSED_' + e[0] + "_" + e[1].upper()
        for e in refused_agg.columns.tolist()
    ])
    df_prev_agg = df_prev_agg.join(refused_agg, how='left')

    del refused_agg, df_prev
    gc.collect()
    return df_prev_agg
def _create_feature(cls, conf) -> pd.DataFrame:
    """Build the application-level feature frame.

    The frame from ``Application.get_df(conf)`` is passed, in order,
    through ``ApplicationFeatures._features_from_kernel``,
    ``ApplicationFeatures._binarize_features`` and
    ``one_hot_encoder(..., True)``; the encoded frame is then handed to
    ``ApplicationFeatures._filter_features``.

    Args:
        cls: Not referenced in the body (the decorator is not visible
            here; presumably this is a classmethod).
        conf: Passed through unchanged to ``Application.get_df``.

    Returns:
        Whatever ``ApplicationFeatures._filter_features`` returns for the
        encoded frame.
    """
    raw = Application.get_df(conf)
    with_kernel_features = ApplicationFeatures._features_from_kernel(raw)
    binarized = ApplicationFeatures._binarize_features(with_kernel_features)
    # The list of encoded column names is not needed here.
    encoded, _encoded_cols = one_hot_encoder(binarized, True)
    return ApplicationFeatures._filter_features(encoded)