def credit_balance(subset_ids=None, silent=True, treat_cat_missing=False,
                   treat_num_missing=False, remove_duplicated_cols=False,
                   df_name="CCB"):
    group_vars = ['SK_ID_CURR', 'SK_ID_PREV']

    credit_balance = pp.read_dataset_csv(filename="credit_card_balance.csv")
    if subset_ids is not None:
        credit_balance = credit_balance.loc[credit_balance.SK_ID_CURR.isin(subset_ids)]

    if not silent:
        print("Credit Card Balance Shape: {}".format(credit_balance.shape))

    # Decrease number of categories
    credit_balance = pp.join_low_occurance_categories(
        credit_balance, silent, join_category_name="Other 2")

    # Aggregate to the (SK_ID_CURR, SK_ID_PREV) level, then roll up to the client level
    df_name_temp = ""
    counts_ccb = pp.get_counts_features(credit_balance, group_vars,
                                        df_name_temp, group_vars[1])
    ccb_agg = pp.get_engineered_features(credit_balance, group_vars, df_name_temp,
                                         num_agg_funcs=numeric_agg_funcs)
    ccb_agg = counts_ccb.merge(ccb_agg, on=group_vars, how='left')

    ccb_agg_client = pp.agg_numeric(ccb_agg, [group_vars[0]], df_name)

    return ccb_agg_client
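
# Illustrative sketch (not part of the pipeline above): one way to call credit_balance()
# for a sample of clients and join the resulting CCB features back onto the main
# application table. The application_train.csv filename and the SK_ID_CURR merge key
# come from the Home Credit dataset; the sample size and helper name are assumptions
# made only for this example.
def _example_credit_balance_features():
    app = pp.read_dataset_csv(filename="application_train.csv")
    sample_ids = app["SK_ID_CURR"].head(1000)
    ccb_features = credit_balance(subset_ids=sample_ids, silent=False)
    return app.merge(ccb_features, on="SK_ID_CURR", how="left")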
def bureau_balance(subset_ids=None, silent=True, treat_cat_missing=False,
                   treat_num_missing=False, remove_duplicated_cols=False,
                   df_name="BB"):
    group_vars = ['SK_ID_CURR', 'SK_ID_BUREAU']

    bureau_balance = pp.read_dataset_csv(filename="bureau_balance.csv")
    # bureau_balance.csv has no SK_ID_CURR, so map it in through bureau.csv
    bureau = pp.read_dataset_csv(filename="bureau.csv", usecols=group_vars)
    bureau_balance = bureau.merge(bureau_balance, on='SK_ID_BUREAU', how='inner')
    del bureau
    gc.collect()

    if subset_ids is not None:
        bureau_balance = bureau_balance.loc[bureau_balance.SK_ID_CURR.isin(subset_ids)]

    if not silent:
        print("Bureau Balance Shape: {}".format(bureau_balance.shape))

    # Decrease number of categories
    bureau_balance = pp.join_low_occurance_categories(
        bureau_balance, silent, join_category_name="Other 2")

    df_name_temp = ""
    counts_bb = pp.get_counts_features(bureau_balance, group_vars,
                                       df_name_temp, group_vars[1])
    bb_agg = pp.get_engineered_features(bureau_balance, group_vars, df_name_temp,
                                        num_agg_funcs=numeric_agg_funcs)

    # STATUS_* counts other than X (unknown), C (closed) and 0 (no DPD) correspond
    # to months with days past due
    cols_status = [
        c for c in bb_agg.columns
        if c.endswith("_COUNT") and c.find("_STATUS_") != -1 and c not in [
            df_name_temp + "_STATUS_X_COUNT",
            df_name_temp + "_STATUS_C_COUNT",
            df_name_temp + "_STATUS_0_COUNT"
        ]
    ]

    bb_agg = counts_bb.merge(bb_agg, on=group_vars, how='left')
    bb_agg[df_name_temp + "_DPD_COUNT"] = bb_agg.loc[:, cols_status].sum(axis=1)
    bb_agg[df_name_temp + "_DPD_FREQ"] = (
        bb_agg[df_name_temp + "_DPD_COUNT"] / bb_agg[df_name_temp + "_ROWCOUNT"])

    bb_agg_client = pp.agg_numeric(bb_agg, [group_vars[0]], df_name)

    return bb_agg_client
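
# Minimal sketch of the DPD idea used in bureau_balance(): in bureau_balance.csv the
# monthly STATUS codes 1-5 are days-past-due buckets, while 'C' (closed), 'X' (unknown)
# and '0' (no DPD) are not. Counting the non-C/X/0 statuses per loan therefore gives the
# number of delinquent months, which is what the cols_status filter above reproduces on
# the aggregated counts. Column and value names follow the Home Credit data; the toy
# frame itself is made up.
def _example_dpd_months():
    import pandas as pd
    toy = pd.DataFrame({
        "SK_ID_BUREAU": [1, 1, 1, 2, 2],
        "STATUS": ["0", "1", "C", "X", "2"],
    })
    is_dpd = ~toy["STATUS"].isin(["C", "X", "0"])
    return is_dpd.groupby(toy["SK_ID_BUREAU"]).sum()  # loan 1 -> 1, loan 2 -> 1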
def installments_payments(subset_ids=None, silent=True, treat_cat_missing=False,
                          treat_num_missing=False, remove_duplicated_cols=False,
                          df_name="IP"):
    group_vars = ['SK_ID_CURR', 'SK_ID_PREV']

    installments = pp.read_dataset_csv(filename="installments_payments.csv")
    if subset_ids is not None:
        installments = installments.loc[installments.SK_ID_CURR.isin(subset_ids)]

    if not silent:
        print("Installment Payments Shape: {}".format(installments.shape))

    # Percentage and difference paid in each installment (amount paid vs. installment value)
    installments['PAYMENT_PERC'] = installments['AMT_PAYMENT'] / installments['AMT_INSTALMENT']
    installments['PAYMENT_DIFF'] = installments['AMT_INSTALMENT'] - installments['AMT_PAYMENT']

    # Days past due and days before due (no negative values)
    installments['DPD'] = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']
    installments['DBD'] = installments['DAYS_INSTALMENT'] - installments['DAYS_ENTRY_PAYMENT']
    installments['DPD'] = installments['DPD'].apply(lambda x: x if x > 0 else 0)
    installments['DBD'] = installments['DBD'].apply(lambda x: x if x > 0 else 0)

    # Decrease number of categories
    installments = pp.join_low_occurance_categories(
        installments, silent, join_category_name="Other 2")

    df_name_temp = ""
    counts_ip = pp.get_counts_features(installments, group_vars,
                                       df_name_temp, group_vars[1])
    ip_agg = pp.get_engineered_features(installments, group_vars, df_name_temp,
                                        num_agg_funcs=numeric_agg_funcs)
    ip_agg = counts_ip.merge(ip_agg, on=group_vars, how='left')

    ip_agg_client = pp.agg_numeric(ip_agg, [group_vars[0]], df_name)

    return ip_agg_client
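
# Aside (illustration only, not used above): the element-wise
# apply(lambda x: x if x > 0 else 0) calls in installments_payments() can be written
# with the vectorized pandas clip(), which gives the same result for non-null values
# and is typically faster on large frames (note that clip() keeps NaN as NaN, while
# the lambda maps NaN to 0).
def _example_clip_negative_days():
    import pandas as pd
    dpd = pd.Series([-3.0, 0.0, 7.0])
    return dpd.clip(lower=0)  # -> 0.0, 0.0, 7.0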
def bureau(subset_ids=None, silent=True, treat_cat_missing=False,
           treat_num_missing=False, remove_duplicated_cols=False, df_name="BU"):
    group_var = ['SK_ID_CURR']

    bureau = pp.read_dataset_csv(filename="bureau.csv")
    if subset_ids is not None:
        bureau = bureau.loc[bureau.SK_ID_CURR.isin(subset_ids)]

    if not silent:
        print("Bureau Shape: {}".format(bureau.shape))

    # Decrease number of categories
    bureau = pp.join_low_occurance_categories(bureau, silent,
                                              join_category_name="Other 2")

    if treat_num_missing:
        if not silent:
            print("Treating numericals missing...")
        bureau = pp.handle_missing_median(
            bureau,
            pp.get_numerical_missing_cols(bureau),
            group_by_cols=["CREDIT_TYPE"])

    if not silent:
        print("Aggregating BUREAU by categories of 'SK_ID_CURR' and 'CREDIT_ACTIVE'...")
    bu_agg_1 = pp.agg_categorical_numeric(
        bureau, df_name + "1",
        group_var=['SK_ID_CURR', 'CREDIT_ACTIVE'],
        num_columns=['DAYS_CREDIT', 'AMT_ANNUITY'])

    if not silent:
        print("Aggregating BUREAU by only 'SK_ID_CURR'...")
    counts = pp.get_counts_features(bureau, group_var, df_name + "2")
    bu_agg_2 = pp.get_engineered_features(bureau, group_var, df_name + "2",
                                          num_agg_funcs=numeric_agg_funcs)
    bu_agg_2 = counts.merge(bu_agg_2, on=group_var[0], how='left')

    return bu_agg_1.merge(bu_agg_2, on=group_var[0], how='left')
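
# Sketch of the grouped median imputation that pp.handle_missing_median is assumed to
# perform above (fill numeric NaNs with the median of the same CREDIT_TYPE group). The
# helper's real behaviour lives in the pp module, so this plain-pandas version is only
# an illustration of the idea, not the implementation used by the pipeline.
def _example_group_median_fill(df, num_cols, group_col="CREDIT_TYPE"):
    out = df.copy()
    for col in num_cols:
        out[col] = out.groupby(group_col)[col].transform(lambda s: s.fillna(s.median()))
    return out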
def previous_application(subset_ids=None, silent=True, treat_cat_missing=False,
                         treat_num_missing=False, remove_duplicated_cols=False,
                         df_name="PA"):
    group_var = ['SK_ID_CURR']

    previous_application = pp.read_dataset_csv(filename="previous_application.csv")
    if subset_ids is not None:
        previous_application = previous_application.loc[
            previous_application.SK_ID_CURR.isin(subset_ids)]

    if not silent:
        print("Previous Application Shape: {}".format(previous_application.shape))

    if not silent:
        print("Deleting columns with high occurrence of nulls...")
    previous_application.drop(
        ['RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED'],
        axis=1, inplace=True)
    #previous_application.drop(['RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'DAYS_FIRST_DRAWING'], axis=1, inplace=True)

    previous_application.NFLAG_INSURED_ON_APPROVAL.fillna(0, inplace=True)
    previous_application.loc[:, 'NFLAG_INSURED_ON_APPROVAL'] = \
        previous_application.loc[:, 'NFLAG_INSURED_ON_APPROVAL'].astype('int32')

    # Ratio of the amount applied for to the amount of credit actually granted
    previous_application['APP_CREDIT_PERC'] = (
        previous_application['AMT_APPLICATION'] / previous_application['AMT_CREDIT'])

    # Label Encode
    previous_application = pp.label_encode(previous_application, silent)

    # Decrease number of categories
    previous_application = pp.join_low_occurance_categories(
        previous_application, silent, join_category_name="Other 2")
    previous_application.PRODUCT_COMBINATION.fillna("Other 2", inplace=True)

    previous_application = treat_anomalies(
        previous_application,
        columns=[
            'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
            'DAYS_LAST_DUE', 'DAYS_TERMINATION'
        ])

    if treat_num_missing:
        if not silent:
            print("Treating numericals missing...")
            print(pp.check_missing(previous_application[
                pp.get_numerical_missing_cols(previous_application)]))
        previous_application = pp.handle_missing_median(
            previous_application,
            pp.get_numerical_missing_cols(previous_application),
            group_by_cols=["NAME_CONTRACT_STATUS"])
        if not silent:
            print(pp.check_missing(previous_application[
                pp.get_numerical_missing_cols(previous_application)]))

    if not silent:
        print("Aggregating PREVIOUS APPLICATION by categories of 'SK_ID_CURR' "
              "and 'NAME_CONTRACT_STATUS'...")
    pa_agg_1 = pp.agg_categorical_numeric(
        previous_application, df_name + "1",
        group_var=['SK_ID_CURR', 'NAME_CONTRACT_STATUS'])

    if not silent:
        print("Aggregating PREVIOUS APPLICATION by only 'SK_ID_CURR'...")
    counts = pp.get_counts_features(previous_application, group_var, df_name + "2")
    pa_agg_2 = pp.get_engineered_features(previous_application, group_var,
                                          df_name + "2",
                                          num_agg_funcs=numeric_agg_funcs)
    pa_agg_2 = counts.merge(pa_agg_2, on=group_var[0], how='left')

    return pa_agg_1.merge(pa_agg_2, on=group_var[0], how='left')
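
# Sketch (assumption: each builder above returns a DataFrame keyed on SK_ID_CURR) of how
# the five client-level feature tables could be combined into a single frame before being
# joined to the application data. The outer merge keeps clients that only appear in some
# of the source tables; the function name and call order are choices made for this example.
def _example_build_client_features(subset_ids=None, silent=True):
    builders = [credit_balance, bureau_balance, installments_payments,
                bureau, previous_application]
    features = builders[0](subset_ids=subset_ids, silent=silent)
    for build in builders[1:]:
        features = features.merge(build(subset_ids=subset_ids, silent=silent),
                                  on="SK_ID_CURR", how="outer")
    return features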