def previous_refused_amt(df, origin, feat):
    """Attach per-client amount statistics of refused previous applications.

    Rows of ``origin`` whose ``NAME_CONTRACT_STATUS`` equals ``'Refused'`` are
    selected (columns ``feat``), cast to float64, extended with five derived
    amount columns, aggregated per ``SK_ID_CURR`` and left-merged into ``df``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column; receives the features.
        origin: Previous-application records.
        feat: Columns to select from ``origin``; the code reads
            ``SK_ID_CURR``, ``AMT_CREDIT``, ``AMT_ANNUITY``,
            ``AMT_APPLICATION`` and ``AMT_GOODS_PRICE`` from the selection.

    Returns:
        ``df`` left-merged with columns prefixed ``prev_Refused_``.
    """
    is_refused = origin.NAME_CONTRACT_STATUS == 'Refused'
    refused = origin.loc[is_refused, feat].astype(np.float64)

    # The +1 in each denominator avoids division by zero.
    refused['amt_min_year'] = refused['AMT_CREDIT'] / (1 + refused['AMT_ANNUITY'])
    refused['amt_credit_trust'] = refused['AMT_CREDIT'] - refused['AMT_APPLICATION']
    refused['amt_credit_ratio'] = refused['AMT_CREDIT'] / (1 + refused['AMT_APPLICATION'])
    refused['amt_goods_ratio'] = refused['AMT_CREDIT'] / (1 + refused['AMT_GOODS_PRICE'])
    refused['amt_goods_remain'] = refused['AMT_CREDIT'] - refused['AMT_GOODS_PRICE']

    id_columns = ('SK_ID_CURR', 'SK_ID_PREV')
    aggregations = {
        column: ['min', 'median', 'max', 'mean', 'std']
        for column in refused.columns
        if column not in id_columns
    }
    summary = refused.groupby(['SK_ID_CURR']).agg(aggregations)

    # Flatten the (column, statistic) pairs into single prefixed names.
    summary.columns = [
        'prev_Refused_' + "_".join(pair) for pair in summary.columns.ravel()
    ]
    summary = summary.reset_index()
    summary = correlation_reduce(summary)

    return df.merge(summary, on=['SK_ID_CURR'], how='left')
def install_prelong(df, origin):
    """Attach per-client statistics on the gap between instalment versions.

    ``origin`` is sorted by client, previous loan and instalment number, and
    the first row of every (client, loan, instalment version) combination is
    kept. For each kept row ``pay_long`` is the distance, in instalment
    numbers, to the next kept row of the same loan (1 when there is no next
    row), and ``pay_long_amount`` is ``AMT_PAYMENT * pay_long``. Both are
    aggregated per ``SK_ID_CURR`` and left-merged into ``df``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column; receives the features.
        origin: Instalment records with ``SK_ID_CURR``, ``SK_ID_PREV``,
            ``NUM_INSTALMENT_NUMBER``, ``NUM_INSTALMENT_VERSION`` and
            ``AMT_PAYMENT`` columns.

    Returns:
        ``df`` left-merged with columns named ``ins_<column>_<statistic>``.
    """
    loan_key = ['SK_ID_CURR', 'SK_ID_PREV']
    schedule = origin.sort_values(
        loan_key + ['NUM_INSTALMENT_NUMBER']
    ).drop_duplicates(loan_key + ['NUM_INSTALMENT_VERSION'])

    next_number = schedule.groupby(loan_key)['NUM_INSTALMENT_NUMBER'].shift(-1)
    # Assign the filled Series back explicitly: a chained
    # ``frame.column.fillna(..., inplace=True)`` is not guaranteed to write
    # through to the frame (and never does under pandas Copy-on-Write).
    schedule['pay_long'] = (
        next_number - schedule['NUM_INSTALMENT_NUMBER']
    ).fillna(1)
    schedule['pay_long_amount'] = schedule['AMT_PAYMENT'] * schedule['pay_long']

    measured = ['pay_long', 'pay_long_amount']
    # Only the grouping key and the aggregated columns need the float cast.
    per_client = (
        schedule[['SK_ID_CURR'] + measured]
        .astype(np.float64)
        .groupby(['SK_ID_CURR'])
        .agg({
            column: ['mean', 'sum', 'min', 'median', 'max', 'std']
            for column in measured
        })
    )

    # Name each column from its own (column, statistic) pair. Building names
    # from the product of ``columns.levels`` depends on level ordering, which
    # need not match the column order and can mislabel statistics.
    per_client.columns = [
        "ins_" + "_".join(pair) for pair in per_client.columns.ravel()
    ]
    per_client = per_client.reset_index()
    per_client = correlation_reduce(per_client)

    return pd.merge(df, per_client, on=['SK_ID_CURR'], how='left')
def bureau_overdue(df, origin):
    """Attach two-level aggregates of the ``overdue`` column to ``df``.

    ``overdue`` is first reduced to its mean and sum per
    (``SK_ID_CURR``, ``SK_ID_BUREAU``) pair; those two values are then
    aggregated again per ``SK_ID_CURR`` and left-merged into ``df``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column; receives the features.
        origin: Records with ``SK_ID_CURR``, ``SK_ID_BUREAU`` and ``overdue``.

    Returns:
        ``df`` left-merged with columns prefixed ``bureau_``.
    """
    # Level 1: one row per bureau credit.
    per_credit = origin.groupby(['SK_ID_CURR', 'SK_ID_BUREAU']).agg(
        {"overdue": ['mean', 'sum']}
    )
    per_credit.columns = ["_".join(pair) for pair in per_credit.columns.ravel()]
    per_credit = per_credit.reset_index()

    # Level 2: one row per client.
    id_columns = ('SK_ID_CURR', 'SK_ID_BUREAU')
    aggregations = {
        column: ['mean', 'max', 'std', 'sum', 'median']
        for column in per_credit.columns
        if column not in id_columns
    }
    per_client = (
        per_credit.astype(np.float64).groupby(['SK_ID_CURR']).agg(aggregations)
    )
    per_client.columns = [
        'bureau_' + "_".join(pair) for pair in per_client.columns.ravel()
    ]
    per_client = per_client.reset_index()
    per_client = correlation_reduce(per_client)

    return df.merge(per_client, on=['SK_ID_CURR'], how='left')
def _fill_missing_from_regression(frame, model, predictor, target):
    """Fill NaNs in ``frame[target]`` with ``model`` predictions from ``frame[predictor]``.

    Only rows where ``target`` is missing and ``predictor`` is present are
    filled; rows missing both stay NaN. ``frame`` is modified in place and the
    target column is stored as float64.
    """
    values = frame[target].astype(np.float64)
    fillable = values.isnull() & frame[predictor].notnull()
    if fillable.any():
        # scikit-learn estimators require a 2-D (n_samples, n_features) input.
        inputs = (
            frame.loc[fillable, predictor]
            .values.astype(np.float64)
            .reshape(-1, 1)
        )
        values.loc[fillable] = np.ravel(model.predict(inputs))
    frame[target] = values


def bureau_trend(df, bureau):
    """Impute bureau debt and end dates, then attach trend and summary features.

    Steps:
      1. Fit a linear regression of ``AMT_CREDIT_SUM_DEBT`` on
         ``AMT_CREDIT_SUM`` (rows where both are present and non-zero) and
         use it to fill missing ``AMT_CREDIT_SUM_DEBT``.
      2. Derive ``AMT_CREDIT_DEBT_RATE`` (via ``safe_div``) and
         ``AMT_CREDIT_DEBT_REMAIN``.
      3. Fit a linear regression of the mean ``DAYS_CREDIT_ENDDATE`` per
         ``DAYS_CREDIT`` value and use it to fill missing
         ``DAYS_CREDIT_ENDDATE``; derive ``CREDIT_DAYS_LONG``.
      4. Add ``bureau_trend_features`` and per-client basic statistics, reduce
         them with ``correlation_reduce`` and left-merge into ``df``.

    Note:
        ``bureau`` is modified in place: the two imputed columns are
        overwritten and three derived columns are added, as in the original
        implementation.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column; receives the features.
        bureau: Bureau records with ``SK_ID_CURR``, ``AMT_CREDIT_SUM``,
            ``AMT_CREDIT_SUM_DEBT``, ``DAYS_CREDIT`` and
            ``DAYS_CREDIT_ENDDATE`` columns.

    Returns:
        ``df`` left-merged with the reduced feature set.
    """
    features = pd.DataFrame({'SK_ID_CURR': df['SK_ID_CURR'].unique()})

    # --- debt imputation -------------------------------------------------
    known_debt = bureau.loc[
        bureau.AMT_CREDIT_SUM.notnull()
        & bureau.AMT_CREDIT_SUM_DEBT.notnull()
        & (bureau.AMT_CREDIT_SUM != 0)
        & (bureau.AMT_CREDIT_SUM_DEBT != 0),
        ['AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT'],
    ]
    lr = LinearRegression()
    lr.fit(
        known_debt.AMT_CREDIT_SUM.values.reshape(-1, 1),
        known_debt.AMT_CREDIT_SUM_DEBT,
    )
    # Vectorised fill: the previous per-row ``lr.predict(scalar)`` is
    # rejected by scikit-learn, which only accepts 2-D inputs.
    _fill_missing_from_regression(
        bureau, lr, 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT'
    )

    bureau['AMT_CREDIT_DEBT_RATE'] = list(
        map(safe_div, bureau.AMT_CREDIT_SUM_DEBT, bureau.AMT_CREDIT_SUM)
    )
    bureau['AMT_CREDIT_DEBT_REMAIN'] = (
        bureau.AMT_CREDIT_SUM - bureau.AMT_CREDIT_SUM_DEBT
    )

    # --- end-date imputation ---------------------------------------------
    known_days = bureau.loc[
        bureau.DAYS_CREDIT.notnull() & bureau.DAYS_CREDIT_ENDDATE.notnull(),
        ['DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE'],
    ]
    # Regress on the mean end date per start day rather than on raw rows.
    known_days = (
        known_days.groupby(['DAYS_CREDIT']).DAYS_CREDIT_ENDDATE.mean().reset_index()
    )
    lr_days = LinearRegression()
    lr_days.fit(
        known_days.DAYS_CREDIT.values.reshape(-1, 1),
        known_days.DAYS_CREDIT_ENDDATE.values,
    )
    _fill_missing_from_regression(
        bureau, lr_days, 'DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE'
    )
    bureau['CREDIT_DAYS_LONG'] = bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT

    # --- feature assembly ------------------------------------------------
    with timer("trend feature:"):
        features = bureau_trend_features(features, bureau)

    with timer("basic stat"):
        summarised = [
            'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_DEBT_RATE',
            'AMT_CREDIT_DEBT_REMAIN', 'DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE',
            'CREDIT_DAYS_LONG',
        ]
        temp = bureau.groupby(['SK_ID_CURR']).agg({
            column: ['mean', 'min', 'max', 'median', 'sum', 'std']
            for column in summarised
        })
        temp.columns = [
            "bureau_trend_perspective_" + "_".join(pair)
            for pair in temp.columns.ravel()
        ]
        temp = temp.reset_index()
        features = pd.merge(features, temp, on=['SK_ID_CURR'], how='left')

    features = correlation_reduce(features)
    return pd.merge(df, features, on=['SK_ID_CURR'], how='left')
def card_amt_total_payment(df, origin, feat):
    """Attach two-level aggregates of credit-card amounts to ``df``.

    Rows of ``origin`` with a non-zero ``AMT_PAYMENT_TOTAL_CURRENT`` are
    selected (columns ``feat``), extended with difference and ratio columns,
    aggregated per (``SK_ID_CURR``, ``SK_ID_PREV``), aggregated again per
    ``SK_ID_CURR`` and left-merged into ``df``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column; receives the features.
        origin: Credit-card balance records.
        feat: Columns to select from ``origin``; the code reads
            ``SK_ID_CURR``, ``SK_ID_PREV``, ``AMT_RECIVABLE``,
            ``AMT_RECEIVABLE_PRINCIPAL``, ``AMT_CREDIT_LIMIT_ACTUAL``,
            ``AMT_BALANCE`` and ``AMT_DRAWINGS_CURRENT`` from the selection.

    Returns:
        ``df`` left-merged with columns prefixed ``card_using_``.
    """
    # Explicit copy: the derived columns below must be written to an
    # independent frame, not to a slice of ``origin`` (SettingWithCopy).
    temp = origin.loc[origin.AMT_PAYMENT_TOTAL_CURRENT != 0, feat].copy()

    # Difference variables.
    temp['AMT_RECEIVABLE_DIFF'] = temp.AMT_RECIVABLE - temp.AMT_RECEIVABLE_PRINCIPAL
    temp['AMT_BALANCE_DIFF'] = temp.AMT_CREDIT_LIMIT_ACTUAL - temp.AMT_BALANCE
    temp['AMT_DRAWINGS_DIFF'] = temp.AMT_CREDIT_LIMIT_ACTUAL - temp.AMT_DRAWINGS_CURRENT

    # Ratio variables; the +1 in each denominator avoids division by zero.
    temp['AMT_CREDIT_RATIO'] = temp.AMT_DRAWINGS_CURRENT / (1 + temp.AMT_CREDIT_LIMIT_ACTUAL)
    temp['AMT_BALANCE_RATIO'] = temp.AMT_BALANCE / (1 + temp.AMT_CREDIT_LIMIT_ACTUAL)
    temp['AMT_RECEIVABLE_RATIO'] = temp.AMT_RECIVABLE / (1 + temp.AMT_BALANCE)

    id_columns = ['SK_ID_CURR', 'SK_ID_PREV']
    stats = ['mean', 'min', 'max', 'median', 'sum', 'std']

    # Level 1: one row per card (client, previous loan).
    temp = temp.groupby(id_columns).agg({
        column: stats for column in temp.columns if column not in id_columns
    })
    # NOTE(review): unlike the sibling functions, correlation_reduce runs here
    # before the columns are flattened and the index is reset. The order is
    # kept as in the original; confirm it is intended.
    temp = correlation_reduce(temp)
    temp.columns = ["_".join(pair) for pair in temp.columns.ravel()]
    temp = temp.reset_index()

    # Level 2: one row per client.
    temp = temp.groupby(['SK_ID_CURR']).agg({
        column: stats for column in temp.columns if column not in id_columns
    })
    temp.columns = ["card_using_" + "_".join(pair) for pair in temp.columns.ravel()]
    temp = temp.reset_index()
    temp = correlation_reduce(temp)

    return pd.merge(df, temp, on=['SK_ID_CURR'], how='left')
def install_advance(df, origin):
    """Attach per-client late/over-payment instalment statistics to ``df``.

    Two aggregate tables are built per ``SK_ID_CURR`` and left-merged into
    ``df`` in turn:

    * distribution statistics (including ``iqr``, ``skew`` and ``kurtosis``)
      of ``instalment_paid_late_in_days`` and ``instalment_paid_over_amount``;
    * sum / mean / count of ``instalment_paid_late`` and
      ``instalment_paid_over``, passed through ``correlation_reduce``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column; receives the features.
        origin: Instalment records carrying the four columns named above.

    Returns:
        ``df`` with both aggregate tables left-merged on ``SK_ID_CURR``.
    """
    by_client = ['SK_ID_CURR']

    # Distribution statistics of the continuous columns.
    continuous_stats = [
        'sum', 'mean', 'median', 'min', 'max', 'std', iqr, skew, kurtosis
    ]
    continuous = origin.groupby(by_client).agg({
        column: continuous_stats
        for column in ['instalment_paid_late_in_days', 'instalment_paid_over_amount']
    })
    continuous.columns = ["_".join(pair) for pair in continuous.columns.ravel()]
    continuous = continuous.reset_index()

    # Simple counts of the flag columns; only this table is reduced, and the
    # reduction happens while SK_ID_CURR is still the index.
    flags = origin.groupby(by_client).agg({
        column: ['sum', 'mean', 'count']
        for column in ['instalment_paid_late', 'instalment_paid_over']
    })
    flags.columns = ["_".join(pair) for pair in flags.columns.ravel()]
    flags = correlation_reduce(flags)
    flags = flags.reset_index()

    merged = df.merge(continuous, on=by_client, how='left')
    return merged.merge(flags, on=by_client, how='left')
def previous_refused_day(df, origin, feat):
    """Attach per-client day-offset statistics of refused previous applications.

    Rows of ``origin`` whose ``NAME_CONTRACT_STATUS`` equals ``'Refused'`` are
    selected (columns ``feat``), cast to float64, extended with five
    day-difference columns, aggregated per ``SK_ID_CURR`` and left-merged
    into ``df``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column; receives the features.
        origin: Previous-application records.
        feat: Columns to select from ``origin``; the code reads
            ``SK_ID_CURR``, ``DAYS_FIRST_DUE``, ``DAYS_LAST_DUE``,
            ``DAYS_LAST_DUE_1ST_VERSION``, ``DAYS_TERMINATION`` and
            ``DAYS_DECISION`` from the selection.

    Returns:
        ``df`` left-merged with columns prefixed ``prev_Refused_``.
    """
    is_refused = origin.NAME_CONTRACT_STATUS == 'Refused'
    refused = origin.loc[is_refused, feat].astype(np.float64)

    refused['days_due_diff'] = refused['DAYS_LAST_DUE'] - refused['DAYS_FIRST_DUE']
    refused['days_last_due_diff'] = (
        refused['DAYS_LAST_DUE_1ST_VERSION'] - refused['DAYS_LAST_DUE']
    )
    refused['days_waiting_diff'] = refused['DAYS_FIRST_DUE'] - refused['DAYS_DECISION']
    refused['days_origin_due_diff'] = (
        refused['DAYS_LAST_DUE_1ST_VERSION'] - refused['DAYS_FIRST_DUE']
    )
    refused['days_origin_que_diff'] = (
        refused['DAYS_TERMINATION'] - refused['DAYS_LAST_DUE']
    )

    id_columns = ('SK_ID_CURR', 'SK_ID_PREV')
    aggregations = {
        column: ['min', 'median', 'max', 'mean', 'std']
        for column in refused.columns
        if column not in id_columns
    }
    summary = refused.groupby(['SK_ID_CURR']).agg(aggregations)

    # Flatten the (column, statistic) pairs into single prefixed names.
    summary.columns = [
        'prev_Refused_' + "_".join(pair) for pair in summary.columns.ravel()
    ]
    summary = summary.reset_index()
    summary = correlation_reduce(summary)

    return df.merge(summary, on=['SK_ID_CURR'], how='left')
def bureau_closed(df, origin, feat):
    """Attach per-client statistics of non-active bureau credits to ``df``.

    Rows of ``origin`` whose ``CREDIT_ACTIVE`` is not ``'Active'`` are
    selected (columns ``feat``), missing values are replaced by 0, a
    debt-rate column is added, and every column except the identifiers and
    ``DAYS_CREDIT_UPDATE`` is aggregated per ``SK_ID_CURR``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column; receives the features.
        origin: Bureau records.
        feat: Columns to select from ``origin``; the code reads
            ``SK_ID_CURR``, ``AMT_CREDIT_SUM`` and ``AMT_CREDIT_SUM_DEBT``
            from the selection.

    Returns:
        ``df`` left-merged with columns prefixed ``credit_closed_``.
    """
    # Non-in-place fillna returns an independent frame, so the column
    # assignment below never targets a slice of ``origin``.
    closed = origin.loc[origin.CREDIT_ACTIVE != 'Active', feat].fillna(0)

    # The +1 in the denominator avoids division by zero.
    closed['AMT_CREDIT_DEBT_RATE'] = closed.AMT_CREDIT_SUM_DEBT / (
        1 + closed.AMT_CREDIT_SUM
    )

    excluded = ['DAYS_CREDIT_UPDATE', 'SK_ID_CURR', 'SK_ID_BUREAU']
    aggregations = {
        column: ['min', 'median', 'max', 'mean', 'sum', 'std']
        for column in closed.columns
        if column not in excluded
    }
    summary = closed.astype(np.float64).groupby(['SK_ID_CURR']).agg(aggregations)

    summary.columns = [
        "credit_closed_" + "_".join(pair) for pair in summary.columns.ravel()
    ]
    summary = summary.reset_index()
    summary = correlation_reduce(summary)

    return pd.merge(df, summary, on=['SK_ID_CURR'], how='left')
def previous_last_k_contract(df, previous_application):
    """Attach means over each client's last 1, 3 and 5 previous applications.

    Applications are sorted by ``SK_ID_CURR`` and ``DAYS_DECISION``; for each
    k in (1, 3, 5) the last k rows per client are taken and the mean of
    ``CNT_PAYMENT``, ``DAYS_DECISION`` and ``DAYS_FIRST_DRAWING`` is computed.
    The resulting table is passed through ``correlation_reduce``, its missing
    values are set to 0, and it is left-merged into ``df``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column; receives the features.
        previous_application: Previous-application records with the four
            columns named above.

    Returns:
        ``df`` left-merged with the ``previous_application_*_last_{k}_*``
        feature columns that survive ``correlation_reduce``.
    """
    # (source column, output-name template) in the order they are merged.
    averaged_columns = (
        ('CNT_PAYMENT',
         'previous_application_term_of_last_{}_credits_mean'),
        ('DAYS_DECISION',
         'previous_application_days_decision_about_last_{}_credits_mean'),
        ('DAYS_FIRST_DRAWING',
         'previous_application_days_first_drawing_last_{}_credits_mean'),
    )

    features = pd.DataFrame(
        {'SK_ID_CURR': previous_application['SK_ID_CURR'].unique()}
    )
    ordered = previous_application.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])

    for number in (1, 3, 5):
        # Last ``number`` rows per client in DAYS_DECISION order.
        latest = ordered.groupby(by=['SK_ID_CURR']).tail(number)
        for source, template in averaged_columns:
            means = latest.groupby(by=['SK_ID_CURR'])[source].mean().reset_index()
            means = means.rename(
                index=str, columns={source: template.format(number)}
            )
            features = features.merge(means, on=['SK_ID_CURR'], how='left')

    features = correlation_reduce(features)
    features = features.fillna(0)

    return df.merge(features, on=['SK_ID_CURR'], how='left')
def card_last_two_year(df, origin):
    """Attach two-level credit-card aggregates over the last 24 months.

    Rows of ``origin`` with ``MONTHS_BALANCE >= -24`` and
    ``NAME_CONTRACT_STATUS == 'Active'`` are kept, a balance-to-limit ratio
    is added, and the numeric columns are aggregated per
    (``SK_ID_CURR``, ``SK_ID_PREV``) and then again per ``SK_ID_CURR``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column; receives the features.
        origin: Credit-card balance records.

    Returns:
        ``df`` left-merged with columns prefixed ``card_2_year_``.
    """
    selected_columns = [
        'SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE', 'AMT_BALANCE',
        'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_CURRENT',
        'AMT_PAYMENT_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL',
        'CNT_DRAWINGS_CURRENT', 'SK_DPD', 'SK_DPD_DEF',
    ]
    in_window = (origin.MONTHS_BALANCE >= -24) & (
        origin.NAME_CONTRACT_STATUS == 'Active'
    )
    recent = origin.loc[in_window, selected_columns].sort_values(
        ['SK_ID_CURR', 'MONTHS_BALANCE']
    )
    # The +1 in the denominator avoids division by zero.
    recent['amt_balance_ratio'] = recent['AMT_BALANCE'] / (
        recent['AMT_CREDIT_LIMIT_ACTUAL'] + 1
    )

    # Level 1: one row per card (client, previous loan).
    not_aggregated = ('SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE')
    per_card = recent.astype(np.float64).groupby(['SK_ID_CURR', 'SK_ID_PREV']).agg({
        column: ['min', 'median', 'max', 'mean', 'std', 'sum']
        for column in recent.columns
        if column not in not_aggregated
    })
    per_card.columns = ["_".join(pair) for pair in per_card.columns.ravel()]
    per_card = per_card.reset_index()

    # Level 2: one row per client.
    per_client = per_card.groupby(['SK_ID_CURR']).agg({
        column: ['min', 'max', 'mean', 'median', 'std', 'sum']
        for column in per_card.columns
        if column not in ('SK_ID_CURR', 'SK_ID_PREV')
    })
    per_client.columns = [
        "card_2_year_" + "_".join(pair) for pair in per_client.columns.ravel()
    ]
    # Reduction runs while SK_ID_CURR is still the index.
    per_client = correlation_reduce(per_client)
    per_client = per_client.reset_index()

    return df.merge(per_client, on=['SK_ID_CURR'], how='left')
def previous_feature(df, Debug=False):
    """Build all previous-application features and merge them into ``df``.

    The previous-application table is loaded with ``import_data``, the value
    365243 in five ``DAYS_*`` columns is replaced by NaN, and a sequence of
    feature builders is run, each timed with ``timer``. The collected
    features have missing values set to 0, are passed through
    ``correlation_reduce``, merged one-to-one into ``df`` on ``SK_ID_CURR``
    and finally passed through ``reduce_mem_usage``.

    Args:
        df: Frame with one row per ``SK_ID_CURR``.
        Debug: When true, only a random sample of 10000 previous-application
            rows is used.

    Returns:
        ``df`` with the previous-application features attached.

    Raises:
        pandas.errors.MergeError: If the final merge is not one-to-one on
            ``SK_ID_CURR`` (enforced by ``validate='one_to_one'``).
    """
    prev = import_data(
        "D://Kaggle//MyFirstKaggleCompetition//Data//previous_application.csv"
    )
    if Debug:
        prev = prev.sample(10000)

    prev_main = df[['SK_ID_CURR']]
    key = ['SK_ID_CURR', 'SK_ID_PREV']
    Behaviour_variable = [
        'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT',
        'AMT_GOODS_PRICE', 'RATE_DOWN_PAYMENT', 'CNT_PAYMENT',
    ]
    Days_variable = [
        'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
        'DAYS_TERMINATION', 'DAYS_DECISION',
    ]
    Categorical_variable = [
        'NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START',
        'HOUR_APPR_PROCESS_START', 'NAME_CASH_LOAN_PURPOSE',
        'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON',
        'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY',
        'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE',
        'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
    ]

    # Assign the replaced columns back explicitly. A chained
    # ``prev[col].replace(..., inplace=True)`` acts on a temporary Series and
    # leaves ``prev`` untouched under pandas Copy-on-Write.
    placeholder_day_columns = [
        'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE', 'DAYS_TERMINATION',
    ]
    prev[placeholder_day_columns] = prev[placeholder_day_columns].replace(
        365243, np.nan
    )

    with timer("previous application missiong count analysis."):
        prev_main = previous_missing(
            prev_main, prev, Behaviour_variable + Days_variable)
    with timer("previous all record analysis for amt variable."):
        prev_main = previous_all_stat_amt(
            prev_main, prev, key + Behaviour_variable)
    with timer("previous all record analysis for day variable."):
        prev_main = previous_all_stat_day(prev_main, prev, key + Days_variable)
    with timer("previous approved analysis for amt variable"):
        prev_main = previous_approved_amt(
            prev_main, prev, key + Behaviour_variable)
    with timer("previous approved analysis for day variable"):
        prev_main = previous_approved_day(prev_main, prev, key + Days_variable)
    with timer("previous refused analysis for amt variable"):
        prev_main = previous_refused_amt(
            prev_main, prev, key + Behaviour_variable)
    with timer("previous refused analysis for day variable"):
        prev_main = previous_refused_day(prev_main, prev, key + Days_variable)
    with timer("previous category variable analysis."):
        prev_main = previous_category(prev_main, prev, Categorical_variable)
    with timer("previous last k contract analysis."):
        prev_main = previous_last_k_contract(prev_main, prev)

    prev_main = prev_main.fillna(0)
    prev_main = correlation_reduce(prev_main)

    df = pd.merge(
        df, prev_main, on=['SK_ID_CURR'], how='left', validate='one_to_one')
    df = reduce_mem_usage(df)
    return df