Ejemplo n.º 1
0
def previous_refused_amt(df, origin, feat):
    """Attach per-client amount statistics of refused previous applications.

    Rows of ``origin`` whose ``NAME_CONTRACT_STATUS`` equals ``'Refused'`` are
    kept, restricted to the columns in ``feat`` and cast to float64. Five
    derived amount columns are added, then every column except the two id
    columns is summarised per ``SK_ID_CURR`` with min/median/max/mean/std.

    Args:
        df: Frame holding an ``SK_ID_CURR`` column; the features are
            left-joined onto it.
        origin: Previous-application records. Must provide
            ``NAME_CONTRACT_STATUS`` and every column listed in ``feat``.
        feat: Columns to keep. Must include ``SK_ID_CURR``, ``AMT_CREDIT``,
            ``AMT_ANNUITY``, ``AMT_APPLICATION`` and ``AMT_GOODS_PRICE``, and
            all of them must be castable to float64.

    Returns:
        ``df`` with the ``prev_Refused_*`` columns that survive
        ``correlation_reduce`` (defined elsewhere; presumably drops highly
        correlated columns -- confirm).
    """
    refused_mask = origin.NAME_CONTRACT_STATUS == 'Refused'
    refused = origin.loc[refused_mask, feat].astype(np.float64)

    credit = refused.AMT_CREDIT
    # The "+ 1" in each denominator guards against division by zero.
    refused['amt_min_year'] = credit / (1 + refused.AMT_ANNUITY)
    refused['amt_credit_trust'] = credit - refused.AMT_APPLICATION
    refused['amt_credit_ratio'] = credit / (1 + refused.AMT_APPLICATION)
    refused['amt_goods_ratio'] = credit / (1 + refused.AMT_GOODS_PRICE)
    refused['amt_goods_remain'] = credit - refused.AMT_GOODS_PRICE

    id_columns = ('SK_ID_CURR', 'SK_ID_PREV')
    agg_spec = {}
    for column in refused.columns:
        if column not in id_columns:
            agg_spec[column] = ['min', 'median', 'max', 'mean', 'std']

    summary = refused.groupby(['SK_ID_CURR']).agg(agg_spec)
    # Flatten the (column, statistic) pairs into single prefixed names.
    summary.columns = [
        'prev_Refused_' + "_".join(pair) for pair in summary.columns
    ]
    summary = summary.reset_index()
    summary = correlation_reduce(summary)
    return df.merge(summary, on=['SK_ID_CURR'], how='left')
Ejemplo n.º 2
0
def install_prelong(df, origin):
    """Attach per-client statistics about how long each instalment version lasts.

    After sorting by instalment number, the first row of every
    ``(SK_ID_CURR, SK_ID_PREV, NUM_INSTALMENT_VERSION)`` combination is kept.
    ``pay_long`` is the gap in instalment numbers to the next kept row of the
    same previous credit (1 for the last row), and ``pay_long_amount`` is
    ``AMT_PAYMENT * pay_long``. Both are summarised per ``SK_ID_CURR`` with
    mean/sum/min/median/max/std.

    Args:
        df: Frame holding an ``SK_ID_CURR`` column; the features are
            left-joined onto it.
        origin: Instalment records with ``SK_ID_CURR``, ``SK_ID_PREV``,
            ``NUM_INSTALMENT_NUMBER``, ``NUM_INSTALMENT_VERSION`` and
            ``AMT_PAYMENT``. It is not modified.

    Returns:
        ``df`` with the ``ins_pay_long_*`` / ``ins_pay_long_amount_*`` columns
        that survive ``correlation_reduce`` (defined elsewhere; presumably
        drops highly correlated columns -- confirm).
    """
    keys = ['SK_ID_CURR', 'SK_ID_PREV']
    temp = origin.sort_values(keys + ['NUM_INSTALMENT_NUMBER']).drop_duplicates(
        keys + ['NUM_INSTALMENT_VERSION'])

    next_number = temp.groupby(keys)['NUM_INSTALMENT_NUMBER'].shift(-1)
    # Assign the filled result back explicitly: an in-place fillna through
    # attribute access is a chained update and is not reliable in pandas.
    temp['pay_long'] = (next_number - temp['NUM_INSTALMENT_NUMBER']).fillna(1)
    temp['pay_long_amount'] = temp['AMT_PAYMENT'] * temp['pay_long']

    stat_columns = ['pay_long', 'pay_long_amount']
    # Only the columns that are aggregated need the float cast.
    temp1 = temp[['SK_ID_CURR'] + stat_columns].astype(np.float64).groupby(
        ['SK_ID_CURR']).agg({
            k: ['mean', 'sum', 'min', 'median', 'max', 'std']
            for k in stat_columns
        })
    # Name every column from its own (column, statistic) pair. Building names
    # from the product of the MultiIndex levels can mislabel statistics,
    # because level order is not guaranteed to match column order.
    temp1.columns = ["ins_" + "_".join(pair) for pair in temp1.columns]
    temp1.reset_index(inplace=True)
    temp1 = correlation_reduce(temp1)
    df = pd.merge(df, temp1, on=['SK_ID_CURR'], how='left')
    return df
Ejemplo n.º 3
0
def bureau_overdue(df, origin):
    """Attach two-stage aggregates of the ``overdue`` column to ``df``.

    Stage one computes the mean and sum of ``overdue`` for every
    ``(SK_ID_CURR, SK_ID_BUREAU)`` pair. Stage two summarises those two values
    per ``SK_ID_CURR`` with mean/max/std/sum/median.

    Args:
        df: Frame holding an ``SK_ID_CURR`` column; the features are
            left-joined onto it.
        origin: Records with ``SK_ID_CURR``, ``SK_ID_BUREAU`` and ``overdue``.

    Returns:
        ``df`` with the ``bureau_overdue_*`` columns that survive
        ``correlation_reduce`` (defined elsewhere; presumably drops highly
        correlated columns -- confirm).
    """
    id_columns = ('SK_ID_CURR', 'SK_ID_BUREAU')

    # Stage 1: one row per bureau credit.
    per_credit = origin.groupby(list(id_columns)).agg(
        {"overdue": ['mean', 'sum']})
    per_credit.columns = ["_".join(pair) for pair in per_credit.columns]
    per_credit = per_credit.reset_index()

    # Stage 2: one row per client.
    value_columns = [c for c in per_credit.columns if c not in id_columns]
    agg_spec = {
        c: ['mean', 'max', 'std', 'sum', 'median'] for c in value_columns
    }
    per_client = per_credit.astype(np.float64).groupby(['SK_ID_CURR']).agg(
        agg_spec)
    per_client.columns = [
        'bureau_' + "_".join(pair) for pair in per_client.columns
    ]
    per_client = per_client.reset_index()

    per_client = correlation_reduce(per_client)
    return df.merge(per_client, on=['SK_ID_CURR'], how='left')
def _impute_with_fitted_line(frame, target, predictor, model):
    """Fill missing ``frame[target]`` values with ``model`` predictions, in place.

    Only rows where ``target`` is missing and ``predictor`` is present are
    filled; rows missing both stay NaN because the model cannot score a NaN
    input. The column is always rewritten as float64.
    """
    fillable = (frame[target].isnull() & frame[predictor].notnull()).values
    values = frame[target].values.astype(np.float64)  # astype returns a copy
    if fillable.any():
        # The estimator needs a 2-D (n_samples, 1) array, hence one batched
        # call instead of scoring scalars row by row.
        inputs = frame[predictor].values[fillable].reshape(-1, 1)
        values[fillable] = np.asarray(model.predict(inputs)).ravel()
    frame[target] = values


def bureau_trend(df, bureau):
    """Attach trend and summary features of bureau credits to ``df``.

    Steps:
      1. Fit ``AMT_CREDIT_SUM_DEBT ~ AMT_CREDIT_SUM`` on rows where both are
         present and non-zero, and use it to fill missing debt values.
      2. Derive ``AMT_CREDIT_DEBT_RATE`` (via ``safe_div``) and
         ``AMT_CREDIT_DEBT_REMAIN``.
      3. Fit ``DAYS_CREDIT_ENDDATE ~ DAYS_CREDIT`` on the per-``DAYS_CREDIT``
         mean end date, fill missing end dates, and derive
         ``CREDIT_DAYS_LONG``.
      4. Collect ``bureau_trend_features`` plus mean/min/max/median/sum/std
         per ``SK_ID_CURR`` of seven columns, reduce them with
         ``correlation_reduce`` and left-join the result onto ``df``.

    ``safe_div``, ``bureau_trend_features``, ``timer``, ``correlation_reduce``
    and ``LinearRegression`` come from elsewhere in the file.

    Args:
        df: Frame holding an ``SK_ID_CURR`` column.
        bureau: Bureau records with ``SK_ID_CURR``, ``AMT_CREDIT_SUM``,
            ``AMT_CREDIT_SUM_DEBT``, ``DAYS_CREDIT`` and
            ``DAYS_CREDIT_ENDDATE``. NOTE: this frame is modified in place
            (two columns imputed, three columns added).

    Returns:
        ``df`` left-joined with the reduced feature frame.
    """
    features = pd.DataFrame({'SK_ID_CURR': df['SK_ID_CURR'].unique()})

    # --- debt imputation ---------------------------------------------------
    usable = (bureau.AMT_CREDIT_SUM.notnull()
              & bureau.AMT_CREDIT_SUM_DEBT.notnull()
              & (bureau.AMT_CREDIT_SUM != 0)
              & (bureau.AMT_CREDIT_SUM_DEBT != 0))
    temp = bureau.loc[usable, ['AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT']]
    lr = LinearRegression()
    lr.fit(temp.AMT_CREDIT_SUM.values.reshape(-1, 1), temp.AMT_CREDIT_SUM_DEBT)
    _impute_with_fitted_line(bureau, 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM',
                             lr)

    bureau['AMT_CREDIT_DEBT_RATE'] = [
        safe_div(debt, total)
        for debt, total in zip(bureau.AMT_CREDIT_SUM_DEBT,
                               bureau.AMT_CREDIT_SUM)
    ]
    bureau['AMT_CREDIT_DEBT_REMAIN'] = (
        bureau.AMT_CREDIT_SUM - bureau.AMT_CREDIT_SUM_DEBT)

    # --- end-date imputation -----------------------------------------------
    dated = bureau.DAYS_CREDIT.notnull() & bureau.DAYS_CREDIT_ENDDATE.notnull()
    temp = bureau.loc[dated, ['DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE']]
    # Fit on the average end date per start day rather than on raw rows.
    temp = temp.groupby(['DAYS_CREDIT']).DAYS_CREDIT_ENDDATE.mean().reset_index()
    lr_days = LinearRegression()
    lr_days.fit(temp.DAYS_CREDIT.values.reshape(-1, 1),
                temp.DAYS_CREDIT_ENDDATE.values)
    _impute_with_fitted_line(bureau, 'DAYS_CREDIT_ENDDATE', 'DAYS_CREDIT',
                             lr_days)
    bureau['CREDIT_DAYS_LONG'] = bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT

    # --- feature assembly --------------------------------------------------
    with timer("trend feature:"):
        features = bureau_trend_features(features, bureau)
    with timer("basic stat"):
        temp = bureau.groupby(['SK_ID_CURR']).agg({
            k: ['mean', 'min', 'max', 'median', 'sum', 'std']
            for k in [
                'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
                'AMT_CREDIT_DEBT_RATE', 'AMT_CREDIT_DEBT_REMAIN',
                'DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE', 'CREDIT_DAYS_LONG'
            ]
        })
        temp.columns = [
            "bureau_trend_perspective_" + "_".join(j) for j in temp.columns
        ]
        temp.reset_index(inplace=True)
        features = pd.merge(features, temp, on=['SK_ID_CURR'], how='left')
    features = correlation_reduce(features)
    df = pd.merge(df, features, on=['SK_ID_CURR'], how='left')
    return df
def card_amt_total_payment(df, origin, feat):
    """Attach two-stage credit-card usage aggregates to ``df``.

    Rows of ``origin`` with a non-zero ``AMT_PAYMENT_TOTAL_CURRENT`` are kept
    (columns ``feat``). Three difference columns and three ratio columns are
    derived. The result is aggregated per ``(SK_ID_CURR, SK_ID_PREV)`` and
    then again per ``SK_ID_CURR``, each time with
    mean/min/max/median/sum/std.

    Args:
        df: Frame holding an ``SK_ID_CURR`` column; the features are
            left-joined onto it.
        origin: Credit-card balance records. It is not modified.
        feat: Columns to keep. Must include ``SK_ID_CURR``, ``SK_ID_PREV``,
            ``AMT_RECIVABLE``, ``AMT_RECEIVABLE_PRINCIPAL``,
            ``AMT_CREDIT_LIMIT_ACTUAL``, ``AMT_BALANCE`` and
            ``AMT_DRAWINGS_CURRENT``.

    Returns:
        ``df`` with the ``card_using_*`` columns that survive
        ``correlation_reduce`` (defined elsewhere; presumably drops highly
        correlated columns -- confirm).
    """
    # Work on an explicit copy so the column assignments below never act on
    # a slice of the caller's frame (avoids SettingWithCopyWarning).
    temp = origin.loc[origin.AMT_PAYMENT_TOTAL_CURRENT != 0, feat].copy()

    # difference variables
    temp['AMT_RECEIVABLE_DIFF'] = temp.AMT_RECIVABLE - temp.AMT_RECEIVABLE_PRINCIPAL
    temp['AMT_BALANCE_DIFF'] = temp.AMT_CREDIT_LIMIT_ACTUAL - temp.AMT_BALANCE
    temp['AMT_DRAWINGS_DIFF'] = temp.AMT_CREDIT_LIMIT_ACTUAL - temp.AMT_DRAWINGS_CURRENT
    # ratio variables ("+ 1" guards against division by zero)
    temp['AMT_CREDIT_RATIO'] = temp.AMT_DRAWINGS_CURRENT / (1 + temp.AMT_CREDIT_LIMIT_ACTUAL)
    temp['AMT_BALANCE_RATIO'] = temp.AMT_BALANCE / (1 + temp.AMT_CREDIT_LIMIT_ACTUAL)
    temp['AMT_RECEIVABLE_RATIO'] = temp.AMT_RECIVABLE / (1 + temp.AMT_BALANCE)

    id_columns = ['SK_ID_CURR', 'SK_ID_PREV']
    stats = ['mean', 'min', 'max', 'median', 'sum', 'std']

    # Stage 1: one row per previous credit.
    temp = temp.groupby(id_columns).agg(
        {k: stats for k in temp.columns if k not in id_columns})
    # NOTE(review): this call receives MultiIndex columns and an id index,
    # unlike the final call below -- confirm correlation_reduce supports it.
    temp = correlation_reduce(temp)
    temp.columns = ["_".join(j) for j in temp.columns]
    temp.reset_index(inplace=True)

    # Stage 2: one row per client.
    temp = temp.groupby(['SK_ID_CURR']).agg(
        {k: stats for k in temp.columns if k not in id_columns})
    temp.columns = ["card_using_" + "_".join(j) for j in temp.columns]
    temp.reset_index(inplace=True)
    temp = correlation_reduce(temp)
    df = pd.merge(df, temp, on=['SK_ID_CURR'], how='left')
    return df
def install_advance(df, origin):
    """Attach late-payment and over-payment instalment statistics to ``df``.

    Two groups of per-``SK_ID_CURR`` aggregates are built from ``origin``:

    * magnitude columns (``instalment_paid_late_in_days``,
      ``instalment_paid_over_amount``): sum, mean, median, min, max, std and
      the ``iqr``/``skew``/``kurtosis`` callables available in this module;
    * flag columns (``instalment_paid_late``, ``instalment_paid_over``):
      sum, mean and count.

    Only the flag group is passed through ``correlation_reduce`` (defined
    elsewhere; presumably drops highly correlated columns -- confirm).

    Args:
        df: Frame holding an ``SK_ID_CURR`` column.
        origin: Instalment records carrying the four columns named above.

    Returns:
        ``df`` left-joined first with the magnitude group, then with the
        flag group.
    """
    by_client = ['SK_ID_CURR']

    magnitude_stats = ['sum', 'mean', 'median', 'min', 'max', 'std',
                       iqr, skew, kurtosis]
    magnitude_spec = {
        'instalment_paid_late_in_days': magnitude_stats,
        'instalment_paid_over_amount': magnitude_stats,
    }
    flag_stats = ['sum', 'mean', 'count']
    flag_spec = {
        'instalment_paid_late': flag_stats,
        'instalment_paid_over': flag_stats,
    }

    magnitudes = origin.groupby(by_client).agg(magnitude_spec)
    flags = origin.groupby(by_client).agg(flag_spec)

    magnitudes.columns = ["_".join(pair) for pair in magnitudes.columns]
    flags.columns = ["_".join(pair) for pair in flags.columns]

    # The reduction runs while SK_ID_CURR is still the index.
    flags = correlation_reduce(flags)

    magnitudes = magnitudes.reset_index()
    flags = flags.reset_index()

    df = df.merge(magnitudes, on=by_client, how='left')
    return df.merge(flags, on=by_client, how='left')
Ejemplo n.º 7
0
def previous_refused_day(df, origin, feat):
    """Attach per-client day-offset statistics of refused previous applications.

    Rows of ``origin`` whose ``NAME_CONTRACT_STATUS`` equals ``'Refused'`` are
    kept, restricted to the columns in ``feat`` and cast to float64. Five
    day-difference columns are added, then every column except the two id
    columns is summarised per ``SK_ID_CURR`` with min/median/max/mean/std.

    Args:
        df: Frame holding an ``SK_ID_CURR`` column; the features are
            left-joined onto it.
        origin: Previous-application records. Must provide
            ``NAME_CONTRACT_STATUS`` and every column listed in ``feat``.
        feat: Columns to keep. Must include ``SK_ID_CURR``,
            ``DAYS_FIRST_DUE``, ``DAYS_LAST_DUE``,
            ``DAYS_LAST_DUE_1ST_VERSION``, ``DAYS_TERMINATION`` and
            ``DAYS_DECISION``, all castable to float64.

    Returns:
        ``df`` with the ``prev_Refused_*`` columns that survive
        ``correlation_reduce`` (defined elsewhere; presumably drops highly
        correlated columns -- confirm).
    """
    is_refused = origin.NAME_CONTRACT_STATUS == 'Refused'
    refused = origin.loc[is_refused, feat].astype(np.float64)

    first_due = refused.DAYS_FIRST_DUE
    last_due = refused.DAYS_LAST_DUE
    planned_last_due = refused.DAYS_LAST_DUE_1ST_VERSION

    refused['days_due_diff'] = last_due - first_due
    refused['days_last_due_diff'] = planned_last_due - last_due
    refused['days_waiting_diff'] = first_due - refused.DAYS_DECISION
    refused['days_origin_due_diff'] = planned_last_due - first_due
    refused['days_origin_que_diff'] = refused.DAYS_TERMINATION - last_due

    id_columns = ('SK_ID_CURR', 'SK_ID_PREV')
    agg_spec = {}
    for column in refused.columns:
        if column not in id_columns:
            agg_spec[column] = ['min', 'median', 'max', 'mean', 'std']

    summary = refused.groupby(['SK_ID_CURR']).agg(agg_spec)
    # Flatten the (column, statistic) pairs into single prefixed names.
    summary.columns = [
        'prev_Refused_' + "_".join(pair) for pair in summary.columns
    ]
    summary = summary.reset_index()
    summary = correlation_reduce(summary)
    return df.merge(summary, on=['SK_ID_CURR'], how='left')
Ejemplo n.º 8
0
def bureau_closed(df, origin, feat):
    """Attach per-client statistics of non-active bureau credits to ``df``.

    Rows of ``origin`` whose ``CREDIT_ACTIVE`` is anything other than
    ``'Active'`` are kept (columns ``feat``), missing values are replaced by
    0, and ``AMT_CREDIT_DEBT_RATE`` is derived. Every column except
    ``DAYS_CREDIT_UPDATE`` and the two id columns is summarised per
    ``SK_ID_CURR`` with min/median/max/mean/sum/std.

    Args:
        df: Frame holding an ``SK_ID_CURR`` column; the features are
            left-joined onto it.
        origin: Bureau records with a ``CREDIT_ACTIVE`` column. It is not
            modified.
        feat: Columns to keep. Must include ``SK_ID_CURR``,
            ``AMT_CREDIT_SUM`` and ``AMT_CREDIT_SUM_DEBT``, all castable to
            float64.

    Returns:
        ``df`` with the ``credit_closed_*`` columns that survive
        ``correlation_reduce`` (defined elsewhere; presumably drops highly
        correlated columns -- confirm).
    """
    # fillna without inplace returns a fresh frame, so nothing below writes
    # into a slice of the caller's data (avoids SettingWithCopyWarning).
    temp = origin.loc[origin.CREDIT_ACTIVE != 'Active', feat].fillna(0)
    # "+ 1" guards against division by zero.
    temp['AMT_CREDIT_DEBT_RATE'] = temp.AMT_CREDIT_SUM_DEBT / (
        1 + temp.AMT_CREDIT_SUM)

    not_aggregated = ('DAYS_CREDIT_UPDATE', 'SK_ID_CURR', 'SK_ID_BUREAU')
    temp = temp.astype(np.float64).groupby(['SK_ID_CURR']).agg({
        k: ['min', 'median', 'max', 'mean', 'sum', 'std']
        for k in temp.columns if k not in not_aggregated
    })
    temp.columns = ["credit_closed_" + "_".join(j) for j in temp.columns]
    temp.reset_index(inplace=True)
    temp = correlation_reduce(temp)
    df = pd.merge(df, temp, on=['SK_ID_CURR'], how='left')
    return df
def previous_last_k_contract(df, previous_application):
    """Attach means over each client's last 1, 3 and 5 previous applications.

    Applications are ordered by ``(SK_ID_CURR, DAYS_DECISION)`` and the last
    ``k`` rows per client are taken for ``k`` in 1, 3, 5. For each ``k`` the
    per-client mean of ``CNT_PAYMENT``, ``DAYS_DECISION`` and
    ``DAYS_FIRST_DRAWING`` is computed.

    Args:
        df: Frame holding an ``SK_ID_CURR`` column; the features are
            left-joined onto it.
        previous_application: Records with ``SK_ID_CURR``, ``DAYS_DECISION``,
            ``CNT_PAYMENT`` and ``DAYS_FIRST_DRAWING``.

    Returns:
        ``df`` with the columns that survive ``correlation_reduce`` (defined
        elsewhere; presumably drops highly correlated columns -- confirm).
        Missing values inside the feature frame are set to 0 before the
        join; clients absent from ``previous_application`` still get NaN.
    """
    # Source column -> output name template, in output order.
    targets = (
        ('CNT_PAYMENT',
         'previous_application_term_of_last_{}_credits_mean'),
        ('DAYS_DECISION',
         'previous_application_days_decision_about_last_{}_credits_mean'),
        ('DAYS_FIRST_DRAWING',
         'previous_application_days_first_drawing_last_{}_credits_mean'),
    )

    features = pd.DataFrame(
        {'SK_ID_CURR': previous_application['SK_ID_CURR'].unique()})
    ordered = previous_application.sort_values(
        ['SK_ID_CURR', 'DAYS_DECISION'])

    for number in (1, 3, 5):
        last_rows = ordered.groupby(by=['SK_ID_CURR']).tail(number)
        for source_column, template in targets:
            means = last_rows.groupby(
                by=['SK_ID_CURR'])[source_column].mean().reset_index()
            # index=str only relabels the row index; the merge below joins
            # on the SK_ID_CURR column.
            means = means.rename(
                index=str,
                columns={source_column: template.format(number)})
            features = features.merge(means, on=['SK_ID_CURR'], how='left')

    features = correlation_reduce(features)
    features.fillna(0, inplace=True)
    return pd.merge(df, features, on=['SK_ID_CURR'], how='left')
def card_last_two_year(df, origin):
    """Attach two-stage aggregates of active card months from the last 24 months.

    Rows of ``origin`` with ``MONTHS_BALANCE >= -24`` and
    ``NAME_CONTRACT_STATUS == 'Active'`` are kept and ordered by
    ``(SK_ID_CURR, MONTHS_BALANCE)``. ``amt_balance_ratio`` is derived, the
    value columns are aggregated per ``(SK_ID_CURR, SK_ID_PREV)`` and the
    results are aggregated again per ``SK_ID_CURR``.

    Args:
        df: Frame holding an ``SK_ID_CURR`` column; the features are
            left-joined onto it.
        origin: Credit-card balance records carrying the columns selected
            below plus ``NAME_CONTRACT_STATUS``.

    Returns:
        ``df`` with the ``card_2_year_*`` columns that survive
        ``correlation_reduce`` (defined elsewhere; presumably drops highly
        correlated columns -- confirm).
    """
    kept_columns = [
        'SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE', 'AMT_BALANCE',
        'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_CURRENT',
        'AMT_PAYMENT_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL',
        'CNT_DRAWINGS_CURRENT', 'SK_DPD', 'SK_DPD_DEF',
    ]
    recent = origin.MONTHS_BALANCE >= -24
    active = origin.NAME_CONTRACT_STATUS == 'Active'
    monthly = origin.loc[recent & active, kept_columns].sort_values(
        ['SK_ID_CURR', 'MONTHS_BALANCE'])
    # "+ 1" guards against division by zero.
    monthly['amt_balance_ratio'] = monthly.AMT_BALANCE / (
        monthly.AMT_CREDIT_LIMIT_ACTUAL + 1)

    # Stage 1: one row per card contract.
    stage_one_skip = ('SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE')
    stage_one_spec = {
        column: ['min', 'median', 'max', 'mean', 'std', 'sum']
        for column in monthly.columns if column not in stage_one_skip
    }
    per_card = monthly.astype(np.float64).groupby(
        ['SK_ID_CURR', 'SK_ID_PREV']).agg(stage_one_spec)
    per_card.columns = ["_".join(pair) for pair in per_card.columns]
    per_card = per_card.reset_index()

    # Stage 2: one row per client.
    stage_two_spec = {
        column: ['min', 'max', 'mean', 'median', 'std', 'sum']
        for column in per_card.columns
        if column not in ('SK_ID_CURR', 'SK_ID_PREV')
    }
    per_client = per_card.groupby(['SK_ID_CURR']).agg(stage_two_spec)
    per_client.columns = [
        "card_2_year_" + "_".join(pair) for pair in per_client.columns
    ]
    # The reduction runs while SK_ID_CURR is still the index.
    per_client = correlation_reduce(per_client)
    per_client = per_client.reset_index()

    return df.merge(per_client, on=['SK_ID_CURR'], how='left')
Ejemplo n.º 11
0
def previous_feature(df, Debug=False):
    """Build all previous-application features and join them onto ``df``.

    The previous-application table is loaded with ``import_data``, the
    placeholder value 365243 in five day columns is replaced by NaN, and a
    series of feature builders (all defined elsewhere in this file) is run,
    each wrapped in ``timer``. The collected features are zero-filled,
    passed through ``correlation_reduce`` and merged one-to-one onto ``df``,
    which is finally passed through ``reduce_mem_usage``.

    Args:
        df: Frame holding a unique ``SK_ID_CURR`` column (the final merge is
            validated as one-to-one).
        Debug: When true, work on a random sample of at most 10000
            previous-application rows.

    Returns:
        ``df`` with the previous-application feature columns added.
    """
    data_path = (
        "D://Kaggle//MyFirstKaggleCompetition//Data//previous_application.csv"
    )
    prev = import_data(data_path)
    if Debug:
        # Cap the sample size so small inputs do not make sample() raise.
        prev = prev.sample(min(10000, len(prev)))

    prev_main = df[['SK_ID_CURR']]
    key = ['SK_ID_CURR', 'SK_ID_PREV']
    Behaviour_variable = [
        'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT',
        'AMT_GOODS_PRICE', 'RATE_DOWN_PAYMENT', 'CNT_PAYMENT'
    ]
    Days_variable = [
        'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
        'DAYS_TERMINATION', 'DAYS_DECISION'
    ]
    Categorical_variable = [
        'NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START',
        'HOUR_APPR_PROCESS_START', 'NAME_CASH_LOAN_PURPOSE',
        'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON',
        'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY',
        'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE',
        'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION'
    ]

    # Assign the replaced columns back explicitly. Calling
    # replace(..., inplace=True) on prev[col] is a chained update and does
    # not modify prev when pandas Copy-on-Write is active.
    placeholder_columns = [
        'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE', 'DAYS_TERMINATION'
    ]
    prev[placeholder_columns] = prev[placeholder_columns].replace(
        365243, np.nan)

    with timer("previous application missiong count analysis."):
        prev_main = previous_missing(prev_main, prev,
                                     Behaviour_variable + Days_variable)
    with timer("previous all record analysis for amt variable."):
        prev_main = previous_all_stat_amt(prev_main, prev,
                                          key + Behaviour_variable)
    with timer("previous all record analysis for day variable."):
        prev_main = previous_all_stat_day(prev_main, prev, key + Days_variable)
    with timer("previous approved analysis for amt variable"):
        prev_main = previous_approved_amt(prev_main, prev,
                                          key + Behaviour_variable)
    with timer("previous approved analysis for day variable"):
        prev_main = previous_approved_day(prev_main, prev, key + Days_variable)
    with timer("previous refused analysis for amt variable"):
        prev_main = previous_refused_amt(prev_main, prev,
                                         key + Behaviour_variable)
    with timer("previous refused analysis for day variable"):
        prev_main = previous_refused_day(prev_main, prev, key + Days_variable)
    with timer("previous category variable analysis."):
        prev_main = previous_category(prev_main, prev, Categorical_variable)
    with timer("previous last k contract analysis."):
        prev_main = previous_last_k_contract(prev_main, prev)

    prev_main = prev_main.fillna(0)
    prev_main = correlation_reduce(prev_main)
    df = pd.merge(df,
                  prev_main,
                  on=['SK_ID_CURR'],
                  how='left',
                  validate='one_to_one')
    df = reduce_mem_usage(df)
    return df