def plans_split_feat(train, test, plans, test_plans, queries, test_queries):
    """Build plan-level features from the plans tables and join them onto train/test by sid."""
    with timer("plans split main table:"):
        split_feat = plans_split(plans)
        split_feat_test = plans_split(test_plans)
        split_feat['sid'] = plans['sid']
        split_feat_test['sid'] = test_plans['sid']
        split_main = pd.concat([split_feat, split_feat_test]).reset_index(drop=True)
        plans_main = pd.concat([plans, test_plans]).reset_index(drop=True)

    with timer("get diff related feature"):
        diff_main = get_fe_diff_div(split_main)
        split_main[diff_main.columns] = diff_main

    # with timer("get plans time feature"):
    #     time_main = get_datetime(plans_main, 'plan_time')
    #     split_main[time_main.columns] = time_main

    with timer("get plans different type recommend:"):
        diff_main = get_fe_different_recomend(plans_main)
        split_main[diff_main.columns] = diff_main

    # with timer("get_fe_mode_feature:"):
    #     mode_main = get_fe_mode_feature(plans_main)
    #     split_main[mode_main.columns] = mode_main

    del plans_main
    gc.collect()

    train.set_index(['sid'], inplace=True)
    test.set_index(['sid'], inplace=True)
    split_main.set_index(['sid'], inplace=True)
    train = train.join(split_main, how='left')
    test = test.join(split_main, how='left')
    train.reset_index(inplace=True)
    test.reset_index(inplace=True)
    # train = pd.merge(train, split_main, on=['sid'], how='left')
    # test = pd.merge(test, split_main, on=['sid'], how='left')
    return train, test

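# The `timer` helper wrapped around every block in this file is not defined in
# this section. A minimal sketch of the assumed implementation (a context
# manager that prints elapsed wall-clock time) follows; the real version may differ.
import time
from contextlib import contextmanager

@contextmanager
def timer(title):
    # Print how long the wrapped block took, labelled with `title`.
    t0 = time.time()
    yield
    print("{} done in {:.0f}s".format(title, time.time() - t0))
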
def bureau_balance_feature(df, Debug=False):
    if Debug:
        bureau_balance = import_data(
            "D://Kaggle//MyFirstKaggleCompetition//Data//bureau_balance.csv")
        bureau_balance = bureau_balance.sample(10000)
    else:
        bureau_balance = import_data(
            "D://Kaggle//MyFirstKaggleCompetition//Data//bureau_balance.csv")
    bureau = import_data(
        "D://Kaggle//MyFirstKaggleCompetition//Data//bureau.csv")

    bureau_all = pd.merge(
        bureau[['SK_ID_CURR', 'SK_ID_BUREAU']],
        bureau_balance.groupby(['SK_ID_BUREAU', 'STATUS'], as_index=False).count(),
        how='left', on=['SK_ID_BUREAU'])
    bureau_main = df[['SK_ID_CURR']]
    bureau_all['overdue'] = bureau_all.STATUS.apply(
        lambda x: 1 if x in ['1', '2', '3', '4', '5'] else 0)

    with timer("bureau balance missing count"):
        bureau_main = bureau_balance_missing(bureau_main, bureau_all)
    with timer("bureau balance overdue analysis"):
        bureau_main = bureau_overdue(bureau_main, bureau_all)
    with timer("bureau balance status count"):
        bureau_main = bureau_status(bureau_main, bureau_all)

    bureau_main.fillna(0, inplace=True)
    bureau_main = correlation_reduce(bureau_main)
    df = pd.merge(df, bureau_main, on=['SK_ID_CURR'], how='left',
                  validate='one_to_one')
    return df

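# `import_data` and `reduce_mem_usage` are assumed helpers in the style of the
# common Kaggle memory-reduction utilities: read a CSV and downcast numeric
# columns to the smallest dtype that holds their values. This is a hedged
# sketch, not the author's exact implementation.
import numpy as np
import pandas as pd

def reduce_mem_usage(df):
    # Downcast integer and float columns to save memory.
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='float')
    return df

def import_data(path):
    # Load a CSV and immediately shrink its numeric dtypes.
    return reduce_mem_usage(pd.read_csv(path))
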
def pos_cash_feature(df, Debug=False):
    """Aggregate POS_CASH_balance into per-SK_ID_CURR features and merge onto df."""
    if Debug:
        pos = import_data(
            r"D:\Kaggle\MyFirstKaggleCompetition\Data\POS_CASH_balance.csv")
        pos = pos.sample(10000)
    else:
        pos = import_data(
            r"D:\Kaggle\MyFirstKaggleCompetition\Data\POS_CASH_balance.csv")

    pos_main = df[['SK_ID_CURR']]
    pos['SK_DPD_DIFF'] = pos.SK_DPD - pos.SK_DPD_DEF
    pos['pos_cash_paid_late'] = pos['SK_DPD'].apply(lambda x: 1 if x > 0 else 0)
    pos['pos_cash_paid_late_with_tolerance'] = pos['SK_DPD_DEF'].apply(
        lambda x: 1 if x > 0 else 0)

    with timer("pos basic stat analysis"):
        pos_main = pos_cash_basic(pos_main, pos)
    with timer("pos cash last record"):
        pos_main = pos_cash_last(pos_main, pos)
    with timer("pos cash last k installment analysis"):
        pos_main = pos_cash_last_k_installment(pos_main, pos)
    with timer("pos cash last loan analysis"):
        pos_main = pos_cash_last_loan(pos_main, pos)
    with timer("pos cash trend analysis"):
        pos_main = pos_cash_trend_installment(pos_main, pos)

    pos_main.fillna(0, inplace=True)
    df = pd.merge(df, pos_main, on=['SK_ID_CURR'], how='left',
                  validate='one_to_one')
    return df

def bureau_trend(df, bureau):
    """Impute missing debt/end-date values with linear fits, then aggregate trend statistics."""
    features = pd.DataFrame({'SK_ID_CURR': df['SK_ID_CURR'].unique()})

    # Impute missing AMT_CREDIT_SUM_DEBT from AMT_CREDIT_SUM with a linear fit.
    temp = bureau.loc[~(bureau.AMT_CREDIT_SUM.isnull())
                      & ~(bureau.AMT_CREDIT_SUM_DEBT.isnull())
                      & (bureau.AMT_CREDIT_SUM != 0)
                      & (bureau.AMT_CREDIT_SUM_DEBT != 0),
                      ['AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT']]
    lr = LinearRegression()
    lr.fit(temp.AMT_CREDIT_SUM.values.reshape(-1, 1), temp.AMT_CREDIT_SUM_DEBT)
    bureau['AMT_CREDIT_SUM_DEBT'] = list(
        map(lambda x, y: lr.predict([[x]])[0] if np.isnan(y) else y,
            bureau.AMT_CREDIT_SUM, bureau.AMT_CREDIT_SUM_DEBT))
    bureau['AMT_CREDIT_DEBT_RATE'] = list(
        map(lambda x, y: safe_div(x, y),
            bureau.AMT_CREDIT_SUM_DEBT, bureau.AMT_CREDIT_SUM))
    bureau['AMT_CREDIT_DEBT_REMAIN'] = bureau.AMT_CREDIT_SUM - bureau.AMT_CREDIT_SUM_DEBT

    # Impute missing DAYS_CREDIT_ENDDATE from DAYS_CREDIT with a linear fit.
    temp = bureau.loc[~(bureau.DAYS_CREDIT.isnull())
                      & ~(bureau.DAYS_CREDIT_ENDDATE.isnull()),
                      ['DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE']]
    temp = temp.groupby(['DAYS_CREDIT']).DAYS_CREDIT_ENDDATE.mean().reset_index()
    lr_days = LinearRegression()
    lr_days.fit(temp.DAYS_CREDIT.values.reshape(-1, 1), temp.DAYS_CREDIT_ENDDATE.values)
    bureau['DAYS_CREDIT_ENDDATE'] = list(
        map(lambda x, y: lr_days.predict([[x]])[0] if np.isnan(y) else y,
            bureau.DAYS_CREDIT, bureau.DAYS_CREDIT_ENDDATE))
    bureau['CREDIT_DAYS_LONG'] = bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT

    with timer("trend feature:"):
        features = bureau_trend_features(features, bureau)
    with timer("basic stat"):
        temp = bureau.groupby(['SK_ID_CURR']).agg({
            k: ['mean', 'min', 'max', 'median', 'sum', 'std']
            for k in [
                'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_DEBT_RATE',
                'AMT_CREDIT_DEBT_REMAIN', 'DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE',
                'CREDIT_DAYS_LONG'
            ]
        })
        temp.columns = [
            "bureau_trend_perspective_" + "_".join(j) for j in temp.columns.ravel()
        ]
        temp.reset_index(inplace=True)
        features = pd.merge(features, temp, on=['SK_ID_CURR'], how='left')

    features = correlation_reduce(features)
    df = pd.merge(df, features, on=['SK_ID_CURR'], how='left')
    return df

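# `safe_div`, used in bureau_trend above, is assumed to be a zero/NaN-safe
# division helper; this sketch returns 0 when the denominator is zero or missing.
import numpy as np

def safe_div(a, b):
    # Avoid division-by-zero and NaN propagation when building ratio features.
    if b is None or b == 0 or (isinstance(b, float) and np.isnan(b)):
        return 0.0
    return a / b
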
def lightgbm_training(train, test, feature, strategy="cv"):
    """Dispatch LightGBM training to the chosen validation strategy."""
    if strategy == 'cv':
        with timer("cross validation strategy:"):
            train_label = train['click_mode'].astype(int)
            oof_train, oof_test, oof_test_list, df_imp = kfold_lightgbm(
                train, train_label, test, feature, 5, stratified=True)
        return oof_train, oof_test_list, df_imp
    if strategy == 'last_week':
        with timer("last week strategy:"):
            oof_valid, oof_test, feature_importance_df = last_week_lightgbm(
                train, test, feature)
        return oof_valid, oof_test, feature_importance_df
    if strategy == 'both':
        with timer("cross training - last week strategy:"):
            oof_train, oof_test, oof_valid, oof_valid_list, oof_test_list, feature_importance_df = \
                cv_validation_lightgbm(train, test, feature, 5)
        return oof_train, oof_valid_list, feature_importance_df

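# `kfold_lightgbm` (and the last_week / cv_validation variants) are not defined
# in this section. The sketch below shows one plausible stratified-CV
# implementation matching the call signature and return values used above; all
# hyperparameters are illustrative assumptions, not the author's settings.
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold

def kfold_lightgbm(train, train_label, test, feature, num_folds, stratified=True):
    folds = (StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
             if stratified else KFold(n_splits=num_folds, shuffle=True, random_state=42))
    n_class = train_label.nunique()
    oof_train = np.zeros((len(train), n_class))   # out-of-fold train predictions
    oof_test = np.zeros((len(test), n_class))     # fold-averaged test predictions
    oof_test_list, imp_list = [], []
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train[feature], train_label)):
        clf = lgb.LGBMClassifier(objective='multiclass', n_estimators=2000,
                                 learning_rate=0.05, num_leaves=63)
        clf.fit(train[feature].iloc[trn_idx], train_label.iloc[trn_idx],
                eval_set=[(train[feature].iloc[val_idx], train_label.iloc[val_idx])],
                eval_metric='multi_logloss',
                callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)])
        # assumes every class is present in each training fold
        oof_train[val_idx] = clf.predict_proba(train[feature].iloc[val_idx])
        fold_test = clf.predict_proba(test[feature])
        oof_test += fold_test / num_folds
        oof_test_list.append(fold_test)
        imp_list.append(pd.DataFrame({'feature': feature,
                                      'importance': clf.feature_importances_,
                                      'fold': fold_ + 1}))
    df_imp = pd.concat(imp_list, ignore_index=True)
    return oof_train, oof_test, oof_test_list, df_imp
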
def card_feature(df, Debug=False):
    """Aggregate credit_card_balance into per-SK_ID_CURR features and merge onto df."""
    if Debug:
        credit = import_data(r"D:\Kaggle\MyFirstKaggleCompetition\Data\credit_card_balance.csv")
        credit = credit.sample(10000)
    else:
        credit = import_data(r"D:\Kaggle\MyFirstKaggleCompetition\Data\credit_card_balance.csv")

    card_main = df[['SK_ID_CURR']]
    key = ['SK_ID_CURR', 'SK_ID_PREV']
    amt = [f for f in credit.columns if 'AMT' in f]
    cnt = [f for f in credit.columns if 'CNT' in f]
    sk = ['SK_DPD', 'SK_DPD_DEF']

    with timer("card missing analysis"):
        card_main = card_missing(card_main, credit)
    with timer("card overdue analysis"):
        card_main = card_sk(card_main, credit, key + sk)
    with timer("card using analysis"):
        card_main = card_using(card_main, credit, key + amt)
    with timer("card all behavior analysis"):
        card_main = card_all(card_main, credit, key + amt + cnt)
    with timer("card using behavior analysis from the first payment of AMT_PAYMENT_TOTAL_CURRENT"):
        card_main = card_amt_total_payment(card_main, credit, key + amt + cnt)
    with timer("card last two year behavior analysis"):
        card_main = card_last_two_year(card_main, credit)
    with timer("card last one year behavior analysis"):
        card_main = card_last_one_year(card_main, credit)

    card_main.fillna(0, inplace=True)
    card_main = correlation_reduce(card_main)
    df = pd.merge(df, card_main, on=['SK_ID_CURR'], how='left')
    return df

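# `correlation_reduce`, used by several of the feature builders in this file,
# is assumed to prune near-duplicate columns before merging. A hedged sketch
# (the 0.98 threshold and the ID whitelist are assumptions):
import numpy as np

def correlation_reduce(df, threshold=0.98):
    # Drop the later column of any pair of numeric features whose absolute
    # Pearson correlation exceeds `threshold`; keep ID columns untouched.
    ids = [c for c in ('SK_ID_CURR', 'SK_ID_PREV', 'sid') if c in df.columns]
    num = df.drop(columns=ids).select_dtypes(include=[np.number])
    corr = num.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]
    return df.drop(columns=to_drop)
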
def get_fe_plans_counts(plans_split, queries):
    split = plans_split.copy()
    data = split[['sid']]
    split = split.merge(queries, on=['sid'], how='left')

    # with timer("price part counter:"):
    #     group = price_counter(split, False)
    #     data[group.columns] = group

    with timer("mode part counter:"):
        group = mode_counter(split)
        data[group.columns] = group
    return data

def bureau_feature(df, Debug=False):
    """Aggregate the bureau table into per-SK_ID_CURR features and merge onto df."""
    if Debug:
        bureau = import_data("D://Kaggle//MyFirstKaggleCompetition//Data//bureau.csv")
        bureau = bureau.sample(10000)
    else:
        bureau = import_data("D://Kaggle//MyFirstKaggleCompetition//Data//bureau.csv")

    # Null out obviously corrupted day values and add simple status flags.
    bureau.loc[bureau['DAYS_CREDIT_ENDDATE'] < -40000, 'DAYS_CREDIT_ENDDATE'] = np.nan
    bureau.loc[bureau['DAYS_CREDIT_UPDATE'] < -40000, 'DAYS_CREDIT_UPDATE'] = np.nan
    bureau.loc[bureau['DAYS_ENDDATE_FACT'] < -40000, 'DAYS_ENDDATE_FACT'] = np.nan
    bureau['active_flag'] = bureau.CREDIT_ACTIVE.apply(lambda x: 1 if x == 'Active' else 0)
    bureau['enddate_flag'] = bureau.DAYS_CREDIT_ENDDATE.apply(lambda x: 1 if x > 0 else 0)
    bureau['overdue_flag'] = bureau.AMT_CREDIT_MAX_OVERDUE.apply(lambda x: 1 if x > 0 else 0)
    bureau['using_flag'] = bureau.AMT_CREDIT_SUM_DEBT.apply(lambda x: 1 if x > 0 else 0)

    credit_main = pd.DataFrame({'SK_ID_CURR': df['SK_ID_CURR'].unique()})
    day = ['DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'DAYS_CREDIT', 'DAYS_CREDIT_UPDATE']
    key = ['SK_ID_CURR', 'SK_ID_BUREAU']
    amt = ['CNT_CREDIT_PROLONG', 'CREDIT_DAY_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
           'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_ANNUITY']
    cat = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

    with timer("bureau missing count"):
        credit_main = bureau_missing(credit_main, bureau)
        print(credit_main.shape)
    with timer("bureau flag variable analysis"):
        credit_main = bureau_flag(credit_main, bureau)
        print(credit_main.shape)
    with timer("bureau amt debt analysis"):
        credit_main = bureau_all_debt(credit_main, bureau)
        print(credit_main.shape)
    with timer("bureau active status analysis"):
        credit_main = bureau_active(credit_main, bureau, key + day + amt)
        print(credit_main.shape)
    with timer("bureau closed status analysis"):
        credit_main = bureau_closed(credit_main, bureau, key + day + amt)
        print(credit_main.shape)

    credit_main.fillna(0, inplace=True)
    df = pd.merge(df, credit_main, on=['SK_ID_CURR'], how='left', validate='one_to_one')
    return df

def feature_engineer(train, test, clicks, tr_q, ts_q, tr_p, ts_p, pro,
                     p1=False, p2=True, p3=True):
    print("feature_engineer part")
    if p1:
        with timer("clicks part:==========="):
            train, test = clicks_feat(train, test, clicks, tr_q, ts_q, tr_p, ts_p)
    if p2:
        with timer("queries part:==========="):
            train, test = queries_feat(train, test, tr_p, tr_q, ts_p, ts_q, pro)
    if p3:
        with timer("plans part:============="):
            train, test = plans_split_feat(train, test, tr_p, ts_p, tr_q, ts_q)

    with timer("merge full dataset:"):
        train = pd.merge(train, clicks[['sid', 'click_mode', 'click_time']],
                         on=['sid'], how='left')
        train['click_mode'] = train['click_mode'].fillna(0).astype(int)
        train['click_time'].fillna(train['req_time'], inplace=True)
        train.fillna(0, inplace=True)
        test.fillna(0, inplace=True)

    features = [col for col in train.columns
                if col not in ['sid', 'o', 'd', 'click_time', 'click_mode',
                               'req_time', 'plan_time', 'plans']]
    return train, test, features

def clicks_feat(train, test, clicks, train_queries, test_queries, train_plans, test_plans):
    train_queries = get_queries_basic(train_queries)
    test_queries = get_queries_basic(test_queries)
    data = pd.merge(train_queries, clicks, on=['sid'], how='left')
    data.click_mode.fillna(0, inplace=True)
    data.pid.fillna(-1, inplace=True)
    data['values'] = 1

    with timer("time ratio encoding:"):
        key = ['o_geohash', 'd_geohash']
        df = oof_fe_by(data, key, 'cnt')
        tr = pd.merge(train_queries[['sid'] + key], df, on=key, how='left')
        ts = pd.merge(test_queries[['sid'] + key], df, on=key, how='left')
        train = pd.merge(train, tr, on=['sid'], how='left')
        test = pd.merge(test, ts, on=['sid'], how='left')
    return train, test

def data_cleaning(tr_q, tr_p, tr_c):
    data = tr_p.copy()
    with timer("basic:"):
        tr_q = get_queries_basic(tr_q)
        for i in range(1):
            data['recom_mode_' + str(i)] = data['plans'].apply(
                lambda x: x[i]['transport_mode'] if len(x) > i else 0)
            data['recom_price_' + str(i)] = data['plans'].apply(
                lambda x: x[i]['price'] if len(x) > i else 0)
            data['recom_eta_' + str(i)] = data['plans'].apply(
                lambda x: x[i]['eta'] // 60 if len(x) > i else 0)
            data['recom_distance_' + str(i)] = data['plans'].apply(
                lambda x: x[i]['distance'] // 1000 if len(x) > i else 0)
            data['recom_price_' + str(i)].replace("", 0, inplace=True)
            data['recom_price_' + str(i)] = \
                np.where(data['recom_mode_' + str(i)] == 3, -2, data['recom_price_' + str(i)])
            data['recom_price_' + str(i)] = \
                np.where(data['recom_mode_' + str(i)] == 5, -1, data['recom_price_' + str(i)])

        data = data.merge(tr_q, on=['sid'], how='left')
        data = data.merge(tr_c, on=['sid'], how='left')
        data.click_mode.fillna(0, inplace=True)

    id_list = count_list(data)

    tr_q = tr_q.merge(id_list, on=['sid'], how='left')
    tr_q.flag.fillna(0, inplace=True)
    tr_q = tr_q.loc[tr_q.flag == 0].reset_index(drop=True)
    tr_q.drop(['flag'], axis=1, inplace=True)

    tr_p = tr_p.merge(id_list, on=['sid'], how='left')
    tr_p.flag.fillna(0, inplace=True)
    tr_p = tr_p.loc[tr_p.flag == 0].reset_index(drop=True)
    tr_p.drop(['flag'], axis=1, inplace=True)

    tr_c = tr_c.merge(id_list, on=['sid'], how='left')
    tr_c.flag.fillna(0, inplace=True)
    tr_c = tr_c.loc[tr_c.flag == 0].reset_index(drop=True)
    tr_c.drop(['flag'], axis=1, inplace=True)

    return tr_q, tr_p, tr_c

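# `get_datetime`, called by queries_feat below with the 'req_time' column (and
# referenced in a commented-out block of plans_split_feat), is assumed to expand
# a timestamp column into simple calendar features. This is an illustrative
# sketch; the author's exact feature set may differ.
import pandas as pd

def get_datetime(df, col):
    ts = pd.to_datetime(df[col])
    out = pd.DataFrame(index=df.index)
    out[col + '_hour'] = ts.dt.hour
    out[col + '_weekday'] = ts.dt.weekday
    out[col + '_day'] = ts.dt.day
    out[col + '_is_weekend'] = (ts.dt.weekday >= 5).astype(int)
    return out
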
def install_feature(df, Debug=False):
    """Aggregate installments_payments into per-SK_ID_CURR features and merge onto df."""
    if Debug:
        installment = import_data(
            r"D:\Kaggle\MyFirstKaggleCompetition\Data\installments_payments.csv")
        installment = installment.sample(10000)
    else:
        installment = import_data(
            r"D:\Kaggle\MyFirstKaggleCompetition\Data\installments_payments.csv")

    ins_main = df[['SK_ID_CURR']]
    installment['instalment_paid_late_in_days'] = (
        installment['DAYS_ENTRY_PAYMENT'] - installment['DAYS_INSTALMENT'])
    installment['instalment_paid_late'] = installment[
        'instalment_paid_late_in_days'].apply(lambda x: 1 if x > 0 else 0)
    installment['instalment_paid_over_amount'] = (
        installment['AMT_PAYMENT'] - installment['AMT_INSTALMENT'])
    installment['instalment_paid_over'] = installment[
        'instalment_paid_over_amount'].apply(lambda x: 1 if x > 0 else 0)

    with timer("basic stat analysis"):
        ins_main = install_basic(ins_main, installment)
    with timer("advance stat analysis"):
        ins_main = install_advance(ins_main, installment)
    with timer("install prelong analysis"):
        ins_main = install_prelong(ins_main, installment)
    with timer("last k installment analysis"):
        ins_main = install_last_k_feature(ins_main, installment)
    with timer("last k fraction installment analysis"):
        ins_main = install_last_k_fraction_feature(ins_main, installment)
    with timer("last k trend installment analysis"):
        ins_main = install_trend_k_feature(ins_main, installment)

    ins_main.fillna(0, inplace=True)
    df = pd.merge(df, ins_main, how='left', on=['SK_ID_CURR'],
                  validate='one_to_one')
    return df

def queries_feat(train, test, train_plans, train_queries, test_plans, test_queries, profiles):
    with timer("basic:"):
        train_num = train.shape[0]
        df = pd.concat([train[['sid']], test[['sid']]]).reset_index(drop=True)
        plans = pd.concat([train_plans, test_plans]).reset_index(drop=True)
        plans = get_plans_basic(plans)
        queries = pd.concat([train_queries, test_queries]).reset_index(drop=True)
        queries = get_queries_basic(queries)

    with timer("get request time feature"):
        time_feature = get_datetime(queries, 'req_time')
        df[time_feature.columns] = time_feature
        queries[time_feature.columns] = time_feature
        queries.pid.fillna(-1, inplace=True)
        del time_feature
        gc.collect()

    with timer("profiles feature:"):
        pro_feat = get_fe_profiles(profiles)
        pro_feat = pd.merge(queries[['sid', 'pid']], pro_feat, on=['pid'], how='left')
        queries = pd.merge(queries, pro_feat, on=['sid', 'pid'], how='left')
        df = pd.merge(df, pro_feat, on=['sid'], how='left')
        del pro_feat
        gc.collect()

    with timer("time diff feature:"):
        time_feat = time_diff_feat(queries, plans)
        queries[time_feat.columns] = time_feat
        df[time_feat.columns] = time_feat
        del time_feat
        gc.collect()

    with timer("distance feature:"):
        distance_feat = get_fe_different_dist(queries)
        queries[distance_feat.columns] = distance_feat
        df[distance_feat.columns] = distance_feat
        del distance_feat
        gc.collect()

    # queries_new = pd.get_dummies(queries['city'], columns=['city'], dummy_na=False)
    # queries[queries_new.columns] = queries_new
    # df[queries_new.columns] = queries_new

    data = pd.merge(queries, plans, on=['sid'], how='left')
    del plans
    gc.collect()
    queries = queries[[
        'sid', 'o_x', 'o_y', 'd_x', 'd_y', 'o_count_totle', 'd_count_totle', 'city'
    ]]

    df['recom_distance_haversine'] = data['recom_distance_0'] / (data['haversine'] + 0.01)
    df['recom_distance_manhanttan'] = data['recom_distance_0'] / (data['manhattan'] + 0.01)
    df['recom_distance_bearing_array'] = data['recom_distance_0'] / (data['bearing_array'] + 0.01)
    data['recom_distance_haversine'] = data['recom_distance_0'] / (data['haversine'] + 0.01)
    data['recom_distance_manhanttan'] = data['recom_distance_0'] / (data['manhattan'] + 0.01)
    data['recom_distance_bearing_array'] = data['recom_distance_0'] / (data['bearing_array'] + 0.01)

    with timer("get_fe_first_recom_stat feature:"):
        pid_stat = get_fe_first_recom_stat(data)
        df[pid_stat.columns] = pid_stat
    with timer("get_fe_queries_cross_count count:"):
        cross_count = get_fe_queries_cross_count(data)
        df[cross_count.columns] = cross_count
    with timer("get_fe_queries_nunique nunique:"):
        nunique = get_fe_queries_nunique(data)
        df[nunique.columns] = nunique
    with timer("get_fe_location_mode_count mode count:"):
        mode_loc = get_fe_loc_mode_count(data)
        df[mode_loc.columns] = mode_loc

    # with timer("get_fe_each_mode_count:"):
    #     mode_count = get_fe_each_mode_count(data)
    #     df[mode_count.columns] = mode_count
    # with timer("geo time encoding feature:"):
    #     geo_feat = get_fe_encode_geo_time(queries)
    #     df[geo_feat.columns] = geo_feat
    #     del geo_feat
    #     gc.collect()
    # with timer("get_fe_plans_num_counts:"):
    #     num = get_fe_plans_num_counts(data)
    #     df[num.columns] = num

    del data
    gc.collect()

    basic_feat = [
        'sid', 'o_x', 'o_y', 'd_x', 'd_y', 'o_count_totle', 'd_count_totle', 'city'
    ]
    df = pd.merge(df, queries[basic_feat], on=['sid'], how='left')
    df.drop(['sid'], axis=1, inplace=True)
    train[df.columns] = df[:train_num].reset_index(drop=True)
    test[df.columns] = df[train_num:].reset_index(drop=True)
    return train, test

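# queries_feat above consumes 'haversine', 'manhattan' and 'bearing_array'
# columns produced by get_fe_different_dist, which is not shown in this
# section. The sketch below assumes o_x/o_y and d_x/d_y are origin/destination
# longitude/latitude and reproduces the usual great-circle distance features;
# it is illustrative, not the author's exact helper.
import numpy as np
import pandas as pd

def get_fe_different_dist(queries):
    lng1, lat1 = queries['o_x'].values, queries['o_y'].values
    lng2, lat2 = queries['d_x'].values, queries['d_y'].values
    lat1r, lng1r, lat2r, lng2r = map(np.radians, (lat1, lng1, lat2, lng2))
    R = 6371  # mean Earth radius in km

    # great-circle (haversine) distance
    d_lat, d_lng = lat2r - lat1r, lng2r - lng1r
    a = np.sin(d_lat / 2) ** 2 + np.cos(lat1r) * np.cos(lat2r) * np.sin(d_lng / 2) ** 2
    haversine = 2 * R * np.arcsin(np.sqrt(a))

    # manhattan-style approximation: sum of the two axis-aligned haversines
    a_lat = np.sin(d_lat / 2) ** 2
    a_lng = np.cos(lat1r) * np.cos(lat2r) * np.sin(d_lng / 2) ** 2
    manhattan = 2 * R * (np.arcsin(np.sqrt(a_lat)) + np.arcsin(np.sqrt(a_lng)))

    # initial bearing from origin to destination, in degrees
    y = np.sin(d_lng) * np.cos(lat2r)
    x = np.cos(lat1r) * np.sin(lat2r) - np.sin(lat1r) * np.cos(lat2r) * np.cos(d_lng)
    bearing_array = np.degrees(np.arctan2(y, x))

    return pd.DataFrame({'haversine': haversine,
                         'manhattan': manhattan,
                         'bearing_array': bearing_array})
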
def previous_feature(df, Debug=False):
    """Aggregate previous_application into per-SK_ID_CURR features and merge onto df."""
    if Debug:
        prev = import_data(
            "D://Kaggle//MyFirstKaggleCompetition//Data//previous_application.csv")
        prev = prev.sample(10000)
    else:
        prev = import_data(
            "D://Kaggle//MyFirstKaggleCompetition//Data//previous_application.csv")

    prev_main = df[['SK_ID_CURR']]
    key = ['SK_ID_CURR', 'SK_ID_PREV']
    Behaviour_variable = [
        'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT',
        'AMT_GOODS_PRICE', 'RATE_DOWN_PAYMENT', 'CNT_PAYMENT'
    ]
    Days_variable = [
        'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
        'DAYS_TERMINATION', 'DAYS_DECISION'
    ]
    Categorical_variable = [
        'NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
        'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE',
        'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE',
        'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE',
        'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP',
        'PRODUCT_COMBINATION'
    ]

    # 365243 is the sentinel used for missing day values in this table.
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)

    with timer("previous application missing count analysis."):
        prev_main = previous_missing(prev_main, prev, Behaviour_variable + Days_variable)
    with timer("previous all record analysis for amt variable."):
        prev_main = previous_all_stat_amt(prev_main, prev, key + Behaviour_variable)
    with timer("previous all record analysis for day variable."):
        prev_main = previous_all_stat_day(prev_main, prev, key + Days_variable)
    with timer("previous approved analysis for amt variable"):
        prev_main = previous_approved_amt(prev_main, prev, key + Behaviour_variable)
    with timer("previous approved analysis for day variable"):
        prev_main = previous_approved_day(prev_main, prev, key + Days_variable)
    with timer("previous refused analysis for amt variable"):
        prev_main = previous_refused_amt(prev_main, prev, key + Behaviour_variable)
    with timer("previous refused analysis for day variable"):
        prev_main = previous_refused_day(prev_main, prev, key + Days_variable)
    with timer("previous category variable analysis."):
        prev_main = previous_category(prev_main, prev, Categorical_variable)
    with timer("previous last k contract analysis."):
        prev_main = previous_last_k_contract(prev_main, prev)

    prev_main.fillna(0, inplace=True)
    prev_main = correlation_reduce(prev_main)
    df = pd.merge(df, prev_main, on=['SK_ID_CURR'], how='left', validate='one_to_one')
    df = reduce_mem_usage(df)
    return df