import gc
import pickle
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage
from kaggle_learn.feature_engineering.statistics import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from instacart_utils import *

%matplotlib inline

with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    products = pd.read_csv('products.csv')
    prior = pd.read_csv('prior.csv')
    print(prior.shape)

with timer('Reduce memory usage'):
    memory_usage()
    prior = reduce_memory_usage(prior)
    memory_usage()

with timer('Simple features'):
    prior = add_group_mean(prior, cols=['order_id'], cname='order_product_reordered_mean', value='reordered')
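# --------------------------------------------
# Note: the kaggle_learn helpers used throughout these scripts (timer,
# reduce_memory_usage, memory_usage, and the add_group_* family) are imported
# but not defined here. The sketch below shows what the calls above assume;
# it is a hypothetical approximation, not the actual kaggle_learn code.
# --------------------------------------------
import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took."""
    t0 = time.time()
    yield
    print('[{}] done in {:.1f} s'.format(name, time.time() - t0))

def add_group_mean(df, cols, cname, value):
    """Attach a column `cname` holding the group-wise mean of `value` over `cols`."""
    df[cname] = df.groupby(cols)[value].transform('mean')
    return df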
import os
import gc
import pickle
import time

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Read data'):
    previous_application = pd.read_csv('previous_application.csv')
    print(previous_application.shape)

with timer('Preprocessing'):
    # 365243 is the placeholder used for missing day values in the Home Credit data
    previous_application['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    previous_application['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    previous_application['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    previous_application['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    previous_application['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)

with timer('Simple features'):
import numpy as np
import pandas as pd

from kaggle_learn.utils import timer
from sklearn.decomposition import PCA

with timer('Load data'):
    hist = pd.read_csv('hist_transac_processed.csv')
    print('historical transaction data: {}'.format(hist.shape))
    hist = hist.loc[hist['authorized_flag'] == 1]
    print('historical transaction data (approved): {}'.format(hist.shape))

with timer('Get feature dataframe base'):
    hist_feats = pd.DataFrame(hist.groupby(['card_id']).size()).reset_index()
    hist_feats.columns = ['card_id', 'hist_transac_a_count']

with timer('Transform purchase amount'):
    hist['purchase_amount'] = np.round(hist['purchase_amount'] / 0.00150265118 + 497.06, 2)

with timer('Transaction amount features'):
    for m in ['sum', 'mean', 'max', 'min', 'median', 'std', 'skew']:
        hist_feats['hist_transac_a_amount_{}'.format(m)] = hist.groupby(['card_id'])['purchase_amount'].agg([m]).values
    hist_feats['hist_transac_a_amount_diff'] = hist_feats['hist_transac_a_amount_max'].values - hist_feats['hist_transac_a_amount_min'].values

# adapted from https://www.kaggle.com/fabiendaniel/elo-world?scriptVersionId=8335387
with timer('Transaction amount (monthly) features'):
    grp_1 = hist.groupby(['card_id', 'month_lag'])
    agg_func = {'purchase_amount': ['count', 'sum', 'mean', 'min', 'max', 'std', 'skew']}
    grp_2 = grp_1.agg(agg_func)
    grp_2.columns = ['_'.join(c).strip() for c in grp_2.columns.values]
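    # The referenced kernel continues by aggregating these per-(card_id, month_lag)
    # statistics a second time over card_id and joining them onto the feature frame.
    # The follow-up below is a sketch of that idea; the column prefix
    # 'hist_transac_a_monthly_' is illustrative, not the original naming.
    grp_2 = grp_2.reset_index().drop('month_lag', axis=1)
    grp_3 = grp_2.groupby('card_id').agg(['mean', 'std'])
    grp_3.columns = ['hist_transac_a_monthly_' + '_'.join(c).strip() for c in grp_3.columns.values]
    grp_3 = grp_3.reset_index()
    hist_feats = hist_feats.merge(grp_3, on='card_id', how='left')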
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

from tqdm import tqdm
from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage
from kaggle_learn.feature_engineering.statistics import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from instacart_utils import *

%matplotlib inline

with timer('Load data'):
    train = pd.read_csv('train_processed_0930.csv')
    test = pd.read_csv('test_processed_0930.csv')
    print(train.shape, test.shape)

with timer('Prepare features'):
    features = test.columns.tolist()
    features.remove('order_id')
    features.remove('user_id')
    features.remove('product_id')
    features.remove('order_number')
    features.remove('user_is_not_1st_order_count')
    categorical_features = ['order_dow', 'order_hour_of_day']
    print('Number of features = {}'.format(len(features)))
import os
import gc
import pickle
import time

import numpy as np
import pandas as pd

from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage

with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    order_products_prior = pd.read_csv('order_products__prior.csv')
    order_products_train = pd.read_csv('order_products__train.csv')
    orders = pd.read_csv('orders.csv')
    products = pd.read_csv('products.csv')

with timer('Get prior/train/test data'):
    sub = pd.read_csv('sample_submission.csv')
    prior = orders.loc[orders['eval_set'] == 'prior']
    train = orders.loc[orders['eval_set'] == 'train']
    test = orders.loc[orders['eval_set'] == 'test']
    print(prior.shape, train.shape, test.shape, sub.shape)

with timer('Process prior/train/test'):
    prior = order_products_prior.merge(prior, on=['order_id'], how='left')
    # Collect the full product history of the train users; a binary classifier is
    # later applied to these (user, product) pairs, and the predictions are then
    # converted to the submission format.
    train_user_product_history = prior.loc[prior['user_id'].isin(train['user_id'].unique()), ['user_id', 'product_id']].drop_duplicates(keep='first')
    gp_1 = df.groupby(full_group)[value].count().reset_index()
    gp_1.columns = full_group + ['subgroup_cnt']
    gp_2 = df.groupby(group)[value].count().reset_index()
    gp_2.columns = [group, 'cnt']
    gp_3 = gp_2.merge(gp_1, on=group, how='left')
    gp_3['entropy'] = -np.log(gp_3['subgroup_cnt'] / gp_3['cnt']) * gp_3['subgroup_cnt'] / gp_3['cnt']
    gp_3['entropy'].fillna(0, inplace=True)
    gp_4 = gp_3.groupby(group)['entropy'].sum().reset_index()
    gp_4.columns = [group, cname]
    df_feats = df_feats.merge(gp_4, on=group, how='left')
    return df_feats


with timer('Load data'):
    hist = pd.read_csv('hist_transac_processed.csv')
    print('historical transaction data: {}'.format(hist.shape))
    hist = hist.loc[hist['authorized_flag'] == 1]
    print('historical transaction data (approved): {}'.format(hist.shape))

with timer('Get feature dataframe base'):
    hist_feats = pd.DataFrame(hist.groupby(['card_id']).size()).reset_index()
    hist_feats.columns = ['card_id', 'hist_transac_a_count']

with timer('Basic transaction info features'):
    for c in ['city', 'state', 'merchant_category', 'subsector', 'merchant']:
        hist_feats['hist_transac_a_{}_nunique'.format(c)] = hist.groupby(['card_id'])['{}_id'.format(c)].nunique().values
    hist_feats['hist_transac_a_category_1_1_count'] = hist.groupby(['card_id'])['category_1'].sum().values
    hist_feats['hist_transac_a_category_1_0_count'] = hist_feats['hist_transac_a_count'].values - hist_feats['hist_transac_a_category_1_1_count'].values
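# The block at the top of this script is the tail of a group-entropy helper.
# Assuming a signature along the lines of
#     add_group_entropy(df_feats, df, group, full_group, value, cname)
# (hypothetical, inferred from the variables its body uses), a typical call on
# this data would compute the entropy of each card's merchant-category mix:
hist_feats = add_group_entropy(df_feats=hist_feats,
                               df=hist,
                               group='card_id',
                               full_group=['card_id', 'merchant_category_id'],
                               value='purchase_amount',
                               cname='hist_transac_a_merchant_category_entropy')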
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Load data'):
    new = pd.read_csv('new_merchant_transactions.csv')

with timer('Get reference date (month)'):
    new['purchase_month'] = new['purchase_date'].astype(str).apply(lambda x: x[:10])
    new['reference_month'] = pd.to_datetime(new['purchase_month']) - new['month_lag'].apply(lambda x: np.timedelta64(x, 'M'))
    new['reference_month'] = new['reference_month'].astype(str).apply(lambda x: x[:7])
    new.drop(['purchase_month'], axis=1, inplace=True)

with timer('Convert categorical to int for new transactions'):
    cols = ['authorized_flag', 'category_1', 'category_3']
    lbl_encoder = LabelEncoder()
    for c in cols:
        new[c] = lbl_encoder.fit_transform(new[c].astype(str))

with timer('Generate simple / intermediate features'):
    new['month_lag=1'] = (new['month_lag'] == 1).astype(int)
    new['month_lag=2'] = (new['month_lag'] == 2).astype(int)
    new['category_2=1'] = (new['category_2'] == 1.).astype(int)
    new['category_2=2'] = (new['category_2'] == 2.).astype(int)
    new['category_2=3'] = (new['category_2'] == 3.).astype(int)
import gc
import pickle
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage
from kaggle_learn.feature_engineering.statistics import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from instacart_utils import *

%matplotlib inline

with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    products = pd.read_csv('products.csv')
    prior = pd.read_csv('prior.csv')
    print(prior.shape)

with timer('Reduce memory usage'):
    memory_usage()
    prior = reduce_memory_usage(prior)
    memory_usage()

with timer('User features'):
    # number of user orders
import os
import gc
import pickle
import time

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Read data'):
    bureau = pd.read_csv('bureau.csv')
    bureau_balance = pd.read_csv('bureau_balance.csv')
    print(bureau.shape, bureau_balance.shape)

with timer('Join bureau_balance to bureau'):
    gp = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].count().reset_index()
    gp.columns = ['SK_ID_BUREAU', 'MONTHS_BALANCE_COUNT']
    # MONTHS_BALANCE_COUNT: each SK_ID_CURR corresponds to multiple SK_ID_BUREAU,
    # and each SK_ID_BUREAU has a series of MONTHS_BALANCE records
    bureau = bureau.merge(gp, on=['SK_ID_BUREAU'], how='left')
    del gp; gc.collect()

    gp = add_group_value_count(bureau_balance, cols=['SK_ID_BUREAU', 'STATUS'], value='MONTHS_BALANCE', prefix='BUB_')\
            .drop(['MONTHS_BALANCE', 'STATUS'], axis=1)\
            .drop_duplicates(keep='first')
    bureau = bureau.merge(gp, on=['SK_ID_BUREAU'], how='left')
    del gp; gc.collect()
import os
import gc
import pickle
import time

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Load train and test data'):
    train = pd.read_csv('application_train.csv')
    test = pd.read_csv('application_test.csv')
    ntrain = train.shape[0]
    id_col = 'SK_ID_CURR'
    target = 'TARGET'
    df_full = pd.concat([train, test])
    print(train.shape, test.shape)

with timer('Preprocessing'):
    df_full['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    df_full['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True)
    # drop the 4 training rows with CODE_GENDER == 'XNA'
    df_full = df_full.loc[df_full['CODE_GENDER'] != 'XNA']
    ntrain = ntrain - 4

with timer('Get categorical features'):
    features_category = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
                         'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
                         'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE',
                         'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
# ============================================
# Run a Ridge to get OOF (out-of-fold) predictions
# ============================================

with open('features.pkl', 'rb') as f:
    features = pickle.load(f)

with open('tfidf.pkl', 'rb') as f:
    df_text_processed = pickle.load(f)

with open('df_reduced.pkl', 'rb') as f:
    df_reduced = pickle.load(f)

with timer('Load data'):
    train = pd.read_csv('train.csv.zip', parse_dates=["activation_date"])
    test = pd.read_csv('test.csv.zip', parse_dates=["activation_date"])
    ntrain = train.shape[0]
    ntest = test.shape[0]
    del train, test
    gc.collect()

with timer('Training ridge oof preds'):
    y_train_all = df_reduced['deal_probability'].iloc[:ntrain]
    if os.path.exists('ridge_preds.csv'):
        ridge_preds = pd.read_csv('ridge_preds.csv')
        df_reduced['ridge_preds'] = ridge_preds['ridge_preds'].values
        del ridge_preds
        gc.collect()
        'nthread': 4
    }
    lgb_regressor = lgb.train(params=lgb_params, train_set=X_train_lgb, num_boost_round=1000)
    imp_df = pd.DataFrame()
    imp_df['feature'] = list(X_train.columns)
    imp_df['importance_gain'] = lgb_regressor.feature_importance(importance_type='gain')
    imp_df['importance_split'] = lgb_regressor.feature_importance(importance_type='split')
    return imp_df


with timer('Load data'):
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    hist_transac_info = pd.read_csv('hist_transac_info.csv')
    hist_transac_amount = pd.read_csv('hist_transac_amount.csv')
    hist_transac_time = pd.read_csv('hist_transac_time.csv')
    hist_transac_info_a = pd.read_csv('hist_transac_info_a.csv')
    hist_transac_amount_a = pd.read_csv('hist_transac_amount_a.csv')
    hist_transac_time_a = pd.read_csv('hist_transac_time_a.csv')
    hist_transac_merchant_lda_comp = pd.read_csv('hist_transac_merchant_category_lda_comp_0.csv')
    hist_transac_merchant_lda_comp_2 = pd.read_csv('hist_transac_merchant_category_lda_comp_2.csv')
    hist_transac_merchantid_lda_comp = pd.read_csv('hist_transac_merchant_id_lda_comp_0_1.csv')
    print(hist_transac_info.shape, hist_transac_time.shape,
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

with open('tfidf.pkl', 'rb') as f:
    df_text_processed = pickle.load(f)

with open('features.pkl', 'rb') as f:
    features = pickle.load(f)

with open('categorical_features.pkl', 'rb') as f:
    categorical_features = pickle.load(f)

with open('df_reduced.pkl', 'rb') as f:
    df_reduced = pickle.load(f)

with timer('Prepare for lightgbm'):
    X_train_all_dense = csr_matrix(df_reduced[features].iloc[:ntrain].values)
    X_train_all = hstack([X_train_all_dense, df_text_processed[:ntrain]])
    del X_train_all_dense
    X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=.1, random_state=42)
    X_test = hstack([df_reduced[features].iloc[ntrain:].values, df_text_processed[ntrain:]])
    all_features = features + vocab
import numpy as np
import pandas as pd

from kaggle_learn.utils import timer

with timer('Load data'):
    hist = pd.read_csv('hist_transac_processed.csv', usecols=['card_id', 'merchant_id'])
    print('historical transaction data: {}'.format(hist.shape))

with timer('Merchant customer count'):
    hist_merchant = hist.groupby(['merchant_id']).size().reset_index()
    hist_merchant.columns = ['merchant_id', 'merchant_customer_count']
    print(hist_merchant.shape)

with timer('Merchant repurchase customer count'):
    hist_merchant_card = hist.groupby(['merchant_id', 'card_id']).size().reset_index()
    hist_merchant_card.columns = ['merchant_id', 'card_id', 'customer_visit_count']
    print(hist_merchant_card.shape)
    hist_merchant_card = hist_merchant_card.loc[hist_merchant_card['customer_visit_count'] > 1]
    print(hist_merchant_card.shape)

    # binary count
    hist_merchant_repurchase_binary = hist_merchant_card.groupby(['merchant_id']).size().reset_index()
    hist_merchant_repurchase_binary.columns = ['merchant_id', 'revisited_customers']
    hist_merchant_repurchase_binary['revisited_customers'].fillna(0.0, inplace=True)
    print(hist_merchant_repurchase_binary.shape)
    print(hist_merchant_repurchase_binary.head())

    # exact count
    hist_merchant_repurchase_exact = hist_merchant_card.groupby(['merchant_id'])['customer_visit_count'].sum().reset_index()
    hist_merchant_repurchase_exact.columns = ['merchant_id', 'revisited_count']
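    # Illustrative follow-up (not part of the original script): merge the revisit
    # counts back onto the per-merchant totals and derive a repurchase ratio.
    # Note that merchant_customer_count above is a row (transaction) count per
    # merchant, so the ratio compares revisiting cards to total transactions.
    hist_merchant = (hist_merchant
                     .merge(hist_merchant_repurchase_binary, on='merchant_id', how='left')
                     .merge(hist_merchant_repurchase_exact, on='merchant_id', how='left'))
    hist_merchant[['revisited_customers', 'revisited_count']] = \
        hist_merchant[['revisited_customers', 'revisited_count']].fillna(0.0)
    hist_merchant['merchant_repurchase_rate'] = (
        hist_merchant['revisited_customers'] / hist_merchant['merchant_customer_count'])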
import gc
import pickle
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage
from kaggle_learn.feature_engineering.statistics import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from instacart_utils import *

%matplotlib inline

with timer('Load data'):
    product_features = pd.read_csv('product_features.csv')
    product_w2v_features = pd.read_csv('product_w2v_features.csv')
    user_features = pd.read_csv('user_features.csv')
    user_product_features = pd.read_csv('user_x_product_features.csv')
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    print(train.shape, test.shape)

with timer('Reduce memory usage'):
    memory_usage()
    product_features = reduce_memory_usage(product_features)
    product_w2v_features = reduce_memory_usage(product_w2v_features)
    user_features = reduce_memory_usage(user_features)
    user_product_features = reduce_memory_usage(user_product_features)
import numpy as np
import pandas as pd

from kaggle_learn.utils import timer

with timer('Load data'):
    new = pd.read_csv('new_transac_processed.csv')
    print('new transaction data: {}'.format(new.shape))

with timer('Get feature dataframe base'):
    new_feats = pd.DataFrame(new.groupby(['card_id']).size()).reset_index()
    new_feats.columns = ['card_id', 'new_transac_count']

with timer('Transform purchase amount'):
    new['purchase_amount'] = np.round(new['purchase_amount'] / 0.00150265118 + 497.06, 2)

with timer('Transaction amount features'):
    for m in ['sum', 'mean', 'max', 'min', 'median', 'std', 'skew']:
        new_feats['new_transac_amount_{}'.format(m)] = new.groupby(['card_id'])['purchase_amount'].agg([m]).values
    new_feats['new_transac_amount_diff'] = new_feats['new_transac_amount_max'].values - new_feats['new_transac_amount_min'].values

with timer('Transaction (time related) features'):
    new_monthsum_amount = new.groupby(['card_id', 'month_lag'])['purchase_amount'].sum().unstack().reset_index()
    new_feats['new_transac_monthlag_last_1_amount'] = new_monthsum_amount.iloc[:,
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from kaggle_learn.utils import timer

with open("df_full.pkl", "rb") as f:
    df_full = pickle.load(f)

with open("features_category.pkl", "rb") as f:
    features_category = pickle.load(f)

with open("features.pkl", "rb") as f:
    features = pickle.load(f)

ntrain = 307507

with timer('Prepare sparse matrices for XGBoost'):
    features_numeric = list(set(features) - set(features_category))
    oh_encoder = OneHotEncoder()
    sparse_df = oh_encoder.fit_transform(df_full[features_category].values)
    sparse_df = csr_matrix(hstack((sparse_df, df_full[features_numeric].replace([np.inf, -np.inf], np.nan))))
    sparse_train_all = sparse_df[:ntrain]
    sparse_test = sparse_df[ntrain:]
    del sparse_df
    gc.collect()

with timer('Prepare / Train for XGBoost'):
    print('df_full shape = {}'.format(df_full.shape))
    fold = KFold(n_splits=5, shuffle=True, random_state=42)
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from kaggle_learn.utils import timer, reduce_memory_usage
from kaggle_learn.feature_engineering.statistics import *

with timer('Load data'):
    hist = pd.read_csv('historical_transactions.csv')

with timer('Reduce memory usage of historical transaction'):
    hist = reduce_memory_usage(hist)

with timer('Get reference date (month)'):
    hist['purchase_month'] = hist['purchase_date'].astype(str).apply(lambda x: x[:7] + '-28')
    hist['reference_month'] = pd.to_datetime(hist['purchase_month']) - hist['month_lag'].apply(lambda x: np.timedelta64(x, 'M'))
    hist['reference_month'] = hist['reference_month'].astype(str).apply(lambda x: x[:7])
    hist.drop(['purchase_month'], axis=1, inplace=True)

with timer('Convert categorical to int for historical transactions'):
    cols = ['authorized_flag', 'category_1', 'category_3']
    lbl_encoder = LabelEncoder()
    for c in cols:
        hist[c] = lbl_encoder.fit_transform(hist[c].astype(str))

with timer('Generate simple / intermediate features'):
    hist['month_lag=0'] = (hist['month_lag'] == 0).astype(int)
    hist['month_lag=-1'] = (hist['month_lag'] == -1).astype(int)
    hist['month_lag=-2'] = (hist['month_lag'] == -2).astype(int)
import os
import gc
import pickle
import time

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Read data'):
    credit_card_balance = pd.read_csv('credit_card_balance.csv')
    print(credit_card_balance.shape)

with timer('Simple features'):
    credit_card_balance['CB_BALANCE_LIMIT_RATIO'] = credit_card_balance['AMT_BALANCE'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
    credit_card_balance['CB_ATM_DRAWING_BALANCE_RATIO'] = credit_card_balance['AMT_DRAWINGS_ATM_CURRENT'] / credit_card_balance['AMT_BALANCE']
    credit_card_balance['CB_ATM_DRAWING_LIMIT_RATIO'] = credit_card_balance['AMT_DRAWINGS_ATM_CURRENT'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
    credit_card_balance['CB_DRAWING_BALANCE_RATIO'] = credit_card_balance['AMT_DRAWINGS_CURRENT'] / credit_card_balance['AMT_BALANCE']
    credit_card_balance['CB_DRAWING_LIMIT_RATIO'] = credit_card_balance['AMT_DRAWINGS_CURRENT'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
    credit_card_balance['CB_POS_DRAWING_BALANCE_RATIO'] = credit_card_balance['AMT_DRAWINGS_POS_CURRENT'] / credit_card_balance['AMT_BALANCE']
    credit_card_balance['CB_POS_DRAWING_LIMIT_RATIO'] = credit_card_balance['AMT_DRAWINGS_POS_CURRENT'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
    credit_card_balance['CB_DRAWING_ATM_RATIO'] = credit_card_balance['AMT_DRAWINGS_ATM_CURRENT'] / credit_card_balance['AMT_DRAWINGS_CURRENT']
    credit_card_balance['CB_DRAWING_POS_RATIO'] = credit_card_balance['AMT_DRAWINGS_POS_CURRENT'] / credit_card_balance['AMT_DRAWINGS_CURRENT']
    credit_card_balance['CB_MIN_INSTALLMENT_BALANCE_RATIO'] = credit_card_balance['AMT_INST_MIN_REGULARITY'] / credit_card_balance['AMT_BALANCE']
    credit_card_balance['CB_MIN_INSTALLMENT_DRAWING_RATIO'] = credit_card_balance['AMT_INST_MIN_REGULARITY'] / credit_card_balance['AMT_DRAWINGS_CURRENT']
    credit_card_balance['CB_PAYMENT_BALANCE_RATIO'] = credit_card_balance['AMT_PAYMENT_TOTAL_CURRENT'] / credit_card_balance['AMT_BALANCE']
    credit_card_balance['CB_PAYMENT_LIMIT_RATIO'] = credit_card_balance['AMT_PAYMENT_TOTAL_CURRENT'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
    credit_card_balance['CB_RECEIVABLE_PRINCIPAL_TOTAL_RATIO'] = credit_card_balance['AMT_RECEIVABLE_PRINCIPAL'] / credit_card_balance['AMT_TOTAL_RECEIVABLE']
import os
import gc
import pickle
import time

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Read data'):
    pos_cash_balance = pd.read_csv('POS_CASH_balance.csv')
    print(pos_cash_balance.shape)

with timer('Last month data'):
    pos_cash_balance_last_month = pos_cash_balance.sort_values('MONTHS_BALANCE').groupby(['SK_ID_PREV', 'SK_ID_CURR']).tail(1)
    print(pos_cash_balance_last_month.shape)
    pos_cash_balance_last_month = add_group_min(pos_cash_balance_last_month, cols=['SK_ID_CURR'], cname='PC_EARLIEST_MONTH', value='MONTHS_BALANCE')
    pos_cash_balance_last_month = add_group_max(pos_cash_balance_last_month, cols=['SK_ID_CURR'], cname='PC_LATEST_MONTH', value='MONTHS_BALANCE')
    pos_cash_balance_last_month = add_group_sum(pos_cash_balance_last_month, cols=['SK_ID_CURR'],
import gc
import pickle
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage
from kaggle_learn.feature_engineering.statistics import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from instacart_utils import *

%matplotlib inline

with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    products = pd.read_csv('products.csv')
    prior = pd.read_csv('prior.csv')
    print(prior.shape)

with timer('Reduce memory usage'):
    memory_usage()
    prior = reduce_memory_usage(prior)
    memory_usage()

with timer('Product features'):
import numpy as np
import pandas as pd

from kaggle_learn.utils import timer

with timer('Load data'):
    hist = pd.read_csv('hist_transac_processed.csv')
    print('historical transaction data: {}'.format(hist.shape))

with timer('Get feature dataframe base'):
    hist_feats = pd.DataFrame(hist.groupby(['card_id']).size()).reset_index()
    hist_feats.columns = ['card_id', 'hist_transac_count']

with timer('Transform purchase amount'):
    hist['purchase_amount'] = np.round(hist['purchase_amount'] / 0.00150265118 + 497.06, 2)

with timer('Transaction time features'):
    for m in ['nunique', 'mean', 'std', 'min', 'skew']:
        hist_feats['hist_transac_monthlag_{}'.format(m)] = hist.groupby(['card_id'])['month_lag'].agg([m]).values
    hist_feats['hist_purchase_date_last'] = hist.groupby(['card_id'])['purchase_date'].max().values
    hist_feats['hist_purchase_date_first'] = hist.groupby(['card_id'])['purchase_date'].min().values
    hist_feats['hist_purchase_date_diff_day'] = (pd.to_datetime(hist_feats['hist_purchase_date_last']) - pd.to_datetime(hist_feats['hist_purchase_date_first'])).dt.days.values
    hist_feats['hist_purchase_count_ratio'] = hist_feats['hist_transac_count'].values / (1. + hist_feats['hist_purchase_date_diff_day'].values)
import os
import gc
import pickle
import time

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Read data'):
    installments_payments = pd.read_csv('installments_payments.csv')
    print(installments_payments.shape)

with timer('Find last instalment'):
    installments_payments = add_group_max(installments_payments, cols=['SK_ID_CURR'], cname='LAST_INSTALMENT', value='DAYS_INSTALMENT')
    installments_payments['IS_LAST_INSTALMENT'] = (installments_payments['DAYS_INSTALMENT'] == installments_payments['LAST_INSTALMENT'])
    installments_payments.loc[installments_payments['IS_LAST_INSTALMENT']][['SK_ID_CURR', 'SK_ID_PREV']].drop_duplicates(keep='last').to_csv('last_instalment_id.csv', index=False)

with timer('Extract last instalment features'):
    installments_payments_last = installments_payments.loc[installments_payments['IS_LAST_INSTALMENT']]
    installments_payments_last.drop(['SK_ID_PREV', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER', 'LAST_INSTALMENT', 'IS_LAST_INSTALMENT'], axis=1, inplace=True)
    installments_payments_last.columns = ['SK_ID_CURR'] + ['IP_LAST_' + c for c in installments_payments_last.columns.tolist()[1:]]
    print('installments_payments_last shape = {}'.format(installments_payments_last.shape))
    installments_payments_last = installments_payments_last.groupby(['SK_ID_CURR']).mean().reset_index()
    print('installments_payments_last shape = {}'.format(installments_payments_last.shape))

with timer('Simple features'):
    installments_payments_last['IP_LAST_DPD'] = installments_payments_last['IP_LAST_DAYS_ENTRY_PAYMENT'] - installments_payments_last['IP_LAST_DAYS_INSTALMENT']
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('categorical_feature', help='categorical feature name')
    parser.add_argument('ngram', help='max ngram')
    parser.add_argument('max_features', help='maximum number of features')
    parser.add_argument('n_components', help='number of components')
    args = parser.parse_args()

    feat = str(args.categorical_feature)
    ngram = int(args.ngram)
    max_features = int(args.max_features)
    n_components = int(args.n_components)

    with timer('Load data'):
        hist = pd.read_csv('hist_transac_processed.csv')
        print(hist.shape)

    with timer('Convert {} to sequence'.format(feat)):
        hist[feat] = hist[feat].astype(str)
        hist_feat_seq = hist.sort_values('purchase_date').groupby('card_id')[feat].apply(list)
        hist_feat_seq = hist_feat_seq.reset_index()
        hist_feat_seq.columns = ['card_id', 'hist_{}_seq'.format(feat)]
        hist_feat_seq['hist_{}_seq'.format(feat)] = hist_feat_seq['hist_{}_seq'.format(feat)].apply(lambda x: ' '.join(x))

    with timer('Vectorizing {} sequence'.format(feat)):
        vectorizer = CountVectorizer(token_pattern=r'\w+', ngram_range=(1, ngram),
    gp_2 = df.groupby(group)[value].count().reset_index()
    gp_2.columns = [group, 'cnt']
    gp_3 = gp_2.merge(gp_1, on=group, how='left')
    gp_3['entropy'] = -np.log(gp_3['subgroup_cnt'] / gp_3['cnt']) * gp_3['subgroup_cnt'] / gp_3['cnt']
    gp_3['entropy'].fillna(0, inplace=True)
    gp_4 = gp_3.groupby(group)['entropy'].sum().reset_index()
    gp_4.columns = [group, cname]
    df_feats = df_feats.merge(gp_4, on=group, how='left')
    return df_feats


with timer('Load data'):
    new = pd.read_csv('new_transac_processed.csv')
    print('new transaction data: {}'.format(new.shape))

with timer('Get feature dataframe base'):
    new_feats = pd.DataFrame(new.groupby(['card_id']).size()).reset_index()
    new_feats.columns = ['card_id', 'new_transac_count']

with timer('Basic new transaction features'):
    for c in ['city', 'state', 'merchant_category', 'subsector', 'merchant']:
        new_feats['new_transac_{}_nunique'.format(c)] = new.groupby(['card_id'])['{}_id'.format(c)].nunique().values
    new_feats['new_transac_category_1_1_count'] = new.groupby(['card_id'])['category_1'].sum().values
    new_feats['new_transac_category_1_0_count'] = new_feats[
    sentences = [x.split(' ') for x in hist.values]
    n_features = 500
    w2v = Word2Vec(sentences=sentences, min_count=1, size=n_features)
    w2v_features = apply_w2v(sentences, w2v, n_features)
    cluster_labels = KMeans(n_clusters=20).fit(w2v_features).labels_
    cluster_labels = pd.Series(cluster_labels, name=category + '_cluster', index=gp_index)
    return df[category].map(cluster_labels).fillna(-1).astype(int)


# ============================================
# Feature Extraction
# ============================================

with timer('Load data'):
    train = pd.read_csv('train.csv.zip', parse_dates=["activation_date"])
    test = pd.read_csv('test.csv.zip', parse_dates=["activation_date"])
    ntrain = train.shape[0]
    ntest = test.shape[0]
    train['num_na'] = train.isnull().sum(axis=1)
    test['num_na'] = test.isnull().sum(axis=1)
    df = pd.concat([train, test], axis=0)
    del train, test
    gc.collect()

with timer('Simple feature engineering'):
    df['dow'] = df['activation_date'].dt.weekday
    df['dom'] = df['activation_date'].dt.day
    df['param_1_len'] = df['param_1'].apply(lambda x: calc_len(x))
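# apply_w2v is called in the clustering helper above but not defined in this
# fragment. A common implementation represents each token sequence by the mean
# of its word vectors; the sketch below assumes that behaviour (the function
# name and signature follow the call above, the averaging itself is an assumption).
import numpy as np

def apply_w2v(sentences, model, num_features):
    def sentence_vector(tokens):
        # average the vectors of tokens the model knows about
        vecs = [model.wv[t] for t in tokens if t in model.wv]
        if not vecs:
            return np.zeros(num_features)
        return np.mean(vecs, axis=0)
    return np.array([sentence_vector(tokens) for tokens in sentences])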
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from kaggle_learn.utils import timer

with open("df_full.pkl", "rb") as f:
    df_full = pickle.load(f)

with open("features_category.pkl", "rb") as f:
    features_category = pickle.load(f)

with open("features.pkl", "rb") as f:
    features = pickle.load(f)

ntrain = 307507

with timer('Prepare / Train for LightGBM'):
    print('df_full shape = {}'.format(df_full.shape))
    X_train_all = df_full.iloc[:ntrain][features].replace([np.inf, -np.inf], np.nan)
    y_train_all = df_full.iloc[:ntrain][target].replace([np.inf, -np.inf], np.nan)
    fold = KFold(n_splits=10, shuffle=True, random_state=42)
    X_test = df_full.iloc[ntrain:][features].replace([np.inf, -np.inf], np.nan)
    sub = pd.read_csv('sample_submission.csv')
    oof_preds_lgb_1 = np.zeros(X_train_all.shape[0])
    sub_preds_lgb_1 = np.zeros(X_test.shape[0])

    for n_fold, (trn_idx, val_idx) in enumerate(fold.split(X_train_all, y_train_all)):
        X_train = X_train_all.iloc[trn_idx]