def _attach_product_history(orders, prior_orders):
    """Expand `orders` to one row per (user, previously bought product).

    The distinct (user_id, product_id) pairs of the users present in
    `orders` are taken from `prior_orders` and right-joined on `user_id`,
    so every historical product of a user yields one candidate row.
    """
    known_users = orders['user_id'].unique()
    user_rows = prior_orders['user_id'].isin(known_users)
    history = prior_orders.loc[user_rows, ['user_id', 'product_id']]
    history = history.drop_duplicates(keep='first')
    return orders.merge(history, on=['user_id'], how='right')


# Candidate rows for the train users: every product they bought before.
# A binary classifier is applied to these rows later, and its predictions
# are then converted to the submission format.
train = _attach_product_history(train, prior)
gc.collect()

# Same candidate expansion for the test users.
test = _attach_product_history(test, prior)
gc.collect()

# Bring in the columns of order_products_train for the (order, product)
# pairs that appear in the train orders; other candidates get missing values.
train = train.merge(order_products_train, on=['order_id', 'product_id'], how='left')
print(prior.shape, train.shape, test.shape)

with timer('Save transformed data'):
    # 'eval_set' is dropped from all three frames before they are written.
    for frame in (prior, train, test):
        del frame['eval_set']
    gc.collect()

    # Rebind one frame at a time so a replaced frame can be freed before
    # the next one is processed.
    prior = reduce_memory_usage(prior)
    train = reduce_memory_usage(train)
    test = reduce_memory_usage(test)

    for csv_name, frame in (('prior.csv', prior),
                            ('train.csv', train),
                            ('test.csv', test)):
        frame.to_csv(csv_name, index=False)
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from kaggle_learn.utils import timer, reduce_memory_usage
from kaggle_learn.feature_engineering.statistics import *


def _reference_month(purchase_date, month_lag):
    """Return the reference month ('YYYY-MM') for each transaction.

    The reference month is the purchase month minus `month_lag` months.
    It is computed with integer month arithmetic instead of subtracting
    `np.timedelta64(lag, 'M')`: a month is not a fixed duration, so that
    unit is only an average-length approximation in older pandas and is
    rejected outright by pandas 2.x.

    Args:
        purchase_date: Series whose string form starts with 'YYYY-MM'.
        month_lag: integer Series of month offsets (no missing values).

    Returns:
        Series of 'YYYY-MM' strings aligned with the inputs.
    """
    purchase_month = pd.to_datetime(purchase_date.astype(str).str[:7],
                                    format='%Y-%m')
    # Months counted from year 0, so shifting by a lag is plain subtraction.
    month_index = (purchase_month.dt.year * 12
                   + (purchase_month.dt.month - 1)
                   - month_lag.astype('int64'))
    year = month_index // 12
    month = month_index % 12 + 1
    return year.astype(str) + '-' + month.astype(str).str.zfill(2)


with timer('Load data'):
    hist = pd.read_csv('historical_transactions.csv')

with timer('Reduce memory usage of historical transaction'):
    hist = reduce_memory_usage(hist)

with timer('Get reference date (month)'):
    hist['reference_month'] = _reference_month(hist['purchase_date'],
                                               hist['month_lag'])

with timer('Convert categorical to int for historical transactions'):
    cols = ['authorized_flag', 'category_1', 'category_3']
    lbl_encoder = LabelEncoder()
    for c in cols:
        # fit_transform refits the encoder, so each column gets its own codes.
        hist[c] = lbl_encoder.fit_transform(hist[c].astype(str))

with timer('Generate simple / intermediate features'):
    # 0/1 indicator columns for the three most recent month lags.
    hist['month_lag=0'] = (hist['month_lag'] == 0).astype(int)
    hist['month_lag=-1'] = (hist['month_lag'] == -1).astype(int)
    hist['month_lag=-2'] = (hist['month_lag'] == -2).astype(int)
from instacart_utils import *

# Script form of the `%matplotlib inline` notebook magic (requires IPython).
get_ipython().run_line_magic('matplotlib', 'inline')

with timer('Load data'):
    aisles, departments, products, prior = (
        pd.read_csv(csv_name)
        for csv_name in ('aisles.csv', 'departments.csv',
                         'products.csv', 'prior.csv')
    )
    print(prior.shape)

with timer('Reduce memory usage'):
    # memory_usage() is called before and after the reduction.
    memory_usage()
    prior = reduce_memory_usage(prior)
    memory_usage()

with timer('Product features'):
    # NOTE(review): the add_group_* helpers come from instacart_utils;
    # presumably each adds a column `cname` holding the named aggregate of
    # `value` per group of `cols` -- confirm against the helper definitions.

    # How often each product was ordered, by how many distinct users, and
    # the ratio of the two (the denominator is shifted by one).
    prior = add_group_count(
        prior,
        cols=['product_id'],
        cname='product_bought_count',
        value='user_id',
    )
    prior = add_group_nunique(
        prior,
        cols=['product_id'],
        cname='product_bought_user_count',
        value='user_id',
    )
    prior['product_relative_popularity'] = (
        prior['product_bought_user_count']
        / (prior['product_bought_count'] + 1)
    )

    # How often each product was reordered, and that count relative to the
    # number of times it was ordered.
    prior = add_group_sum(
        prior,
        cols=['product_id'],
        cname='product_reordered_count',
        value='reordered',
    )
    prior['product_reordered_ratio'] = (
        prior['product_reordered_count'] / prior['product_bought_count']
    )

    # number of being 1st time ordered, 2nd time ordered, and the ratio
from instacart_utils import *

# Script form of the `%matplotlib inline` notebook magic (requires IPython).
get_ipython().run_line_magic('matplotlib', 'inline')

with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    products = pd.read_csv('products.csv')
    prior = pd.read_csv('prior.csv')
    print(prior.shape)

with timer('Reduce memory usage'):
    memory_usage()
    prior = reduce_memory_usage(prior)
    memory_usage()

with timer('User features'):
    # NOTE(review): the add_group_* helpers come from instacart_utils;
    # presumably each adds a column `cname` holding the named aggregate of
    # `value` per group of `cols` -- confirm against the helper definitions.

    # Number of orders per user (largest order_number seen for the user).
    prior = add_group_max(prior, cols=['user_id'],
                          cname='user_order_count', value='order_number')

    # Per user: ordered product rows, distinct products, reordered rows.
    prior = add_group_count(prior, cols=['user_id'],
                            cname='user_products_count', value='product_id')
    prior = add_group_nunique(prior, cols=['user_id'],
                              cname='user_products_nunique', value='product_id')
    prior = add_group_sum(prior, cols=['user_id'],
                          cname='user_reordered_sum', value='reordered')

    # Per user: number of distinct products that were reordered.
    gp = prior.groupby(['user_id', 'product_id'])['reordered'].sum().reset_index()
    # A product counts as reordered once its summed `reordered` flag is at
    # least 1. The previous `> 1` threshold required two reorders and so
    # dropped products that were reordered exactly once.
    gp['is_reordered_product'] = (gp['reordered'] > 0).astype(int)
    gp = gp.groupby('user_id')['is_reordered_product'].sum()
    gp_df = pd.DataFrame(gp.values, columns=['user_product_reordered_nunique'])
# Script form of the `%matplotlib inline` notebook magic (requires IPython).
get_ipython().run_line_magic('matplotlib', 'inline')

with timer('Load data'):
    (product_features,
     product_w2v_features,
     user_features,
     user_product_features,
     train,
     test) = (
        pd.read_csv(csv_name)
        for csv_name in ('product_features.csv',
                         'product_w2v_features.csv',
                         'user_features.csv',
                         'user_x_product_features.csv',
                         'train.csv',
                         'test.csv')
    )
    print(train.shape, test.shape)

with timer('Reduce memory usage'):
    # memory_usage() is called before and after the reductions. Frames are
    # rebound one at a time so each replaced frame can be freed early.
    memory_usage()
    product_features = reduce_memory_usage(product_features)
    product_w2v_features = reduce_memory_usage(product_w2v_features)
    user_features = reduce_memory_usage(user_features)
    user_product_features = reduce_memory_usage(user_product_features)
    train = reduce_memory_usage(train)
    test = reduce_memory_usage(test)
    memory_usage()

with timer('Merge features'):
    # Left joins keep every train/test row even when a product has no
    # matching feature row.
    train = pd.merge(train, product_features, on='product_id', how='left')
    test = pd.merge(test, product_features, on='product_id', how='left')
    # The product feature table is no longer needed once both joins are done.
    del product_features
    gc.collect()

    train = pd.merge(train, product_w2v_features, on='product_id', how='left')