Example #1
    # get all product history for the train users; a binary classifier will later be applied to it, then its predictions can be converted to the submission format
    train_user_product_history = prior.loc[prior['user_id'].isin(train['user_id'].unique()), ['user_id', 'product_id']].drop_duplicates(keep='first')
    train = train.merge(train_user_product_history, on=['user_id'], how='right')
    del train_user_product_history
    gc.collect()
    
    # get all product history for the test users; a binary classifier will later be applied to it, then its predictions can be converted to the submission format
    test_user_product_history = prior.loc[prior['user_id'].isin(test['user_id'].unique()), ['user_id', 'product_id']].drop_duplicates(keep='first')
    test = test.merge(test_user_product_history, on=['user_id'], how='right')
    del test_user_product_history
    gc.collect()
    
    train = train.merge(order_products_train, on=['order_id', 'product_id'], how='left')
    
    print(prior.shape, train.shape, test.shape)
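
# Toy illustration (values below are made up, not from the competition files) of what the
# right-merge above does: every (user, product) pair from the user's prior history becomes
# one candidate row attached to that user's train order, ready for binary classification.
import pandas as pd

toy_train = pd.DataFrame({'order_id': [10], 'user_id': [1]})
toy_history = pd.DataFrame({'user_id': [1, 1, 1], 'product_id': [101, 102, 103]})
print(toy_train.merge(toy_history, on=['user_id'], how='right'))
#    order_id  user_id  product_id
# 0        10        1         101
# 1        10        1         102
# 2        10        1         103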
	

with timer('Save transformed data'):
    del prior['eval_set']
    del train['eval_set']
    del test['eval_set']
    gc.collect()
    prior = reduce_memory_usage(prior)
    train = reduce_memory_usage(train)
    test = reduce_memory_usage(test)
    
    prior.to_csv('prior.csv', index=False)
    train.to_csv('train.csv', index=False)
    test.to_csv('test.csv', index=False)
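
# `timer` and `reduce_memory_usage` come from kaggle_learn.utils (imported in the next
# example) and are not defined in these snippets. A minimal sketch of plausible
# implementations, assuming `timer` is a context manager that prints elapsed time and
# `reduce_memory_usage` downcasts numeric columns to smaller dtypes:
import time
from contextlib import contextmanager

import numpy as np
import pandas as pd

@contextmanager
def timer(name):
    # print how long the wrapped block took
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.1f} s')

def reduce_memory_usage(df):
    # downcast integer and float columns to the smallest dtype that holds their values
    for col in df.columns:
        col_type = df[col].dtype
        if np.issubdtype(col_type, np.integer):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif np.issubdtype(col_type, np.floating):
            df[col] = pd.to_numeric(df[col], downcast='float')
    return df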
	
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from kaggle_learn.utils import timer, reduce_memory_usage
from kaggle_learn.feature_engineering.statistics import *

with timer('Load data'):
    hist = pd.read_csv('historical_transactions.csv')

with timer('Reduce memory usage of historical transaction'):
    hist = reduce_memory_usage(hist)

with timer('Get reference date (month)'):
    hist['purchase_month'] = hist['purchase_date'].astype(str).apply(
        lambda x: x[:7] + '-28')
    hist['reference_month'] = pd.to_datetime(
        hist['purchase_month']) - hist['month_lag'].apply(
            lambda x: np.timedelta64(x, 'M'))
    hist['reference_month'] = hist['reference_month'].astype(str).apply(
        lambda x: x[:7])
    hist.drop(['purchase_month'], axis=1, inplace=True)
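
# Worked example of the reference-month logic above, with an illustrative value: a purchase
# in 2018-03 carrying month_lag = -2 implies the card's reference month is two months later,
# i.e. 2018-05. pd.DateOffset is used here only for the illustration, in place of
# np.timedelta64(x, 'M').
example_purchase_month = pd.Timestamp('2018-03-28')
example_month_lag = -2
print((example_purchase_month - pd.DateOffset(months=example_month_lag)).strftime('%Y-%m'))  # 2018-05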

with timer('Convert categorical to int for historical transactions'):
    cols = ['authorized_flag', 'category_1', 'category_3']
    lbl_encoder = LabelEncoder()
    for c in cols:
        hist[c] = lbl_encoder.fit_transform(hist[c].astype(str))

with timer('Generate simple / intermediate features'):
    hist['month_lag=0'] = (hist['month_lag'] == 0).astype(int)
    hist['month_lag=-1'] = (hist['month_lag'] == -1).astype(int)
    hist['month_lag=-2'] = (hist['month_lag'] == -2).astype(int)
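    # The original snippet ends here; a plausible follow-up (an assumption, not shown in the
    # source) is to sum these binary indicators per card to count transactions at each lag:
    month_lag_counts = hist.groupby('card_id')[['month_lag=0', 'month_lag=-1', 'month_lag=-2']].sum()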
import pandas as pd

from instacart_utils import *

%matplotlib inline


with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    products = pd.read_csv('products.csv')
    prior = pd.read_csv('prior.csv')
    print(prior.shape)
	

with timer('Reduce memory usage'):
    memory_usage()
    prior = reduce_memory_usage(prior)
    memory_usage()
	

with timer('Product features'):
    
    # number of times the product was ordered, number of unique users who ordered it, and their ratio
    prior = add_group_count(prior, cols=['product_id'], cname='product_bought_count', value='user_id')
    prior = add_group_nunique(prior, cols=['product_id'], cname='product_bought_user_count', value='user_id')
    prior['product_relative_popularity'] = prior['product_bought_user_count'] / (prior['product_bought_count'] + 1)
    
    # number of times the product was reordered, and its ratio to the total number of orders of the product
    prior = add_group_sum(prior, cols=['product_id'], cname='product_reordered_count', value='reordered')
    prior['product_reordered_ratio'] = prior['product_reordered_count'] / prior['product_bought_count']
    
    # number of times the product was ordered for the 1st time, for the 2nd time, and their ratio
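
# The add_group_* helpers are pulled in via the star imports above (kaggle_learn /
# instacart_utils) and are not defined in these snippets. A minimal sketch of what they
# plausibly do, assuming each is a thin wrapper around groupby(...).transform(...) that
# broadcasts a group statistic back to every row:
def add_group_count(df, cols, cname, value):
    df[cname] = df.groupby(cols)[value].transform('count')
    return df

def add_group_nunique(df, cols, cname, value):
    df[cname] = df.groupby(cols)[value].transform('nunique')
    return df

def add_group_sum(df, cols, cname, value):
    df[cname] = df.groupby(cols)[value].transform('sum')
    return df

def add_group_max(df, cols, cname, value):
    df[cname] = df.groupby(cols)[value].transform('max')
    return df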
import pandas as pd

from instacart_utils import *

%matplotlib inline


with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    products = pd.read_csv('products.csv')
    prior = pd.read_csv('prior.csv')
    print(prior.shape)
	

with timer('Reduce memory usage'):
    memory_usage()
    prior = reduce_memory_usage(prior)
    memory_usage()
	

with timer('User features'):
    # number of user orders
    prior = add_group_max(prior, cols=['user_id'], cname='user_order_count', value='order_number')
    
    # number of products the user ordered (total items), number of unique products, and total reordered items
    prior = add_group_count(prior, cols=['user_id'], cname='user_products_count', value='product_id')
    prior = add_group_nunique(prior, cols=['user_id'], cname='user_products_nunique', value='product_id')
    prior = add_group_sum(prior, cols=['user_id'], cname='user_reordered_sum', value='reordered')
    # total reorders per (user, product), then count the products each user reordered more than once
    gp = prior.groupby(['user_id', 'product_id'])['reordered'].sum().reset_index()
    gp['is_reordered_product'] = (gp['reordered'] > 1).astype(int)
    gp = gp.groupby('user_id')['is_reordered_product'].sum()
    gp_df = pd.DataFrame(gp.values, columns=['user_product_reordered_nunique'])
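    # (assumption) the example is truncated here; presumably the per-user count is merged
    # back onto `prior`, restoring the user_id index that pd.DataFrame(gp.values, ...) drops:
    gp_df['user_id'] = gp.index
    prior = prior.merge(gp_df, on='user_id', how='left')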
Example #5
import gc

import pandas as pd

from instacart_utils import *

%matplotlib inline


with timer('Load data'):
    product_features = pd.read_csv('product_features.csv')
    product_w2v_features = pd.read_csv('product_w2v_features.csv')
    user_features = pd.read_csv('user_features.csv')
    user_product_features = pd.read_csv('user_x_product_features.csv')
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    print(train.shape, test.shape)
	
	
with timer('Reduce memory usage'):
    memory_usage()
    product_features = reduce_memory_usage(product_features)
    product_w2v_features = reduce_memory_usage(product_w2v_features)
    user_features = reduce_memory_usage(user_features)
    user_product_features = reduce_memory_usage(user_product_features)
    train = reduce_memory_usage(train)
    test = reduce_memory_usage(test)
    memory_usage()
	
	
with timer('Merge features'):
    train = train.merge(product_features, on='product_id', how='left')
    test = test.merge(product_features, on='product_id', how='left')
    del product_features
    gc.collect()
    
    train = train.merge(product_w2v_features, on='product_id', how='left')
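    # (assumption) the example is truncated here; the remaining feature tables are presumably
    # merged in the same fashion, keyed on product_id, user_id, and (user_id, product_id):
    test = test.merge(product_w2v_features, on='product_id', how='left')
    del product_w2v_features
    gc.collect()

    train = train.merge(user_features, on='user_id', how='left')
    test = test.merge(user_features, on='user_id', how='left')
    del user_features
    gc.collect()

    train = train.merge(user_product_features, on=['user_id', 'product_id'], how='left')
    test = test.merge(user_product_features, on=['user_id', 'product_id'], how='left')
    del user_product_features
    gc.collect()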