import gc
import pickle
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage
from kaggle_learn.feature_engineering.statistics import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from instacart_utils import *

%matplotlib inline


with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    products = pd.read_csv('products.csv')
    prior = pd.read_csv('prior.csv')
    print(prior.shape)
	

with timer('Reduce memory usage'):
    memory_usage()
    prior = reduce_memory_usage(prior)
    memory_usage()
	
	
with timer('Simple features'):
    prior = add_group_mean(prior, cols=['order_id'], cname='order_product_reordered_mean', value='reordered')
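
# The add_group_* helpers used above come from kaggle_learn (star-imported).
# A minimal sketch of what add_group_mean presumably does, judging only from
# how it is called here (not the library's actual source):
def add_group_mean_sketch(df, cols, cname, value):
    """Attach the group-wise mean of `value` (grouped by `cols`) as column `cname`."""
    df[cname] = df.groupby(cols)[value].transform('mean')
    return df


# ============================================
# Example #2
# ============================================
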
import os
import gc
import pickle
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Read data'):
    previous_application = pd.read_csv('previous_application.csv')
    print(previous_application.shape)

with timer('Preprocessing'):
    # 365243 is the dataset's placeholder for missing day values
    for c in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
              'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        previous_application[c].replace(365243, np.nan, inplace=True)

with timer('Simple features'):
    pass  # snippet truncated in the source

# ============================================
# Example #3
# ============================================

import numpy as np
import pandas as pd
from kaggle_learn.utils import timer
from sklearn.decomposition import PCA

with timer('Load data'):
    hist = pd.read_csv('hist_transac_processed.csv')
    print('historical transaction data: {}'.format(hist.shape))
    hist = hist.loc[hist['authorized_flag'] == 1]
    print('historical transaction data (approved): {}'.format(hist.shape))

with timer('Get feature dataframe base'):
    hist_feats = pd.DataFrame(hist.groupby(['card_id']).size()).reset_index()
    hist_feats.columns = ['card_id', 'hist_transac_a_count']

with timer('Transform purchase amount'):
    # undo the anonymization of purchase_amount (constants widely used in public Elo kernels)
    hist['purchase_amount'] = np.round(hist['purchase_amount'] / 0.00150265118 + 497.06, 2)

with timer('Transaction amount features'):
    for m in ['sum', 'mean', 'max', 'min', 'median', 'std', 'skew']:
        hist_feats['hist_transac_a_amount_{}'.format(m)] = hist.groupby(['card_id'])['purchase_amount'].agg([m]).values

    hist_feats['hist_transac_a_amount_diff'] = hist_feats['hist_transac_a_amount_max'].values - hist_feats['hist_transac_a_amount_min'].values

# adapted from https://www.kaggle.com/fabiendaniel/elo-world?scriptVersionId=8335387
with timer('Transaction amount (monthly) features'):
    grp_1 = hist.groupby(['card_id', 'month_lag'])
    agg_func = {'purchase_amount': ['count', 'sum', 'mean', 'min', 'max', 'std', 'skew']}
    grp_2 = grp_1.agg(agg_func)
    grp_2.columns = ['_'.join(c).strip() for c in grp_2.columns.values]
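
    # The snippet stops here; in the referenced kernel the per-(card_id, month_lag)
    # stats are then collapsed back to one row per card_id. A sketch of that step,
    # assumed to follow the kernel linked above:
    grp_2 = grp_2.reset_index().drop('month_lag', axis=1)
    grp_3 = grp_2.groupby('card_id').agg(['mean', 'std'])
    grp_3.columns = ['hist_transac_a_month_' + '_'.join(c) for c in grp_3.columns.values]
    hist_feats = hist_feats.merge(grp_3.reset_index(), on='card_id', how='left')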

# ============================================
# Example #4
# ============================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

from tqdm import tqdm
from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage
from kaggle_learn.feature_engineering.statistics import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from instacart_utils import *

%matplotlib inline


with timer('Load data'):
    train = pd.read_csv('train_processed_0930.csv')
    test = pd.read_csv('test_processed_0930.csv')
    print(train.shape, test.shape)
	

with timer('Prepare features'):
    features = test.columns.tolist()
    features.remove('order_id')
    features.remove('user_id')
    features.remove('product_id')
    features.remove('order_number')
    features.remove('user_is_not_1st_order_count')
    categorical_features = ['order_dow', 'order_hour_of_day']
    print('Number of features = {}'.format(len(features)))
	

# ============================================
# Example #5
# ============================================

import os
import gc
import pickle
import time
import numpy as np
import pandas as pd
from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage


with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    order_products_prior = pd.read_csv('order_products__prior.csv')
    order_products_train = pd.read_csv('order_products__train.csv')
    orders = pd.read_csv('orders.csv')
    products = pd.read_csv('products.csv')
	
	
with timer('Get prior/train/test data'):
    sub = pd.read_csv('sample_submission.csv')
    prior = orders.loc[orders['eval_set'] == 'prior']
    train = orders.loc[orders['eval_set'] == 'train']
    test = orders.loc[orders['eval_set'] == 'test']
    print(prior.shape, train.shape, test.shape, sub.shape)
	

with timer('Process prior/train/test'):
    prior = order_products_prior.merge(prior, on=['order_id'], how='left')
    
    # get the full product history of train users; a binary classifier will be
    # applied to it later and its predictions converted to the submission format
    train_user_product_history = prior.loc[prior['user_id'].isin(train['user_id'].unique()), ['user_id', 'product_id']].drop_duplicates(keep='first')

# ============================================
# Example #6
# ============================================

# The head of this entropy helper was lost in extraction; the signature below
# is a reconstruction assumed from the body (argument names are guesses):
def add_group_entropy(df, df_feats, group, subgroup, value, cname):
    full_group = [group, subgroup]
    gp_1 = df.groupby(full_group)[value].count().reset_index()
    gp_1.columns = full_group + ['subgroup_cnt']

    gp_2 = df.groupby(group)[value].count().reset_index()
    gp_2.columns = [group, 'cnt']

    gp_3 = gp_2.merge(gp_1, on=group, how='left')

    # p = subgroup_cnt / cnt; summing -p * log(p) within each group gives the
    # Shannon entropy of the subgroup distribution
    gp_3['entropy'] = -np.log(gp_3['subgroup_cnt'] / gp_3['cnt']) * gp_3['subgroup_cnt'] / gp_3['cnt']
    gp_3['entropy'].fillna(0, inplace=True)
    gp_4 = gp_3.groupby(group)['entropy'].sum().reset_index()
    gp_4.columns = [group, cname]
    df_feats = df_feats.merge(gp_4, on=group, how='left')
    return df_feats
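
# Hypothetical usage of the helper above (the signature is reconstructed, so
# this call is illustrative only; column names are taken from this file):
#
#   hist_feats = add_group_entropy(hist, hist_feats, group='card_id',
#                                  subgroup='merchant_id', value='purchase_amount',
#                                  cname='hist_transac_a_merchant_entropy')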

with timer('Load data'):
    hist = pd.read_csv('hist_transac_processed.csv')
    print('historical transaction data: {}'.format(hist.shape))
    hist = hist.loc[hist['authorized_flag'] == 1]
    print('historical transaction data (approved): {}'.format(hist.shape))

with timer('Get feature dataframe base'):
    hist_feats = pd.DataFrame(hist.groupby(['card_id']).size()).reset_index()
    hist_feats.columns = ['card_id', 'hist_transac_a_count']

with timer('Basic transaction info features'):
    for c in ['city', 'state', 'merchant_category', 'subsector', 'merchant']:
        hist_feats['hist_transac_a_{}_nunique'.format(c)] = hist.groupby(['card_id'])['{}_id'.format(c)].nunique().values

    hist_feats['hist_transac_a_category_1_1_count'] = hist.groupby(['card_id'])['category_1'].sum().values
    hist_feats['hist_transac_a_category_1_0_count'] = hist_feats['hist_transac_a_count'].values - hist_feats['hist_transac_a_category_1_1_count'].values
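
# ============================================
# Example #7
# ============================================
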
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Load data'):
    new = pd.read_csv('new_merchant_transactions.csv')

with timer('Get reference date (month)'):
    new['purchase_month'] = new['purchase_date'].astype(str).apply(
        lambda x: x[:10])
    new['reference_month'] = pd.to_datetime(
        new['purchase_month']) - new['month_lag'].apply(
            lambda x: np.timedelta64(x, 'M'))
    new['reference_month'] = new['reference_month'].astype(str).apply(
        lambda x: x[:7])
    new.drop(['purchase_month'], axis=1, inplace=True)

with timer('Convert categorical to int for new transactions'):
    cols = ['authorized_flag', 'category_1', 'category_3']
    lbl_encoder = LabelEncoder()
    for c in cols:
        new[c] = lbl_encoder.fit_transform(new[c].astype(str))

with timer('Generate simple / intermediate features'):
    new['month_lag=1'] = (new['month_lag'] == 1).astype(int)
    new['month_lag=2'] = (new['month_lag'] == 2).astype(int)

    new['category_2=1'] = (new['category_2'] == 1.).astype(int)
    new['category_2=2'] = (new['category_2'] == 2.).astype(int)
    new['category_2=3'] = (new['category_2'] == 3.).astype(int)
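
    # These 0/1 indicators are presumably summed per card_id in a later
    # aggregation step, along the lines of (a sketch, not in the source):
    #
    #   new_feats = new.groupby('card_id')[['month_lag=1', 'month_lag=2',
    #                                       'category_2=1', 'category_2=2',
    #                                       'category_2=3']].sum().reset_index()


# ============================================
# Example #8
# ============================================
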
import gc
import pickle
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage
from kaggle_learn.feature_engineering.statistics import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from instacart_utils import *

%matplotlib inline


with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    products = pd.read_csv('products.csv')
    prior = pd.read_csv('prior.csv')
    print(prior.shape)
	

with timer('Reduce memory usage'):
    memory_usage()
    prior = reduce_memory_usage(prior)
    memory_usage()
	

with timer('User features'):
    # number of user orders
    pass  # snippet truncated in the source
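
# ============================================
# Example #9
# ============================================
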
import os
import gc
import pickle
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Read data'):
    bureau = pd.read_csv('bureau.csv')
    bureau_balance = pd.read_csv('bureau_balance.csv')
    print(bureau.shape, bureau_balance.shape)
    
with timer('Join bureau_balance to bureau'):
    gp = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].count().reset_index()
    gp.columns = ['SK_ID_BUREAU', 'MONTHS_BALANCE_COUNT']
    
    # MONTHS_BALANCE_COUNT: each SK_ID_CURR corresponds to multiple SK_ID_BUREAU
    # records, and each SK_ID_BUREAU has a series of MONTHS_BALANCE entries
    bureau = bureau.merge(gp, on=['SK_ID_BUREAU'], how='left')
    del gp; gc.collect()
    
    gp = add_group_value_count(bureau_balance, cols=['SK_ID_BUREAU', 'STATUS'], value='MONTHS_BALANCE', prefix='BUB_')\
                .drop(['MONTHS_BALANCE', 'STATUS'], axis=1)\
                .drop_duplicates(keep='first')
    bureau = bureau.merge(gp, on=['SK_ID_BUREAU'], how='left')
    del gp; gc.collect()
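
# ============================================
# Example #10
# ============================================
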
import os
import gc
import pickle
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Load train and test data'):
    train = pd.read_csv('application_train.csv')
    test = pd.read_csv('application_test.csv')
    ntrain = train.shape[0]
    id_col = 'SK_ID_CURR'
    target = 'TARGET'
    df_full = pd.concat([train, test])
    print(train.shape, test.shape)
    
with timer('Preprocessing'):
    df_full['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    df_full['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True)
    # drop the 4 rows with CODE_GENDER == 'XNA'; all of them are in train
    df_full = df_full.loc[df_full['CODE_GENDER'] != 'XNA']
    ntrain = ntrain - 4
    
with timer('Get categorical features'):
    features_category = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
                         'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
                         'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE',
                         'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']

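# ============================================
# Example #11
# ============================================
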
# ============================================
# Run a Ridge regression to get OOF (out-of-fold) predictions
# ============================================

import os
import gc
import pickle

import pandas as pd

with open('features.pkl', 'rb') as f:
    features = pickle.load(f)

with open('tfidf.pkl', 'rb') as f:
    df_text_processed = pickle.load(f)

with open('df_reduced.pkl', 'rb') as f:
    df_reduced = pickle.load(f)

with timer('Load data'):
    train = pd.read_csv('train.csv.zip', parse_dates=["activation_date"])
    test = pd.read_csv('test.csv.zip', parse_dates=["activation_date"])
    ntrain = train.shape[0]
    ntest = test.shape[0]
    del train, test
    gc.collect()

with timer('Training ridge oof preds'):
    y_train_all = df_reduced['deal_probability'].iloc[:ntrain]

    if os.path.exists('ridge_preds.csv'):
        ridge_preds = pd.read_csv('ridge_preds.csv')
        df_reduced['ridge_preds'] = ridge_preds['ridge_preds'].values
        del ridge_preds
        gc.collect()

# ============================================
# Example #12
# ============================================

import lightgbm as lgb
import pandas as pd


# The head of this importance helper was lost in extraction; the lines down to
# lgb_params are a reconstruction assumed from the body ('objective' is a
# placeholder value):
def get_feature_importance(X_train, y_train):
    X_train_lgb = lgb.Dataset(X_train, label=y_train)
    lgb_params = {
        'objective': 'regression',
        'nthread': 4
    }

    lgb_regressor = lgb.train(params=lgb_params,
                              train_set=X_train_lgb,
                              num_boost_round=1000)
    imp_df = pd.DataFrame()
    imp_df['feature'] = list(X_train.columns)
    imp_df['importance_gain'] = lgb_regressor.feature_importance(
        importance_type='gain')
    imp_df['importance_split'] = lgb_regressor.feature_importance(
        importance_type='split')
    return imp_df


with timer('Load data'):
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    hist_transac_info = pd.read_csv('hist_transac_info.csv')
    hist_transac_amount = pd.read_csv('hist_transac_amount.csv')
    hist_transac_time = pd.read_csv('hist_transac_time.csv')
    hist_transac_info_a = pd.read_csv('hist_transac_info_a.csv')
    hist_transac_amount_a = pd.read_csv('hist_transac_amount_a.csv')
    hist_transac_time_a = pd.read_csv('hist_transac_time_a.csv')
    hist_transac_merchant_lda_comp = pd.read_csv(
        'hist_transac_merchant_category_lda_comp_0.csv')
    hist_transac_merchant_lda_comp_2 = pd.read_csv(
        'hist_transac_merchant_category_lda_comp_2.csv')
    hist_transac_merchantid_lda_comp = pd.read_csv(
        'hist_transac_merchant_id_lda_comp_0_1.csv')
    print(hist_transac_info.shape, hist_transac_time.shape)

# ============================================
# Example #13
# ============================================

import pickle

from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import train_test_split

with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

with open('tfidf.pkl', 'rb') as f:
    df_text_processed = pickle.load(f)

with open('features.pkl', 'rb') as f:
    features = pickle.load(f)

with open('categorical_features.pkl', 'rb') as f:
    categorical_features = pickle.load(f)

with open('df_reduced.pkl', 'rb') as f:
    df_reduced = pickle.load(f)

with timer('Prepare for lightgbm'):

    # convert the dense numeric block to CSR, then stack with the sparse text matrix
    X_train_all_num = csr_matrix(df_reduced[features].iloc[:ntrain].values)
    X_train_all = hstack([X_train_all_num, df_text_processed[:ntrain]])
    del X_train_all_num

    X_train, X_val, y_train, y_val = train_test_split(X_train_all,
                                                      y_train_all,
                                                      test_size=.1,
                                                      random_state=42)

    X_test = hstack([
        df_reduced[features].iloc[ntrain:].values, df_text_processed[ntrain:]
    ])

    all_features = features + vocab
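
# ============================================
# Example #14
# ============================================
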
import numpy as np
import pandas as pd
from kaggle_learn.utils import timer

with timer('Load data'):
    hist = pd.read_csv('hist_transac_processed.csv', usecols=['card_id', 'merchant_id'])
    print('historical transaction data: {}'.format(hist.shape))

with timer('Merchant customer count'):
    hist_merchant = hist.groupby(['merchant_id']).size().reset_index()
    hist_merchant.columns = ['merchant_id', 'merchant_customer_count']
    print(hist_merchant.shape)

with timer('Merchant repurchase customer count'):
    hist_merchant_card = hist.groupby(['merchant_id', 'card_id']).size().reset_index()
    hist_merchant_card.columns = ['merchant_id', 'card_id', 'customer_visit_count']
    print(hist_merchant_card.shape)
    hist_merchant_card = hist_merchant_card.loc[hist_merchant_card['customer_visit_count'] > 1]
    print(hist_merchant_card.shape)

    # binary count
    hist_merchant_repurchase_binary = hist_merchant_card.groupby(['merchant_id']).size().reset_index()
    hist_merchant_repurchase_binary.columns = ['merchant_id', 'revisited_customers']
    hist_merchant_repurchase_binary['revisited_customers'].fillna(0.0, inplace=True)
    print(hist_merchant_repurchase_binary.shape)
    print(hist_merchant_repurchase_binary.head())

    # exact count
    hist_merchant_repurchase_exact = hist_merchant_card.groupby(['merchant_id'])['customer_visit_count'].sum().reset_index()
    hist_merchant_repurchase_exact.columns = ['merchant_id', 'revisited_count']
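
    # A natural next step (a sketch, assumed rather than taken from the source):
    # merge both counts onto hist_merchant and derive a revisit rate per merchant.
    hist_merchant = hist_merchant.merge(hist_merchant_repurchase_binary, on='merchant_id', how='left')
    hist_merchant = hist_merchant.merge(hist_merchant_repurchase_exact, on='merchant_id', how='left')
    hist_merchant['merchant_revisit_rate'] = hist_merchant['revisited_customers'].fillna(0) / hist_merchant['merchant_customer_count']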

# ============================================
# Example #15
# ============================================

import gc
import pickle
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage
from kaggle_learn.feature_engineering.statistics import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from instacart_utils import *

%matplotlib inline


with timer('Load data'):
    product_features = pd.read_csv('product_features.csv')
    product_w2v_features = pd.read_csv('product_w2v_features.csv')
    user_features = pd.read_csv('user_features.csv')
    user_product_features = pd.read_csv('user_x_product_features.csv')
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    print(train.shape, test.shape)
	
	
with timer('Reduce memory usage'):
    memory_usage()
    product_features = reduce_memory_usage(product_features)
    product_w2v_features = reduce_memory_usage(product_w2v_features)
    user_features = reduce_memory_usage(user_features)
    user_product_features = reduce_memory_usage(user_product_features)

# ============================================
# Example #16
# ============================================

import numpy as np
import pandas as pd
from kaggle_learn.utils import timer

with timer('Load data'):
    new = pd.read_csv('new_transac_processed.csv')
    print('new transaction data: {}'.format(new.shape))

with timer('Get feature dataframe base'):
    new_feats = pd.DataFrame(new.groupby(['card_id']).size()).reset_index()
    new_feats.columns = ['card_id', 'new_transac_count']

with timer('Transform purchase amount'):
    new['purchase_amount'] = np.round(
        new['purchase_amount'] / 0.00150265118 + 497.06, 2)

with timer('Transaction amount features'):
    for m in ['sum', 'mean', 'max', 'min', 'median', 'std', 'skew']:
        new_feats['new_transac_amount_{}'.format(m)] = new.groupby(
            ['card_id'])['purchase_amount'].agg([m]).values

    new_feats['new_transac_amount_diff'] = new_feats[
        'new_transac_amount_max'].values - new_feats[
            'new_transac_amount_min'].values

with timer('Transaction (time related) features'):
    new_monthsum_amount = new.groupby(
        ['card_id',
         'month_lag'])['purchase_amount'].sum().unstack().reset_index()
    # truncated in the source; completing with the month_lag == 1 column
    # (column index 1 after reset_index) is an assumption
    new_feats['new_transac_monthlag_last_1_amount'] = new_monthsum_amount.iloc[:, 1].values

# ============================================
# Example #17
# ============================================

import gc
import pickle

import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from kaggle_learn.utils import timer

with open("df_full.pkl", "rb") as f:
    df_full = pickle.load(f)

with open("features_category.pkl", "wb") as f:
    features_category = pickle.load(f)

with open("features.pkl", "wb") as f:
    features = pickle.load(f)

ntrain = 307507  # 307511 rows in application_train minus the 4 'XNA' gender rows dropped earlier

with timer('Prepare sparse matrices for XGBoost'):
    features_numeric = list(set(features) - set(features_category))
    oh_encoder = OneHotEncoder()
    sparse_df = oh_encoder.fit_transform(df_full[features_category].values)
    sparse_df = csr_matrix(
        hstack((sparse_df, df_full[features_numeric].replace([np.inf, -np.inf],
                                                             np.nan))))
    sparse_train_all = sparse_df[:ntrain]
    sparse_test = sparse_df[ntrain:]
    del sparse_df
    gc.collect()

with timer('Prepare / Train for XGBoost'):
    print('df_full shape = {}'.format(df_full.shape))

    fold = KFold(n_splits=5, shuffle=True, random_state=42)
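
# ============================================
# Example #18
# ============================================
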
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from kaggle_learn.utils import timer, reduce_memory_usage
from kaggle_learn.feature_engineering.statistics import *

with timer('Load data'):
    hist = pd.read_csv('historical_transactions.csv')

with timer('Reduce memory usage of historical transaction'):
    hist = reduce_memory_usage(hist)

with timer('Get reference date (month)'):
    # anchor each purchase to day 28 of its month, then offset by month_lag
    # to recover the reference month
    hist['purchase_month'] = hist['purchase_date'].astype(str).apply(
        lambda x: x[:7] + '-28')
    hist['reference_month'] = pd.to_datetime(
        hist['purchase_month']) - hist['month_lag'].apply(
            lambda x: np.timedelta64(x, 'M'))
    hist['reference_month'] = hist['reference_month'].astype(str).apply(
        lambda x: x[:7])
    hist.drop(['purchase_month'], axis=1, inplace=True)

with timer('Convert categorical to int for historical transactions'):
    cols = ['authorized_flag', 'category_1', 'category_3']
    lbl_encoder = LabelEncoder()
    for c in cols:
        hist[c] = lbl_encoder.fit_transform(hist[c].astype(str))

with timer('Generate simple / intermediate features'):
    hist['month_lag=0'] = (hist['month_lag'] == 0).astype(int)
    hist['month_lag=-1'] = (hist['month_lag'] == -1).astype(int)
    hist['month_lag=-2'] = (hist['month_lag'] == -2).astype(int)

# ============================================
# Example #19
# ============================================

import os
import gc
import pickle
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Read data'):
    credit_card_balance = pd.read_csv('credit_card_balance.csv')
    print(credit_card_balance.shape)

with timer('Simple features'):
    credit_card_balance['CB_BALANCE_LIMIT_RATIO'] = credit_card_balance['AMT_BALANCE'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
    credit_card_balance['CB_ATM_DRAWING_BALANCE_RATIO'] = credit_card_balance['AMT_DRAWINGS_ATM_CURRENT'] / credit_card_balance['AMT_BALANCE']
    credit_card_balance['CB_ATM_DRAWING_LIMIT_RATIO'] = credit_card_balance['AMT_DRAWINGS_ATM_CURRENT'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
    credit_card_balance['CB_DRAWING_BALANCE_RATIO'] = credit_card_balance['AMT_DRAWINGS_CURRENT'] / credit_card_balance['AMT_BALANCE']
    credit_card_balance['CB_DRAWING_LIMIT_RATIO'] = credit_card_balance['AMT_DRAWINGS_CURRENT'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
    credit_card_balance['CB_POS_DRAWING_BALANCE_RATIO'] = credit_card_balance['AMT_DRAWINGS_POS_CURRENT'] / credit_card_balance['AMT_BALANCE']
    credit_card_balance['CB_POS_DRAWING_LIMIT_RATIO'] = credit_card_balance['AMT_DRAWINGS_POS_CURRENT'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
    credit_card_balance['CB_DRAWING_ATM_RATIO'] = credit_card_balance['AMT_DRAWINGS_ATM_CURRENT'] / credit_card_balance['AMT_DRAWINGS_CURRENT']
    credit_card_balance['CB_DRAWING_POS_RATIO'] = credit_card_balance['AMT_DRAWINGS_POS_CURRENT'] / credit_card_balance['AMT_DRAWINGS_CURRENT']
    credit_card_balance['CB_MIN_INSTALLMENT_BALANCE_RATIO'] = credit_card_balance['AMT_INST_MIN_REGULARITY'] / credit_card_balance['AMT_BALANCE']
    credit_card_balance['CB_MIN_INSTALLMENT_DRAWING_RATIO'] = credit_card_balance['AMT_INST_MIN_REGULARITY'] / credit_card_balance['AMT_DRAWINGS_CURRENT']
    credit_card_balance['CB_PAYMENT_BALANCE_RATIO'] = credit_card_balance['AMT_PAYMENT_TOTAL_CURRENT'] / credit_card_balance['AMT_BALANCE']
    credit_card_balance['CB_PAYMENT_LIMIT_RATIO'] = credit_card_balance['AMT_PAYMENT_TOTAL_CURRENT'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
    credit_card_balance['CB_RECEIVABLE_PRINCIPAL_TOTAL_RATIO'] = credit_card_balance['AMT_RECEIVABLE_PRINCIPAL'] / credit_card_balance['AMT_TOTAL_RECEIVABLE']
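
    # The ratios above produce ±inf wherever a denominator is zero; a cleanup
    # sketch (an assumption, the original may handle this at model time):
    ratio_cols = [c for c in credit_card_balance.columns if c.startswith('CB_')]
    credit_card_balance[ratio_cols] = credit_card_balance[ratio_cols].replace([np.inf, -np.inf], np.nan)


# ============================================
# Example #20
# ============================================
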
import os
import gc
import pickle
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Read data'):
    pos_cash_balance = pd.read_csv('POS_CASH_balance.csv')
    print(pos_cash_balance.shape)

with timer('Last month data'):
    pos_cash_balance_last_month = pos_cash_balance.sort_values(
        'MONTHS_BALANCE').groupby(['SK_ID_PREV', 'SK_ID_CURR']).tail(1)
    print(pos_cash_balance_last_month.shape)

    pos_cash_balance_last_month = add_group_min(pos_cash_balance_last_month,
                                                cols=['SK_ID_CURR'],
                                                cname='PC_EARLIEST_MONTH',
                                                value='MONTHS_BALANCE')
    pos_cash_balance_last_month = add_group_max(pos_cash_balance_last_month,
                                                cols=['SK_ID_CURR'],
                                                cname='PC_LATEST_MONTH',
                                                value='MONTHS_BALANCE')
    pos_cash_balance_last_month = add_group_sum(
        pos_cash_balance_last_month,
        cols=['SK_ID_CURR'],
        # cname/value were truncated in the source; these arguments are hypothetical
        cname='PC_SK_DPD_SUM',
        value='SK_DPD')
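
# ============================================
# Example #21
# ============================================
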
import gc
import pickle
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from kaggle_learn.utils import timer, reduce_memory_usage, memory_usage
from kaggle_learn.feature_engineering.statistics import *
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from instacart_utils import *

%matplotlib inline


with timer('Load data'):
    aisles = pd.read_csv('aisles.csv')
    departments = pd.read_csv('departments.csv')
    products = pd.read_csv('products.csv')
    prior = pd.read_csv('prior.csv')
    print(prior.shape)
	

with timer('Reduce memory usage'):
    memory_usage()
    prior = reduce_memory_usage(prior)
    memory_usage()
	

with timer('Product features'):
    pass  # snippet truncated in the source
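
# ============================================
# Example #22
# ============================================
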
import numpy as np
import pandas as pd
from kaggle_learn.utils import timer

with timer('Load data'):
    hist = pd.read_csv('hist_transac_processed.csv')
    print('historical transaction data: {}'.format(hist.shape))

with timer('Get feature dataframe base'):
    hist_feats = pd.DataFrame(hist.groupby(['card_id']).size()).reset_index()
    hist_feats.columns = ['card_id', 'hist_transac_count']

with timer('Transform purchase amount'):
    hist['purchase_amount'] = np.round(
        hist['purchase_amount'] / 0.00150265118 + 497.06, 2)

with timer('Transaction time features'):
    for m in ['nunique', 'mean', 'std', 'min', 'skew']:
        hist_feats['hist_transac_monthlag_{}'.format(m)] = hist.groupby(
            ['card_id'])['month_lag'].agg([m]).values

    hist_feats['hist_purchase_date_last'] = hist.groupby(
        ['card_id'])['purchase_date'].max().values
    hist_feats['hist_purchase_date_first'] = hist.groupby(
        ['card_id'])['purchase_date'].min().values
    hist_feats['hist_purchase_date_diff_day'] = (
        pd.to_datetime(hist_feats['hist_purchase_date_last']) -
        pd.to_datetime(hist_feats['hist_purchase_date_first'])).dt.days.values
    hist_feats['hist_purchase_count_ratio'] = hist_feats[
        'hist_transac_count'].values / (
            1. + hist_feats['hist_purchase_date_diff_day'].values)

# ============================================
# Example #23
# ============================================

import os
import gc
import pickle
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from kaggle_learn.utils import timer
from kaggle_learn.feature_engineering.statistics import *

with timer('Read data'):
    installments_payments = pd.read_csv('installments_payments.csv')
    print(installments_payments.shape)
    
with timer('Find last instalment'):
    installments_payments = add_group_max(installments_payments, cols=['SK_ID_CURR'], cname='LAST_INSTALMENT', value='DAYS_INSTALMENT')
    installments_payments['IS_LAST_INSTALMENT'] = (installments_payments['DAYS_INSTALMENT'] == installments_payments['LAST_INSTALMENT'])
    installments_payments.loc[installments_payments['IS_LAST_INSTALMENT']][['SK_ID_CURR', 'SK_ID_PREV']].drop_duplicates(keep='last').to_csv('last_instalment_id.csv', index=False)
    
with timer('Extract last instalment features'):
    # .copy() avoids SettingWithCopyWarning on the in-place drop below
    installments_payments_last = installments_payments.loc[installments_payments['IS_LAST_INSTALMENT']].copy()
    installments_payments_last.drop(['SK_ID_PREV', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER', 'LAST_INSTALMENT', 'IS_LAST_INSTALMENT'], axis=1, inplace=True)
    installments_payments_last.columns = ['SK_ID_CURR'] + ['IP_LAST_'+c for c in installments_payments_last.columns.tolist()[1:]]
    print('installments_payments_last shape = {}'.format(installments_payments_last.shape))
    installments_payments_last = installments_payments_last.groupby(['SK_ID_CURR']).mean().reset_index()
    print('installments_payments_last shape = {}'.format(installments_payments_last.shape))

with timer('Simple features'):
    # days past due of the last instalment: payment entry date minus due date
    installments_payments_last['IP_LAST_DPD'] = installments_payments_last['IP_LAST_DAYS_ENTRY_PAYMENT'] - installments_payments_last['IP_LAST_DAYS_INSTALMENT']
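
# ============================================
# Example #24
# ============================================
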
import argparse

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from kaggle_learn.utils import timer

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('categorical_feature', help='categorical feature name')
    parser.add_argument('ngram', help='max ngram')
    parser.add_argument('max_features', help='maximum number of features')
    parser.add_argument('n_components', help='number of components')
    args = parser.parse_args()

    feat = str(args.categorical_feature)
    ngram = int(args.ngram)
    max_features = int(args.max_features)
    n_components = int(args.n_components)

    with timer('Load data'):
        hist = pd.read_csv('hist_transac_processed.csv')
        print(hist.shape)

    with timer('Convert {} to sequence'.format(feat)):
        hist[feat] = hist[feat].astype(str)
        hist_feat_seq = hist.sort_values('purchase_date').groupby(
            'card_id')[feat].apply(list)
        hist_feat_seq = hist_feat_seq.reset_index()
        hist_feat_seq.columns = ['card_id', 'hist_{}_seq'.format(feat)]
        hist_feat_seq['hist_{}_seq'.format(feat)] = hist_feat_seq[
            'hist_{}_seq'.format(feat)].apply(lambda x: ' '.join(x))

    with timer('Vectorizing {} sequence'.format(feat)):
        # the call was truncated in the source; closing it with the parsed
        # max_features is an assumption
        vectorizer = CountVectorizer(token_pattern=r'\w+',
                                     ngram_range=(1, ngram),
                                     max_features=max_features)
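
        # Continuation sketch (assumed, not from the source): the parsed
        # n_components argument and the *_lda_comp_*.csv files loaded in other
        # snippets suggest the counts are then reduced with LDA per card_id.
        from sklearn.decomposition import LatentDirichletAllocation
        seq_counts = vectorizer.fit_transform(hist_feat_seq['hist_{}_seq'.format(feat)])
        lda = LatentDirichletAllocation(n_components=n_components, random_state=42)
        seq_topics = lda.fit_transform(seq_counts)


# ============================================
# Example #25
# ============================================
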
import numpy as np
import pandas as pd
from kaggle_learn.utils import timer


# The head of this helper was lost in extraction; reconstructed to match the
# same entropy helper earlier in this file:
def add_group_entropy(df, df_feats, group, subgroup, value, cname):
    full_group = [group, subgroup]
    gp_1 = df.groupby(full_group)[value].count().reset_index()
    gp_1.columns = full_group + ['subgroup_cnt']

    gp_2 = df.groupby(group)[value].count().reset_index()
    gp_2.columns = [group, 'cnt']

    gp_3 = gp_2.merge(gp_1, on=group, how='left')

    gp_3['entropy'] = -np.log(gp_3['subgroup_cnt'] /
                              gp_3['cnt']) * gp_3['subgroup_cnt'] / gp_3['cnt']
    gp_3['entropy'].fillna(0, inplace=True)
    gp_4 = gp_3.groupby(group)['entropy'].sum().reset_index()
    gp_4.columns = [group, cname]
    df_feats = df_feats.merge(gp_4, on=group, how='left')
    return df_feats


with timer('Load data'):
    new = pd.read_csv('new_transac_processed.csv')
    print('new transaction data: {}'.format(new.shape))

with timer('Get feature dataframe base'):
    new_feats = pd.DataFrame(new.groupby(['card_id']).size()).reset_index()
    new_feats.columns = ['card_id', 'new_transac_count']

with timer('Basic new transaction features'):
    for c in ['city', 'state', 'merchant_category', 'subsector', 'merchant']:
        new_feats['new_transac_{}_nunique'.format(c)] = new.groupby(
            ['card_id'])['{}_id'.format(c)].nunique().values

    new_feats['new_transac_category_1_1_count'] = new.groupby(
        ['card_id'])['category_1'].sum().values
    new_feats['new_transac_category_1_0_count'] = new_feats[
        'new_transac_count'].values - new_feats[
            'new_transac_category_1_1_count'].values

# ============================================
# Example #26
# ============================================

# The imports and function head of this w2v-clustering helper were lost in
# extraction; the lines down to `sentences` are a reconstruction assumed from
# the body (apply_w2v is a project helper defined elsewhere):
from gensim.models import Word2Vec
from sklearn.cluster import KMeans


def add_category_w2v_cluster(df, hist, category, gp_index):
    sentences = [x.split(' ') for x in hist.values]
    n_features = 500
    # gensim < 4 API: 'size' (newer gensim uses 'vector_size')
    w2v = Word2Vec(sentences=sentences, min_count=1, size=n_features)
    w2v_features = apply_w2v(sentences, w2v, n_features)
    cluster_labels = KMeans(n_clusters=20).fit(w2v_features).labels_
    cluster_labels = pd.Series(cluster_labels, name=category+'_cluster',
                              index=gp_index)
    return df[category].map(cluster_labels).fillna(-1).astype(int)


# ============================================
# Feature Extraction
# ============================================

with timer('Load data'):
    train = pd.read_csv('train.csv.zip', parse_dates=["activation_date"])
    test = pd.read_csv('test.csv.zip', parse_dates=["activation_date"])
    ntrain = train.shape[0]
    ntest = test.shape[0]
    train['num_na'] = train.isnull().sum(axis=1)
    test['num_na'] = test.isnull().sum(axis=1)
    df = pd.concat([train, test], axis=0)
    del train, test
    gc.collect()

with timer('Simple feature engineering'):
    df['dow'] = df['activation_date'].dt.weekday
    df['dom'] = df['activation_date'].dt.day
    df['param_1_len'] = df['param_1'].apply(calc_len)

# ============================================
# Example #27
# ============================================

import pickle

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from kaggle_learn.utils import timer

with open("df_full.pkl", "rb") as f:
    df_full = pickle.load(f)

with open("features_category.pkl", "wb") as f:
    features_category = pickle.load(f)

with open("features.pkl", "wb") as f:
    features = pickle.load(f)

ntrain = 307507

with timer('Prepare / Train for LightGBM'):
    print('df_full shape = {}'.format(df_full.shape))
    X_train_all = df_full.iloc[:ntrain][features].replace([np.inf, -np.inf],
                                                          np.nan)
    y_train_all = df_full.iloc[:ntrain][target].replace([np.inf, -np.inf],
                                                        np.nan)
    fold = KFold(n_splits=10, shuffle=True, random_state=42)

    X_test = df_full.iloc[ntrain:][features].replace([np.inf, -np.inf], np.nan)
    sub = pd.read_csv('sample_submission.csv')
    oof_preds_lgb_1 = np.zeros(X_train_all.shape[0])
    sub_preds_lgb_1 = np.zeros(X_test.shape[0])

    for n_fold, (trn_idx,
                 val_idx) in enumerate(fold.split(X_train_all, y_train_all)):
        X_train = X_train_all.iloc[trn_idx]
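
        # The source ends above, mid-fold. A sketch of how such an OOF loop is
        # typically finished (lgb_params is assumed to be defined elsewhere in
        # the original project; this is not the author's exact code):
        y_train = y_train_all.iloc[trn_idx]
        X_val, y_val = X_train_all.iloc[val_idx], y_train_all.iloc[val_idx]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
        oof_preds_lgb_1[val_idx] = clf.predict_proba(X_val)[:, 1]
        sub_preds_lgb_1 += clf.predict_proba(X_test)[:, 1] / fold.n_splits
        print('fold {} AUC = {:.6f}'.format(
            n_fold + 1, roc_auc_score(y_val, oof_preds_lgb_1[val_idx])))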