Example no. 1
def make_features(p):
    if p == 0:
        df = train
        name = 'train'
    else:
        df = test
        name = 'test'

    # get vec
    # elapsed seconds since st_time
    print(name, 'sum', round(time() - st_time, 4))
    question_vec_sum = pd.DataFrame(
        list(df['q_stop'].map(get_vector_with_words_sum))).add_prefix(
            'q_stop_vec_sum_')
    answer_vec_sum = pd.DataFrame(
        list(df['a_stop'].map(get_vector_with_words_sum))).add_prefix(
            'a_stop_vec_sum_')

    print(name, 'mean', round(time() - st_time, 4))
    question_vec_mean = pd.DataFrame(
        list(df['q_stop'].map(get_vector_with_words_mean))).add_prefix(
            'q_stop_vec_mean_')
    answer_vec_mean = pd.DataFrame(
        list(df['a_stop'].map(get_vector_with_words_mean))).add_prefix(
            'a_stop_vec_mean_')

    result = pd.concat(
        [question_vec_sum, answer_vec_sum, question_vec_mean, answer_vec_mean],
        axis=1)

    utils.to_pickles(result, f'../data/202_{name}', utils.SPLIT_SIZE)
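
# A minimal driver sketch (an assumption, not in the original snippet): elsewhere
# in these scripts per-split work is fanned out with multiprocessing.Pool, so
# make_features would typically be launched like
#
#     from multiprocessing import Pool
#     pool = Pool(2)
#     pool.map(make_features, [0, 1])  # 0 -> train, 1 -> test
#     pool.close()
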
def pivot(cat):
    li = []
    pt = pd.pivot_table(prev, index=KEY, columns=cat, values=col_num)
    pt.columns = [f'{PREF}_{c[0]}-{c[1]}_mean' for c in pt.columns]
    li.append(pt)
    pt = pd.pivot_table(prev,
                        index=KEY,
                        columns=cat,
                        values=col_num,
                        aggfunc=np.sum)
    pt.columns = [f'{PREF}_{c[0]}-{c[1]}_sum' for c in pt.columns]
    li.append(pt)
    pt = pd.pivot_table(prev,
                        index=KEY,
                        columns=cat,
                        values=col_num,
                        aggfunc=np.std,
                        fill_value=-1)
    pt.columns = [f'{PREF}_{c[0]}-{c[1]}_std' for c in pt.columns]
    li.append(pt)
    # a single reset_index() moves KEY into a column; a second call would add
    # a spurious 'index' column before the merge below
    base = pd.concat(li, axis=1).reset_index()
    del li, pt
    gc.collect()

    df = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/110_{cat}_train', utils.SPLIT_SIZE)
    gc.collect()

    df = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/110_{cat}_test', utils.SPLIT_SIZE)
    gc.collect()
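
# Hypothetical driver (not in the original): `pivot` is written to be mapped over
# the categorical columns, one output file per column, e.g.
#
#     from multiprocessing import Pool
#     pool = Pool(proc)         # `proc` assumed defined elsewhere
#     pool.map(pivot, col_cat)  # `col_cat`: list of categorical columns (assumed)
#     pool.close()
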
def multi(c1):
    global base
    print(c1)
    df_sum = pd.crosstab(prev[KEY], prev[c1])
    df_sum.columns = [
        f'{PREF}_{c1}_{str(c2).replace(" ", "-")}_sum' for c2 in df_sum.columns
    ]
    df_norm = pd.crosstab(prev[KEY], prev[c1], normalize='index')
    df_norm.columns = [
        f'{PREF}_{c1}_{str(c2).replace(" ", "-")}_norm'
        for c2 in df_norm.columns
    ]
    df = pd.concat([df_sum, df_norm], axis=1)

    col = df.columns.tolist()
    base = pd.concat([base, df], axis=1)
    base[col] = base[col].fillna(-1)
    base.reset_index(inplace=True)

    df = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/110_{c1}_train', utils.SPLIT_SIZE)
    gc.collect()

    df = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/110_{c1}_test', utils.SPLIT_SIZE)
    gc.collect()


def multi(k):
    """
    k = 'app'
    """
    gc.collect()
    print(k)

    df = pd.crosstab(trte[k], trte.hour, normalize='index')
    df = df.add_prefix(f'histHourNorm_{k}_')

    utils.reduce_memory(df)
    col = df.columns.tolist()

    result = pd.merge(trte, df.reset_index(), on=k, how='left')
    gc.collect()

    #    result.iloc[0:utils.TRAIN_SHAPE][col].to_pickle(f'../data/114__{k}_train.p')
    #    result.iloc[utils.TRAIN_SHAPE:][col].to_pickle(f'../data/114__{k}_test.p')
    #    gc.collect()

    utils.to_pickles(
        result.iloc[0:utils.TRAIN_SHAPE][col].reset_index(drop=True),
        '../data/114_train', utils.SPLIT_SIZE)
    gc.collect()
    utils.to_pickles(
        result.iloc[utils.TRAIN_SHAPE:][col].reset_index(drop=True),
        '../data/114_test', utils.SPLIT_SIZE)
Example no. 5
def make_features(p):
    if p == 0:
        df = train
        name = 'train'
    else:
        df = test
        name = 'test'
    init_col = df.columns.tolist()
    print(init_col)

    df['qlenchar'] = df.question_text.apply(len)
    df['qlenword'] = df.question_text.apply(lambda x: len(splittext(x)))
    df['alenchar'] = df.answer_text.apply(len)
    df['alenword'] = df.answer_text.apply(lambda x: len(splittext(x)))

    df['difflenchar'] = df.qlenchar - df.alenchar
    df['difflenword'] = df.qlenword - df.alenword

    df['divlenchar'] = df.qlenchar / df.alenchar
    df['divlenword'] = df.qlenword / df.alenword

    df['idivlenchar'] = df.alenchar / df.qlenchar
    df['idivlenword'] = df.alenword / df.qlenword

    #    df['subreddit_le'] = LabelEncoder().fit_transform(df.subreddit)
    #    df['qid'] = LabelEncoder().fit_transform(df.question_id)
    df = pd.get_dummies(df, columns=['subreddit'])
    init_col.remove('subreddit')

    df['qdt_dow'] = pd.to_datetime(df.question_utc, origin='unix',
                                   unit='s').dt.dayofweek
    df['qdt_hour'] = pd.to_datetime(df.question_utc, origin='unix',
                                    unit='s').dt.hour

    df['adt_dow'] = pd.to_datetime(df.answer_utc, origin='unix',
                                   unit='s').dt.dayofweek
    df['adt_hour'] = pd.to_datetime(df.answer_utc, origin='unix',
                                    unit='s').dt.hour

    #    df['question_score_l1p'] = np.log1p(df.question_score)
    #    df['answer_score_l1p'] = np.log1p(df.answer_score)

    # count fully upper-cased words longer than one character
    df['qboldwords'] = df.question_text.apply(
        lambda s: sum(w.isupper() for w in splittext(s) if len(w) > 1))
    df['aboldwords'] = df.answer_text.apply(
        lambda s: sum(w.isupper() for w in splittext(s) if len(w) > 1))

    df.drop(init_col, axis=1, inplace=True)
    print(name, df.columns.tolist())
    utils.to_pickles(df, f'../data/701_{name}', utils.SPLIT_SIZE)
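
# `splittext` is not defined in this snippet; a minimal stand-in, assuming a
# plain whitespace tokenizer (hypothetical, for illustration only):
def splittext(s):
    # split on any whitespace; the original may use a smarter tokenizer
    return s.split()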
Example no. 6
def multi(p):
    if p == 0:
        # =============================================================================
        # test
        # =============================================================================

        print('loading test_old...')
        test_old = pd.read_csv(
            '../input/test_old.csv.gz',
            dtype=dtypes,
            parse_dates=['click_time']).sort_values(
                utils.sort_keys)  # be sure to sort by these keys

        print('loading test...')
        test = pd.read_csv('../input/test.csv.zip',
                           dtype=dtypes,
                           parse_dates=['click_time']).sort_values(
                               utils.sort_keys).reset_index(drop=True)
        print('finish loading!')

        merge_key = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
        test_old.drop('click_id', axis=1, inplace=True)
        test_old = pd.merge(test_old,
                            test[merge_key + ['click_id']],
                            on=merge_key,
                            how='left')

        utils.to_pickles(test_old, '../data/test_old', utils.SPLIT_SIZE)
        #        utils.to_pickles(test_old.sort_values(utils.sort_keys, ascending=False),
        #                         '../data/test_old_rev',  10)
        utils.to_pickles(test, '../data/test', utils.SPLIT_SIZE)

        del test_old, test
        gc.collect()

    else:
        # =============================================================================
        # train
        # =============================================================================

        print('loading train...')
        train = pd.read_csv(
            '../input/train.csv.zip',
            dtype=dtypes,
            parse_dates=['click_time', 'attributed_time']).sort_values(
                utils.sort_keys)  # be sure to sort by these keys
        print('finish loading!')

        #        print('drop os; 607, 748, 866')
        #        train = train[~train.os.isin([607, 748, 866])].reset_index(drop=True)
        print('train.shape', train.shape)
        utils.to_pickles(train, '../data/train', utils.SPLIT_SIZE)
        utils.to_pickles(train.is_attributed, '../data/is_attributed',
                         utils.SPLIT_SIZE)
        #        utils.to_pickles(train.sort_values(utils.sort_keys, ascending=False),
        #                         '../data/train_rev', 10)

        del train
        gc.collect()
def read_pickle(folder, usecols):
    df = pd.read_pickle(folder + '/000.p')
    col = list(set(usecols) & set(df.columns))
    print(folder, len(col))
    if len(col) > 0:
        df = utils.read_pickles(folder, col)
        utils.to_pickles(df, folder + '_filtered', utils.SPLIT_SIZE)
        del df
        gc.collect()

        folder = folder.replace('_train', '_test')
        df = utils.read_pickles(folder, col)
        utils.to_pickles(df, folder + '_filtered', utils.SPLIT_SIZE)

    else:
        print(f"{folder} doesn't have valid features")
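
# Hypothetical usage sketch (not in the original): filter every train-side
# feature folder against a whitelist of columns, e.g.
#
#     usecols = X_head.columns.tolist()  # `X_head` assumed loaded elsewhere
#     for folder in sorted(glob('../data/*_train')):
#         read_pickle(folder, usecols)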
Example no. 8
def make_features(p):
    if p == 0:
        df = train
        name = 'train'
    else:
        df = test
        name = 'test'

    init_col = df.columns.tolist()

    vec_df = df.apply(get_sim, axis=1)
    df['cosine_sim_stop'] = vec_df.apply(lambda x: x[0])
    df['cosine_sim_mean_stop'] = vec_df.apply(lambda x: x[1])

    df.drop(init_col, axis=1, inplace=True)

    utils.to_pickles(df, f'../data/204_{name}', utils.SPLIT_SIZE)
Example no. 9
def multi(p):
    if p == 0:
        train['day'] = train.click_time.dt.day
        train['hour'] = train.click_time.dt.hour
        train['hour_min'] = train['hour'] + (train.click_time.dt.minute / 60)
        # total_seconds() spans day boundaries; .dt.seconds wraps at 24 hours
        train['timestamp'] = (train.click_time - min_time).dt.total_seconds()

        col = ['day', 'hour', 'hour_min', 'timestamp']
        utils.to_pickles(train[col], '../data/001_train', utils.SPLIT_SIZE)

    elif p == 1:
        test['day'] = test.click_time.dt.day
        test['hour'] = test.click_time.dt.hour
        test['hour_min'] = test['hour'] + (test.click_time.dt.minute / 60)
        test['timestamp'] = (test.click_time - min_time).dt.total_seconds()

        col = ['day', 'hour', 'hour_min', 'timestamp']
        utils.to_pickles(test[col], '../data/001_test', utils.SPLIT_SIZE)
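
# `min_time` is not defined in this snippet; presumably the earliest click_time
# over the data (an assumption), e.g.
#
#     min_time = train.click_time.min()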
Example no. 10
def concat_pred_item(T, dryrun=False):
    if T == -1:
        name = 'test'
    else:
        name = 'trainT-' + str(T)

    df = utils.load_pred_item(name)

    df = pd.merge(df,
                  pd.read_pickle('../feature/{}/f317_user-product.p'.format(name)),
                  on=['user_id', 'product_id'], how='left')

    gc.collect()

    #==============================================================================
    print('output')
    #==============================================================================
    if dryrun:
        return df
    else:
        utils.to_pickles(df, '../feature/{}/all_apdx'.format(name), 20, inplace=True)
Example no. 11
def pivot(cat):
    li = []
    pt = pd.pivot_table(bureau, index=KEY, columns=cat, values=col_num)
    pt.columns = [
        f'{PREF}_{cat}_{c[0]}-{c[1]}_mean'.replace(' ', '-')
        for c in pt.columns
    ]
    li.append(pt)
    pt = pd.pivot_table(bureau,
                        index=KEY,
                        columns=cat,
                        values=col_num,
                        aggfunc=np.sum)
    pt.columns = [
        f'{PREF}_{cat}_{c[0]}-{c[1]}_sum'.replace(' ', '-') for c in pt.columns
    ]
    li.append(pt)
    pt = pd.pivot_table(bureau,
                        index=KEY,
                        columns=cat,
                        values=col_num,
                        aggfunc=np.std,
                        fill_value=-1)
    pt.columns = [
        f'{PREF}_{cat}_{c[0]}-{c[1]}_std'.replace(' ', '-') for c in pt.columns
    ]
    li.append(pt)
    feat = pd.concat(li, axis=1).reset_index()
    del li, pt
    gc.collect()

    df = pd.merge(train, feat, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/tmp_504_{cat}_train', utils.SPLIT_SIZE)
    gc.collect()

    df = pd.merge(test, feat, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/tmp_504_{cat}_test', utils.SPLIT_SIZE)
    gc.collect()
Example no. 12
def make_features(p):
    if p == 0:
        df = train
        name = 'train'
    else:
        df = test
        name = 'test'

    init_col = df.columns.tolist()

    gr = df.groupby('question_id')

    for c in USECOLS:
        print(name, c)
        df[f'{c}_min'] = gr[c].transform(np.min)
        df[f'{c}_max'] = gr[c].transform(np.max)
        df[f'{c}_max-min'] = df[f'{c}_max'] - df[f'{c}_min']
        df[f'{c}_mean'] = gr[c].transform(np.mean)
        df[f'{c}_std'] = gr[c].transform(np.std)
        df[f'{c}_nunique'] = gr[c].transform(nunique)

    df.drop(init_col, axis=1, inplace=True)
    utils.to_pickles(df, f'../data/102_{name}', utils.SPLIT_SIZE)
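
# `nunique` is not defined in this snippet; a minimal stand-in consistent with
# its use as a groupby aggregator (hypothetical):
def nunique(x):
    # number of distinct values in the group
    return len(set(x))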
Example no. 13
del train
gc.collect()

# =============================================================================
# test
# =============================================================================
test = utils.read_pickles('../data/dtest')
#X_head = pd.read_pickle('X_head.p')

for c in categorical_feature:
    filepath = f'../data/dtest_drop_{c}'

    categorical_feature_ = list(set(categorical_feature) - set([c]))

    system(f'rm -rf {filepath}')
    print(f'categorical_feature {categorical_feature_}')
    print(f'writing {filepath}...')
    utils.to_pickles(test.drop(c, axis=1), filepath, utils.SPLIT_SIZE)

    gc.collect()

del test
gc.collect()

#==============================================================================
system('touch SUCCESS_804')

utils.end(__file__)
Example no. 14

# =============================================================================
# concat pt1
# =============================================================================
gc.collect()
pool = Pool(proc)
callback = pool.map(multi, utils.comb[:10])
pool.close()

# train
df = pd.concat(
    [pd.read_pickle(f) for f in sorted(glob('../data/111__*_train.p'))],
    axis=1).reset_index(drop=True)
utils.to_pickles(df, '../data/111-1_train', utils.SPLIT_SIZE)

gc.collect()

# test
df = pd.concat(
    [pd.read_pickle(f) for f in sorted(glob('../data/111__*_test.p'))],
    axis=1).reset_index(drop=True)
utils.to_pickles(df, '../data/111-1_test', utils.SPLIT_SIZE)

os.system('rm -rf ../data/111__*.p')

# =============================================================================
# concat pt2
# =============================================================================
gc.collect()
Example no. 15
# log
#==============================================================================
log = pd.concat([
    pd.read_csv('../input/order_products__prior.csv.gz'),
    pd.read_csv('../input/order_products__train.csv.gz')
],
                ignore_index=True)

log.sort_values(['order_id', 'add_to_cart_order'], inplace=True)
log.reset_index(drop=True, inplace=True)
log = pd.merge(log, goods, on='product_id', how='left')
log = pd.merge(log, orders, on='order_id', how='left')
# orders counted back from the user's most recent order (0 = last order)
log['order_number_rev'] = log.groupby('user_id').order_number.transform(
    np.max) - log.order_number

utils.to_pickles(log, '../input/mk/log', 20)

gc.collect()
#==============================================================================
# order_tbl
#==============================================================================
order_product = log.groupby('order_id').product_name.apply(list).reset_index()
order_tbl = pd.merge(orders, order_product, on='order_id', how='left')

order_tbl.sort_values(['user_id', 'order_number'], inplace=True)
order_tbl.reset_index(drop=True, inplace=True)
order_tbl = pd.merge(order_tbl,
                     log[['order_id', 'order_number_rev']].drop_duplicates(),
                     on='order_id',
                     how='left')
order_tbl.order_number_rev = order_tbl.order_number_rev.fillna(-1).astype(int)
Example no. 16
def main(is_eval=False):
    # load csv
    if is_eval:
        df = pd.read_csv('../input/sales_train_evaluation.csv')
    else:
        df = pd.read_csv('../input/sales_train_validation.csv')

    sub = pd.read_csv('../input/sample_submission.csv')

    # split test data
    sub['is_test1'] = sub['id'].str.contains('_validation')
    sub['is_test2'] = sub['id'].str.contains('_evaluation')

    # .copy() so the in-place drop/rename below does not warn on a view
    test1 = sub[sub['is_test1']].copy()
    test2 = sub[sub['is_test2']].copy()

    del sub
    gc.collect()

    # drop flags
    test1.drop(['is_test1', 'is_test2'], axis=1, inplace=True)
    test2.drop(['is_test1', 'is_test2'], axis=1, inplace=True)

    # change column name
    test1.columns = ['id'] + COLS_TEST1
    test2.columns = ['id'] + COLS_TEST2

    # change id
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation')

    # merge
    if not is_eval:
        df = df.merge(test1, on='id', how='left')

    df = df.merge(test2, on='id', how='left')

    del test1, test2
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # date columns
    cols_date = [c for c in df.columns if 'd_' in c]

    # melt sales data
    print('Melting sales data...')
    id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    df = pd.melt(df, id_vars=id_vars, var_name='d', value_name='demand')

    print('Melted sales train validation has {} rows and {} columns'.format(
        df.shape[0], df.shape[1]))

    # add numeric date
    df['d_numeric'] = df['d'].apply(lambda x: int(x[2:]))

    # drop old data (~2012/12/31)
    print('drop old data...')
    df = df[df['d_numeric'] >= 704]

    # drop christmas data
    print('drop christmas data...')
    df = df[df['d_numeric'] != 331]  # 2011-12-25
    df = df[df['d_numeric'] != 697]  # 2012-12-25
    df = df[df['d_numeric'] != 1062]  # 2013-12-25
    df = df[df['d_numeric'] != 1427]  # 2014-12-25
    df = df[df['d_numeric'] != 1792]  # 2015-12-25

    # add is zero flag
    df['is_zero'] = (df['demand'] == 0).astype(int)

    # save pkl
    to_pickles(df, '../feats/sales', split_size=3)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
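
# Hypothetical entry point (not in the original):
#
#     if __name__ == '__main__':
#         main(is_eval=False)  # validation phase; pass is_eval=True for evaluation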
Example no. 17
st = 0
end = 0
limit = 10
for pt in range(1, 10):
    end += limit
    print(st, end)
    gc.collect()
    pool = Pool(proc)
    callback = pool.map(multi, utils.comb[st:end])
    pool.close()
    st = end

    # train
    df = pd.concat(
        [pd.read_pickle(f) for f in sorted(glob('../data/110__*_train.p'))],
        axis=1).reset_index(drop=True)
    utils.to_pickles(df, '../data/110-{}_train'.format(pt), utils.SPLIT_SIZE)

    del df
    gc.collect()

    # test
    df = pd.concat(
        [pd.read_pickle(f) for f in sorted(glob('../data/110__*_test.p'))],
        axis=1).reset_index(drop=True)
    utils.to_pickles(df, '../data/110-{}_test'.format(pt), utils.SPLIT_SIZE)

    os.system('rm -rf ../data/110__*.p')

    if end >= len(utils.comb):
        break
Example no. 18
sub = utils.read_pickles('../data/test_old', ['click_id'])

load_folders = sorted(glob('../data/*_test/')) + ['../data/test_old/']
args = list(zip(load_folders, range(len(load_folders))))

pool = Pool(14)
pool.map(multi_test, args)
pool.close()

print('concat test')
load_files = sorted(glob('../data/804_tmp*.p'))
X = pd.concat([pd.read_pickle(f) for f in load_files], axis=1)
print('test.shape should be 18790469:', X[X_head.columns].shape)
print('X.isnull().sum().sum():', X.isnull().sum().sum())

utils.to_pickles(X[X_head.columns], '../data/dtest', 10)

del X; gc.collect()


sub = sub[~sub.click_id.isnull()].reset_index(drop=True)
sub.drop_duplicates('click_id', keep='last', inplace=True) # last?
sub['click_id'] = sub['click_id'].map(int)
sub.reset_index(drop=True, inplace=True)

sub.to_pickle('../data/sub.p')
system('rm ../data/804_tmp*.p')

system('touch SUCCESS_804')

#==============================================================================
pool.map(multi_train, args)
pool.close()

print('concat train')
load_files = sorted(glob('../data/805_tmp*.p'))
X = pd.concat([pd.read_pickle(f) for f in load_files], axis=1)
print('X.isnull().sum().sum():', X.isnull().sum().sum())

system('rm ../data/dtrain.mt')
system('rm -rf ../data/dtrain')

y = utils.read_pickles('../data/is_attributed')
lgb.Dataset(
    X, label=y,
    categorical_feature=categorical_feature).save_binary('../data/dtrain.mt')
utils.to_pickles(X, '../data/dtrain', utils.SPLIT_SIZE)

X_head = X.head()
X_head.to_pickle('X_head.p')

del X, y
gc.collect()
system('rm ../data/805_tmp*.p')
"""

X_head = pd.read_pickle('X_head.p')

"""

# =============================================================================
# # test
args = list(zip(load_folders, range(len(load_folders))))

pool = Pool(10)
pool.map(multi_train, args)
pool.close()

print('concat train')
load_files = sorted(glob('../data/803_tmp*.p'))
X = pd.concat([pd.read_pickle(f) for f in load_files], axis=1)
print('X.isnull().sum().sum():', X.isnull().sum().sum())

system('rm ../data/dtrain_429-2.mt')

lgb.Dataset(X.drop('is_attributed', axis=1), label=X.is_attributed,
            categorical_feature=categorical_feature).save_binary('../data/dtrain_429-2.mt')
utils.to_pickles(X, '../data/dtrain_429-2', utils.SPLIT_SIZE)

X_head = X.head().drop('is_attributed', axis=1)
X_head.to_pickle('X_head_429-2.p')

del X; gc.collect()
system('rm ../data/803_tmp*.p')

"""

X_head = pd.read_pickle('X_head_429-2.p')

"""

# =============================================================================
# # test
Example no. 21
def concat_pred_None(T, W, dryrun=False):
    if T==-1:
        name = 'test'
    else:
        name = 'trainT-'+str(T)
    #==============================================================================
    print('load label')
    #==============================================================================
    # NOTE: order_id is label
    print('load t{}'.format(W))
    X_base = pd.read_pickle('../feature/X_base_t{}.p'.format(W))
    
    label = pd.read_pickle('../input/mk/order_None.p').rename(columns={'is_None':'y'})
    order_tag = pd.read_pickle('../feature/{}/label_reordered.p'.format(name)).order_id.unique()
    label = label[label.order_id.isin(order_tag)].reset_index(drop=True)
    
    # 'inner' for removing t-n_order_id == NaN
    if 'train' in name:
        df = pd.merge(X_base[X_base.is_train==1], label[['order_id', 'y']], on='order_id', how='inner')
    elif name == 'test':
        df = X_base[X_base.is_train==0]
    
    if dryrun:
        print('dryrun')
        df = df.sample(9999)
    
    print('{}.shape:{}\n'.format(name, df.shape))
        
    #==============================================================================
    print('user feature')
    #==============================================================================
    
    df = user_feature(df, name)
    
    print('{}.shape:{}\n'.format(name, df.shape))
    
    #==============================================================================
    print('item')
    #==============================================================================
    def compress(df, key):
        """
        Aggregate every non-object, non-id column per `key`
        (min/mean/median/max/std) and drop zero-variance results.

        key: str
        """
        df_ = df.drop_duplicates(key)[[key]].set_index(key)
        dtypes = df.dtypes
        col = dtypes[dtypes!='O'].index
        col = [c for c in col if '_id' not in c]
        gr = df.groupby(key)
        for c in col:
            df_[c+'-min'] = gr[c].min()
            df_[c+'-mean'] = gr[c].mean()
            df_[c+'-median'] = gr[c].median()
            df_[c+'-max'] = gr[c].max()
            df_[c+'-std'] = gr[c].std()
            
        var = df_.var()
        col = var[var==0].index
        df_.drop(col, axis=1, inplace=True)
        gc.collect()
        return df_.reset_index()
    
    order_prod = pd.read_pickle('../feature/{}/label_reordered.p'.format(name))
    order_prod = pd.merge(df[['order_id', 'order_hour_of_day', 'order_dow', 'timezone']], 
                          order_prod[['order_id', 'product_id']], 
                          how='left', on='order_id')
    
    order_prod = item_feature(order_prod, name)
    order_prod.drop(['order_hour_of_day', 'order_dow', 'timezone', 'product_id'], axis=1, inplace=True)
    
    key = 'order_id'
    feature = compress(order_prod, key)
    df = pd.merge(df, feature, on=key, how='left')
    
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)
    ix_end = df.shape[1]
    
    #==============================================================================
    print('user x item')
    #==============================================================================
    # same sequence of per-key feature merges as before, expressed as one loop
    merges = [
        ('order_id', 'f301_order-product.p'),
        ('order_id', 'f301_order-product_n5.p'),
        ('order_id', 'f302_order-product_all.p'),
        ('order_id', 'f303_order-product.p'),
        ('order_id', 'f304-1_order-product.p'),
        ('order_id', 'f304-2_order-product.p'),
        ('order_id', 'f304-3_order-product.p'),
        ('order_id', 'f305_order-product.p'),
        ('user_id', 'f306_user-product.p'),
        ('user_id', 'f306_user-product_n5.p'),
        ('user_id', 'f307_user-product-timezone.p'),
        ('user_id', 'f308_user-product-timezone.p'),
        ('user_id', 'f308_user-product-dow.p'),
        ('user_id', 'f309_user-product.p'),
        ('user_id', 'f309_user-product_n5.p'),
        ('user_id', 'f310_user-product.p'),
        ('user_id', 'f312_user_product.p'),
        ('user_id', 'f312_user_product_n5.p'),
        ('user_id', 'f313_user_aisle.p'),
        ('user_id', 'f313_user_dep.p'),
        ('user_id', 'f314_user-product.p'),
        ('order_id', 'f315-1_order-product.p'),
        ('order_id', 'f315-2_order-product.p'),
        ('order_id', 'f315-3_order-product.p'),
        ('order_id', 'f316_order_product.p'),
    ]
    for key, fname in merges:
        feature = compress(
            pd.read_pickle('../feature/{}/{}'.format(name, fname)), key)
        df = pd.merge(df, feature, on=key, how='left')
    
    gc.collect()
    
    print('{}.shape:{}\n'.format(name, df.shape))
    
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]
    
    #==============================================================================
    print('daytime')
    #==============================================================================
    
    df = daytime_feature(df, name)
    
    print('{}.shape:{}\n'.format(name, df.shape))
    
#    #==============================================================================
#    print('aisle')
#    #==============================================================================
#    order_aisdep = pd.read_pickle('../input/mk/order_aisle-department.p')
#    col = [c for c in order_aisdep.columns if 'department_' in c]
#    order_aisdep.drop(col, axis=1, inplace=1)
#    
#    df = pd.merge(df, order_aisdep.add_prefix('t-1_'), on='t-1_order_id', how='left')
#    df = pd.merge(df, order_aisdep.add_prefix('t-2_'), on='t-2_order_id', how='left')
#    df = pd.merge(df, order_aisdep.add_prefix('t-3_'), on='t-3_order_id', how='left')
#    
#    print('{}.shape:{}\n'.format(name, df.shape))
    
    #==============================================================================
    print('department')
    #==============================================================================
    order_aisdep = pd.read_pickle('../input/mk/order_aisle-department.p')
    col = [c for c in order_aisdep.columns if 'aisle_' in c]
    order_aisdep.drop(col, axis=1, inplace=True)
    
    for t in range(1, W+1):
        df = pd.merge(df, order_aisdep.add_prefix('t-{}_'.format(t)), 
                    on='t-{}_order_id'.format(t), how='left')
    
    print('{}.shape:{}\n'.format(name, df.shape))
    
    #==============================================================================
    print('department cumsum')
    #==============================================================================
    order_aisdep = pd.read_pickle('../input/mk/order_aisle-department_cumsum.p')
    col = [c for c in order_aisdep.columns if 'aisle_' in c]
    order_aisdep.drop(col, axis=1, inplace=True)
    
    df = pd.merge(df, order_aisdep.add_prefix('t-{}_'.format(1)), 
                  on='t-{}_order_id'.format(1), how='left')
    
    print('{}.shape:{}\n'.format(name, df.shape))
    
    #==============================================================================
    print('feature engineering')
    #==============================================================================
    df = pd.get_dummies(df, columns=['timezone'])
    df = pd.get_dummies(df, columns=['order_dow'])
    df = pd.get_dummies(df, columns=['order_hour_of_day'])
    
    df['t-1_product_unq_len_diffByT-2'] = df['t-1_product_unq_len'] - df['t-2_product_unq_len']
    df['t-1_product_unq_len_diffByT-3'] = df['t-1_product_unq_len'] - df['t-3_product_unq_len']
    df['t-2_product_unq_len_diffByT-3'] = df['t-2_product_unq_len'] - df['t-3_product_unq_len']
    
    df['t-1_product_unq_len_ratioByT-2'] = df['t-1_product_unq_len'] / df['t-2_product_unq_len']
    df['t-1_product_unq_len_ratioByT-3'] = df['t-1_product_unq_len'] / df['t-3_product_unq_len']
    df['t-2_product_unq_len_ratioByT-3'] = df['t-2_product_unq_len'] / df['t-3_product_unq_len']
    
    df['T'] = T
    
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)
    
    #==============================================================================
    print('output')
    #==============================================================================
    if dryrun:
        return df
    else:
        utils.to_pickles(df, '../feature/{}/all_None_w{}'.format(name, W), 20, inplace=True)
Example no. 22
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 19 14:18:40 2018

@author: kazuki.onodera
"""

import utils
utils.start(__file__)

y = utils.load_train()[['answer_score']]

utils.to_pickles(y, '../data/label', utils.SPLIT_SIZE)



#==============================================================================
utils.end(__file__)
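
# Note: in the snippet below, `prev`, `base`, `KEY`, `col_num` and `nunique` are
# assumed to be module-level globals defined earlier in the original script
# (not shown here).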
def multi_gr2(k):
    gr2 = prev.groupby([KEY, k])
    gc.collect()
    print(k)
    keyname = 'gby-' + '-'.join([KEY, k])
    # size
    gr1 = gr2.size().groupby(KEY)
    name = f'{PREF}_{keyname}_size'
    base[f'{name}_min'] = gr1.min()
    base[f'{name}_max'] = gr1.max()
    base[f'{name}_max-min'] = base[f'{name}_max'] - base[f'{name}_min']
    base[f'{name}_mean'] = gr1.mean()
    base[f'{name}_std'] = gr1.std()
    base[f'{name}_sum'] = gr1.sum()
    base[f'{name}_nunique'] = gr1.size()
    for v in col_num:

        # min
        gr1 = gr2[v].min().groupby(KEY)
        name = f'{PREF}_{keyname}_{v}_min'
        base[f'{name}_max'] = gr1.max()
        base[f'{name}_mean'] = gr1.mean()
        base[f'{name}_std'] = gr1.std()
        base[f'{name}_sum'] = gr1.sum()
        base[f'{name}_nunique'] = gr1.apply(nunique)

        # max
        gr1 = gr2[v].max().groupby(KEY)
        name = f'{PREF}_{keyname}_{v}_max'
        base[f'{name}_min'] = gr1.min()
        base[f'{name}_mean'] = gr1.mean()
        base[f'{name}_std'] = gr1.std()
        base[f'{name}_sum'] = gr1.sum()
        base[f'{name}_nunique'] = gr1.apply(nunique)

        # mean
        gr1 = gr2[v].mean().groupby(KEY)
        name = f'{PREF}_{keyname}_{v}_mean'
        base[f'{name}_min'] = gr1.min()
        base[f'{name}_max'] = gr1.max()
        base[f'{name}_max-min'] = base[f'{name}_max'] - base[f'{name}_min']
        base[f'{name}_mean'] = gr1.mean()
        base[f'{name}_std'] = gr1.std()
        base[f'{name}_sum'] = gr1.sum()
        base[f'{name}_nunique'] = gr1.apply(nunique)

        # std
        gr1 = gr2[v].std().groupby(KEY)
        name = f'{PREF}_{keyname}_{v}_std'
        base[f'{name}_min'] = gr1.min()
        base[f'{name}_max'] = gr1.max()
        base[f'{name}_max-min'] = base[f'{name}_max'] - base[f'{name}_min']
        base[f'{name}_mean'] = gr1.mean()
        base[f'{name}_std'] = gr1.std()
        base[f'{name}_sum'] = gr1.sum()
        base[f'{name}_nunique'] = gr1.apply(nunique)

        # sum
        gr1 = gr2[v].sum().groupby(KEY)
        name = f'{PREF}_{keyname}_{v}_sum'
        base[f'{name}_min'] = gr1.min()
        base[f'{name}_max'] = gr1.max()
        base[f'{name}_max-min'] = base[f'{name}_max'] - base[f'{name}_min']
        base[f'{name}_mean'] = gr1.mean()
        base[f'{name}_std'] = gr1.std()
        base[f'{name}_nunique'] = gr1.apply(nunique)

    base.reset_index(inplace=True)
    df = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/102_{k}_train', utils.SPLIT_SIZE)

    df = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_pickles(df, f'../data/102_{k}_test', utils.SPLIT_SIZE)
    print(f'finish {k}')
    return
Example no. 24

def remove_stop(s):
    s = [w.lower() for w in s.split()]
    s = [w for w in s if w not in stopwords]
    return s
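
# `stopwords` is assumed to be a set of lowercase stop words defined elsewhere,
# e.g. (hypothetical) via NLTK:
#
#     from nltk.corpus import stopwords as sw
#     stopwords = set(sw.words('english'))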


def make_features(df):
    init_col = df.columns.tolist()

    #
    df['q_stop'] = df['question_text'].map(remove_stop)
    df['a_stop'] = df['answer_text'].map(remove_stop)

    df.drop(init_col, axis=1, inplace=True)


# =============================================================================
# main
# =============================================================================

make_features(train)
make_features(test)

utils.to_pickles(train, '../data/train_stop', utils.SPLIT_SIZE)
utils.to_pickles(test, '../data/test_stop', utils.SPLIT_SIZE)

#==============================================================================
utils.end(__file__)
# =============================================================================
st = 0
end = 0
limit = 10
for pt in range(1, 10):
    end += limit
    print(st, end)
    gc.collect()
    pool = Pool(proc)
    callback = pool.map(multi, comb[st:end])
    pool.close()
    st = end
    
    # train
    df = pd.concat([pd.read_pickle(f) for f in sorted(glob('../data/109__*_train.p'))], axis=1).reset_index(drop=True)
    utils.to_pickles(df, '../data/109-{}_train'.format(pt), 10)
    
    gc.collect()
    
    # test
    df = pd.concat([pd.read_pickle(f) for f in sorted(glob('../data/109__*_test.p'))], axis=1).reset_index(drop=True)
    utils.to_pickles(df, '../data/109-{}_test'.format(pt), 10)
    
    os.system('rm -rf ../data/109__*.p')


#==============================================================================
utils.end(__file__)


Example no. 26
def concat_pred_item(T, dryrun=False):
    if T == -1:
        name = 'test'
    else:
        name = 'trainT-' + str(T)
    #==============================================================================
    print('load label')
    #==============================================================================
    # NOTE: order_id is label
    print('load t3')
    X_base = pd.read_pickle('../feature/X_base_t3.p')

    label = pd.read_pickle('../feature/{}/label_reordered.p'.format(name))

    # 'inner' for removing t-n_order_id == NaN
    if 'train' in name:
        df = pd.merge(X_base[X_base.is_train == 1],
                      label,
                      on='order_id',
                      how='inner')
    elif name == 'test':
        df = pd.merge(X_base[X_base.is_train == 0],
                      label,
                      on='order_id',
                      how='inner')

    if dryrun:
        print('dryrun')
        df = df.sample(9999)

    df = pd.merge(df,
                  pd.read_pickle('../input/mk/goods.p')[[
                      'product_id', 'aisle_id', 'department_id'
                  ]],
                  on='product_id',
                  how='left')

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user feature')
    #==============================================================================

    df = user_feature(df, name)

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('item feature')
    #==============================================================================

    df = item_feature(df, name)

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)
    ix_end = df.shape[1]

    #==============================================================================
    print('user x item')
    #==============================================================================

    df = user_item_feature(df, name)

    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('user x item')
    #==============================================================================
    def compress(df, key):
        """
        key: str
        """
        df_ = df.drop_duplicates(key)[[key]].set_index(key)
        dtypes = df.dtypes
        col = dtypes[dtypes != 'O'].index
        col = [c for c in col if '_id' not in c]
        gr = df.groupby(key)
        for c in col:
            df_[c + '-min'] = gr[c].min()
            df_[c + '-mean'] = gr[c].mean()
            df_[c + '-median'] = gr[c].median()
            df_[c + '-max'] = gr[c].max()
            df_[c + '-std'] = gr[c].std()

        var = df_.var()
        col = var[var == 0].index
        df_.drop(col, axis=1, inplace=True)
        gc.collect()

        return df_.reset_index()

    # same sequence of per-key feature merges as before, expressed as one loop
    merges = [
        ('order_id', 'f301_order-product.p'),
        ('order_id', 'f301_order-product_n5.p'),
        ('order_id', 'f302_order-product_all.p'),
        ('order_id', 'f303_order-product.p'),
        ('order_id', 'f304-1_order-product.p'),
        ('order_id', 'f304-2_order-product.p'),
        ('order_id', 'f304-3_order-product.p'),
        ('order_id', 'f305_order-product.p'),
        ('user_id', 'f306_user-product.p'),
        ('user_id', 'f306_user-product_n5.p'),
        ('user_id', 'f307_user-product-timezone.p'),
        ('user_id', 'f308_user-product-timezone.p'),
        ('user_id', 'f308_user-product-dow.p'),
        ('user_id', 'f309_user-product.p'),
        ('user_id', 'f309_user-product_n5.p'),
        ('user_id', 'f310_user-product.p'),
        ('user_id', 'f312_user_product.p'),
        ('user_id', 'f312_user_product_n5.p'),
        ('user_id', 'f313_user_aisle.p'),
        ('user_id', 'f313_user_dep.p'),
        ('user_id', 'f314_user-product.p'),
        ('order_id', 'f315-1_order-product.p'),
        ('order_id', 'f315-2_order-product.p'),
        ('order_id', 'f315-3_order-product.p'),
        ('order_id', 'f316_order_product.p'),
    ]
    for key, fname in merges:
        feature = compress(
            pd.read_pickle('../feature/{}/{}'.format(name, fname)), key)
        df = pd.merge(df, feature, on=key, how='left')

    gc.collect()

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]

    #==============================================================================
    print('daytime')
    #==============================================================================

    df = daytime_feature(df, name)

    print('{}.shape:{}\n'.format(name, df.shape))

    #    #==============================================================================
    #    print('aisle')
    #    #==============================================================================
    #    order_aisdep = pd.read_pickle('../input/mk/order_aisle-department.p')
    #    col = [c for c in order_aisdep.columns if 'department_' in c]
    #    order_aisdep.drop(col, axis=1, inplace=1)
    #
    #    df = pd.merge(df, order_aisdep.add_prefix('t-1_'), on='t-1_order_id', how='left')
    #    df = pd.merge(df, order_aisdep.add_prefix('t-2_'), on='t-2_order_id', how='left')
    #
    #    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('feature engineering')
    #==============================================================================
    df = pd.get_dummies(df, columns=['timezone'])
    df = pd.get_dummies(df, columns=['order_dow'])
    df = pd.get_dummies(df, columns=['order_hour_of_day'])

    df['days_near_order_cycle'] = (df.days_since_last_order_this_item -
                                   df.item_order_days_mean).abs()
    df['days_last_order-min'] = df.days_since_last_order_this_item - df.useritem_order_days_min
    df['days_last_order-max'] = df.days_since_last_order_this_item - df.useritem_order_days_max

    df['pos_cart_diff'] = (df.item_mean_pos_cart - df.useritem_mean_pos_cart)

    df['t-1_product_unq_len_diffByT-2'] = df['t-1_product_unq_len'] - df[
        't-2_product_unq_len']
    df['t-1_product_unq_len_diffByT-3'] = df['t-1_product_unq_len'] - df[
        't-3_product_unq_len']
    df['t-2_product_unq_len_diffByT-3'] = df['t-2_product_unq_len'] - df[
        't-3_product_unq_len']

    df['t-1_product_unq_len_ratioByT-2'] = df['t-1_product_unq_len'] / df[
        't-2_product_unq_len']
    df['t-1_product_unq_len_ratioByT-3'] = df['t-1_product_unq_len'] / df[
        't-3_product_unq_len']
    df['t-2_product_unq_len_ratioByT-3'] = df['t-2_product_unq_len'] / df[
        't-3_product_unq_len']

    df['T'] = T

    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)

    #==============================================================================
    print('output')
    #==============================================================================
    if dryrun:
        return df
    else:
        utils.to_pickles(df,
                         '../feature/{}/all'.format(name),
                         20,
                         inplace=True)
Example no. 27
goods.to_pickle('../input/mk/goods.p')
gc.collect()
#==============================================================================
# log
#==============================================================================
log = pd.concat([pd.read_csv('../input/order_products__prior.csv.gz'),
                 pd.read_csv('../input/order_products__train.csv.gz')],
                ignore_index=True)

log.sort_values(['order_id', 'add_to_cart_order'], inplace=True)
log.reset_index(drop=True, inplace=True)
log = pd.merge(log, goods, on='product_id', how='left')
log = pd.merge(log, orders, on='order_id', how='left')
log['order_number_rev'] = log.groupby('user_id').order_number.transform(np.max) - log.order_number

utils.to_pickles(log, '../input/mk/log', 20)

gc.collect()
#==============================================================================
# order_tbl
#==============================================================================
order_product = log.groupby('order_id').product_name.apply(list).reset_index()
order_tbl = pd.merge(orders, order_product, on='order_id', how='left')

order_tbl.sort_values(['user_id', 'order_number'], inplace=True)
order_tbl.reset_index(drop=True, inplace=True)
order_tbl = pd.merge(order_tbl, log[['order_id', 'order_number_rev']].drop_duplicates(), on='order_id', how='left')
order_tbl.order_number_rev = order_tbl.order_number_rev.fillna(-1).astype(int)
#order_tbl['order_number_rev'] = order_tbl.groupby('user_id').order_number.transform(np.max) - order_tbl.order_number
order_tbl['days_since_first_order'] = order_tbl.groupby('user_id').days_since_prior_order.cumsum()
Example no. 28
# =============================================================================

folders = ['../feature_bureau', '../feature_bureau_unused']

for fol in folders:
    os.system(f'rm -rf {fol}')
    os.system(f'mkdir {fol}')

train = utils.load_train(['SK_ID_CURR', 'TARGET'])
test = utils.load_test(['SK_ID_CURR'])

bureau = utils.read_pickles('../data/bureau')

bureau_train = pd.merge(bureau, train, on='SK_ID_CURR', how='inner')
bureau_test = pd.merge(bureau, test, on='SK_ID_CURR', how='inner')

utils.to_pickles(bureau_train, '../data/bureau_train', utils.SPLIT_SIZE)
utils.to_pickles(bureau_test, '../data/bureau_test', utils.SPLIT_SIZE)

utils.to_pickles(bureau_train[['TARGET']], '../data/bureau_label',
                 utils.SPLIT_SIZE)
"""

bureau_train = utils.read_pickles('../data/bureau_train')
bureau_test  = utils.read_pickles('../data/bureau_test')

"""

#==============================================================================
utils.end(__file__)

pool = Pool(nthread)
callback = pool.map(multi, utils.comb)
pool.close()

del trte
gc.collect()

# =============================================================================
# concat
# =============================================================================

# train
df = pd.concat(
    [pd.read_pickle(f) for f in sorted(glob('../data/005__*_train.p'))],
    axis=1).reset_index(drop=True)
utils.to_pickles(df, '../data/005_train', 10)
print(df.columns.tolist())

# test
df = pd.concat(
    [pd.read_pickle(f) for f in sorted(glob('../data/005__*_test.p'))],
    axis=1).reset_index(drop=True)
utils.to_pickles(df, '../data/005_test', 10)

os.system('rm -rf ../data/005__*.p')

#==============================================================================
utils.end(__file__)


pool = Pool(nthread)
callback = pool.map(multi, utils.comb)
pool.close()

del trte
gc.collect()

# =============================================================================
# concat
# =============================================================================

# train
df = pd.concat(
    [pd.read_pickle(f) for f in sorted(glob('../data/002__*_train.p'))],
    axis=1).reset_index(drop=True)
utils.to_pickles(df, '../data/002_train', utils.SPLIT_SIZE)

# test
df = pd.concat(
    [pd.read_pickle(f) for f in sorted(glob('../data/002__*_test.p'))],
    axis=1).reset_index(drop=True)
utils.to_pickles(df, '../data/002_test', utils.SPLIT_SIZE)

os.system('rm -rf ../data/002__*.p')

#==============================================================================
utils.end(__file__)
Example no. 31
# gr2
# =============================================================================
pool = Pool(NTHREAD)
callback = pool.map(multi_gr2, col_group)
pool.close()

# =============================================================================
# merge
# =============================================================================
df = pd.concat(
    [pd.read_pickle(f) for f in sorted(glob(f'../data/tmp_202_{PREF}*.p'))],
    axis=1)
base = pd.concat([base, df], axis=1)
base.reset_index(inplace=True)
del df
gc.collect()

train = utils.load_train([KEY])
train = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1)

test = utils.load_test([KEY])
test = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1)

utils.to_pickles(train, '../data/202_train', utils.SPLIT_SIZE)
utils.to_pickles(test, '../data/202_test', utils.SPLIT_SIZE)

os.system('rm ../data/tmp_202_*.p')

#==============================================================================
utils.end(__file__)
Example no. 32
def concat_pred_item(T, dryrun=False):
    if T==-1:
        name = 'test'
    else:
        name = 'trainT-'+str(T)
    #==============================================================================
    print('load label')
    #==============================================================================
    # NOTE: order_id is label
    print('load t3')
    X_base = pd.read_pickle('../feature/X_base_t3.p')
    
    label = pd.read_pickle('../feature/{}/label_reordered.p'.format(name))
    
    # 'inner' for removing t-n_order_id == NaN
    if 'train' in name:
        df = pd.merge(X_base[X_base.is_train==1], label, on='order_id', how='inner')
    elif name == 'test':
        df = pd.merge(X_base[X_base.is_train==0], label, on='order_id', how='inner')
    
    if dryrun:
        print('dryrun')
        df = df.sample(9999)
    
    df = pd.merge(df, pd.read_pickle('../input/mk/goods.p')[['product_id', 'aisle_id', 'department_id']], 
                  on='product_id', how='left')
    
    print('{}.shape:{}\n'.format(name, df.shape))
        
    #==============================================================================
    print('user feature')
    #==============================================================================
    
    df = user_feature(df, name)
    
    print('{}.shape:{}\n'.format(name, df.shape))
    
    #==============================================================================
    print('item feature')
    #==============================================================================
    
    df = item_feature(df, name)
    
    print('{}.shape:{}\n'.format(name, df.shape))
    
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df)
    ix_end = df.shape[1]
    
    #==============================================================================
    print('user x item')
    #==============================================================================
    
    df = user_item_feature(df, name)
    
    print('{}.shape:{}\n'.format(name, df.shape))
    
    #==============================================================================
    print('user x item')
    #==============================================================================
    def compress(df, key):
        """
        key: str
        """
        df_ = df.drop_duplicates(key)[[key]].set_index(key)
        dtypes = df.dtypes
        col = dtypes[dtypes!='O'].index
        col = [c for c in col if '_id' not in c]
        gr = df.groupby(key)
        for c in col:
            df_[c+'-min'] = gr[c].min()
            df_[c+'-mean'] = gr[c].mean()
            df_[c+'-median'] = gr[c].median()
            df_[c+'-max'] = gr[c].max()
            df_[c+'-std'] = gr[c].std()
            
        var = df_.var()
        col = var[var==0].index
        df_.drop(col, axis=1, inplace=True)
        gc.collect()
        
        return df_.reset_index()
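    # compress() collapses a long (key, numeric-features) frame to one row per
    # key with min/mean/median/max/std per numeric column, then drops
    # zero-variance results. A hypothetical toy run (not from the source):
    #     toy = pd.DataFrame({'order_id': [1, 1, 2], 'x': [0., 2., 5.]})
    #     compress(toy, 'order_id').columns.tolist()
    #     # -> ['order_id', 'x-min', 'x-mean', 'x-median', 'x-max', 'x-std']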
    
    # each entry: (merge key, feature file stem under ../feature/{name}/)
    feature_files = [
        ('order_id', 'f301_order-product'),
        ('order_id', 'f301_order-product_n5'),
        ('order_id', 'f302_order-product_all'),
        ('order_id', 'f303_order-product'),
        ('order_id', 'f304-1_order-product'),
        ('order_id', 'f304-2_order-product'),
        ('order_id', 'f304-3_order-product'),
        ('order_id', 'f305_order-product'),
        ('user_id', 'f306_user-product'),
        ('user_id', 'f306_user-product_n5'),
        ('user_id', 'f307_user-product-timezone'),
        ('user_id', 'f308_user-product-timezone'),
        ('user_id', 'f308_user-product-dow'),
        ('user_id', 'f309_user-product'),
        ('user_id', 'f309_user-product_n5'),
        ('user_id', 'f310_user-product'),
        ('user_id', 'f312_user_product'),
        ('user_id', 'f312_user_product_n5'),
        ('user_id', 'f313_user_aisle'),
        ('user_id', 'f313_user_dep'),
        ('user_id', 'f314_user-product'),
        ('order_id', 'f315-1_order-product'),
        ('order_id', 'f315-2_order-product'),
        ('order_id', 'f315-3_order-product'),
        ('order_id', 'f316_order_product'),
    ]
    for key, stem in feature_files:
        feature = compress(pd.read_pickle('../feature/{}/{}.p'.format(name, stem)), key)
        df = pd.merge(df, feature, on=key, how='left')
        gc.collect()
    
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)
    ix_end = df.shape[1]
    
    #==============================================================================
    print('daytime')
    #==============================================================================
    
    df = daytime_feature(df, name)
    
    print('{}.shape:{}\n'.format(name, df.shape))
    
#    #==============================================================================
#    print('aisle')
#    #==============================================================================
#    order_aisdep = pd.read_pickle('../input/mk/order_aisle-department.p')
#    col = [c for c in order_aisdep.columns if 'department_' in c]
#    order_aisdep.drop(col, axis=1, inplace=1)
#    
#    df = pd.merge(df, order_aisdep.add_prefix('t-1_'), on='t-1_order_id', how='left')
#    df = pd.merge(df, order_aisdep.add_prefix('t-2_'), on='t-2_order_id', how='left')
#    
#    print('{}.shape:{}\n'.format(name, df.shape))

    #==============================================================================
    print('feature engineering')
    #==============================================================================
    df = pd.get_dummies(df, columns=['timezone', 'order_dow', 'order_hour_of_day'])
    
    df['days_near_order_cycle'] = (df.days_since_last_order_this_item - df.item_order_days_mean).abs()
    df['days_last_order-min'] = df.days_since_last_order_this_item - df.useritem_order_days_min
    df['days_last_order-max'] = df.days_since_last_order_this_item - df.useritem_order_days_max
    
    df['pos_cart_diff'] = (df.item_mean_pos_cart - df.useritem_mean_pos_cart)
    
    df['t-1_product_unq_len_diffByT-2'] = df['t-1_product_unq_len'] - df['t-2_product_unq_len']
    df['t-1_product_unq_len_diffByT-3'] = df['t-1_product_unq_len'] - df['t-3_product_unq_len']
    df['t-2_product_unq_len_diffByT-3'] = df['t-2_product_unq_len'] - df['t-3_product_unq_len']
    
    df['t-1_product_unq_len_ratioByT-2'] = df['t-1_product_unq_len'] / df['t-2_product_unq_len']
    df['t-1_product_unq_len_ratioByT-3'] = df['t-1_product_unq_len'] / df['t-3_product_unq_len']
    df['t-2_product_unq_len_ratioByT-3'] = df['t-2_product_unq_len'] / df['t-3_product_unq_len']
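    # the ratios above yield inf when the denominator is 0; if that matters
    # downstream, one hypothetical guard (not in the source) is:
    #     df.replace([np.inf, -np.inf], np.nan, inplace=True)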
    
    df['T'] = T
    
    #==============================================================================
    print('reduce memory')
    #==============================================================================
    utils.reduce_memory(df, ix_end)
    
    #==============================================================================
    print('output')
    #==============================================================================
    if dryrun:
        return df
    else:
        utils.to_pickles(df, '../feature/{}/all'.format(name), 20, inplace=True)