Example 1
import pandas as pd

# project helpers (gen_shift_attr, mean_encode, recast_df) are assumed in scope
def prep_model_data(cons):
    """
    
    Prepare Model Data Documentation
    
    Function Overview
    
    This function prepares the aggregated data for modelling.
    This includes creating shift / lagged attributes for shop, item and revenue columns.
    Delta, proportion and mean-encoded attributes are also created.
    
    Defaults
    
    prep_model_data(cons)
    
    Parameters
    
    cons - Python Module, the programme constants for the competition
    
    Returns
    
    0 for successful execution
    
    Example
    
    prep_model_data(cons = cons)
    
    """

    print('loading in base data ...')

    # load in the base data file
    base_agg_comp = pd.read_feather(cons.base_agg_shft_fpath)

    # set function inputs for item count shift attributes
    index_shift = ['date_block_num']
    columns_shift = ['shop_id', 'item_id']
    lags = [1, 2, 3, 4]
    fill_na = 0

    ########################
    #-- Shift Attributes --#
    ########################

    print('Running shift attributes for item cnt ...')

    #-- Lag Item Cnt Shifts --#

    print('item_cnt_day')

    # create shift attributes
    base_agg_comp = gen_shift_attr(dataset=base_agg_comp,
                                   values=['item_cnt_day'],
                                   index=index_shift,
                                   columns=columns_shift,
                                   lags=lags,
                                   fill_value=fill_na)

    #-- Lag Shop Total Shifts --#

    print('shop_id_total_item_cnt_day')

    # create shift attributes
    base_agg_comp = gen_shift_attr(dataset=base_agg_comp,
                                   values=['shop_id_total_item_cnt_day'],
                                   index=index_shift,
                                   columns=columns_shift,
                                   lags=lags,
                                   fill_value=fill_na)

    #-- Lag Item Total Shifts --#

    print('item_id_total_item_cnt_day')

    # create shift attributes
    base_agg_comp = gen_shift_attr(dataset=base_agg_comp,
                                   values=['item_id_total_item_cnt_day'],
                                   index=index_shift,
                                   columns=columns_shift,
                                   lags=lags,
                                   fill_value=fill_na)

    #-- Lag Item Price --#

    print('item_price')

    # create shift attributes
    base_agg_comp = gen_shift_attr(dataset=base_agg_comp,
                                   values=['item_price'],
                                   index=index_shift,
                                   columns=columns_shift,
                                   lags=[1],
                                   fill_value=fill_na)

    #-- Lag Revenue --#

    print('revenue')

    # create shift attributes
    base_agg_comp = gen_shift_attr(dataset=base_agg_comp,
                                   values=['revenue'],
                                   index=index_shift,
                                   columns=columns_shift,
                                   lags=[1],
                                   fill_value=fill_na)

    #-- Lag Category and City Totals --#

    print('item_category_id_total_item_cnt_day')

    # create shift attributes

    base_agg_comp = gen_shift_attr(
        dataset=base_agg_comp,
        values=['item_category_id_total_item_cnt_day'],
        index=index_shift,
        columns=columns_shift,
        lags=[1],
        fill_value=fill_na)

    print('shop_id_item_category_id_total_item_cnt_day')

    # create shift attributes
    base_agg_comp = gen_shift_attr(
        dataset=base_agg_comp,
        values=['shop_id_item_category_id_total_item_cnt_day'],
        index=index_shift,
        columns=columns_shift,
        lags=[1],
        fill_value=fill_na)

    print('city_enc_total_item_cnt_day')

    # create shift attributes
    base_agg_comp = gen_shift_attr(dataset=base_agg_comp,
                                   values=['city_enc_total_item_cnt_day'],
                                   index=index_shift,
                                   columns=columns_shift,
                                   lags=[1],
                                   fill_value=fill_na)

    print('item_id_city_enc_total_item_cnt_day')

    # create shift attributes
    base_agg_comp = gen_shift_attr(
        dataset=base_agg_comp,
        values=['item_id_city_enc_total_item_cnt_day'],
        index=index_shift,
        columns=columns_shift,
        lags=[1],
        fill_value=fill_na)


    print('Removing the first 4 months of data due to lagged attributes ...')

    filt_lag_months = base_agg_comp['date_block_num'] >= 4
    base_agg_comp = base_agg_comp[filt_lag_months]

    print('Create delta attributes ...')

    # TODO: add delta revenue
    base_agg_comp['delta_item_price'] = base_agg_comp[
        'item_price'] - base_agg_comp['item_price_shift_1']
    base_agg_comp['delta_item_cnt_day_1_2'] = base_agg_comp[
        'item_cnt_day_shift_1'] - base_agg_comp['item_cnt_day_shift_2']
    base_agg_comp['delta_item_cnt_day_2_3'] = base_agg_comp[
        'item_cnt_day_shift_2'] - base_agg_comp['item_cnt_day_shift_3']
    base_agg_comp['delta_item_cnt_day_3_4'] = base_agg_comp[
        'item_cnt_day_shift_3'] - base_agg_comp['item_cnt_day_shift_4']

    print('Create proportion attributes ...')

    base_agg_comp[
        'item_cnt_day_shift_1_div_shop_id_total_item_cnt_day_shift_1'] = (
            base_agg_comp['item_cnt_day_shift_1'] /
            base_agg_comp['shop_id_total_item_cnt_day_shift_1']).fillna(0)
    base_agg_comp[
        'item_cnt_day_shift_2_div_shop_id_total_item_cnt_day_shift_2'] = (
            base_agg_comp['item_cnt_day_shift_2'] /
            base_agg_comp['shop_id_total_item_cnt_day_shift_2']).fillna(0)
    base_agg_comp[
        'item_cnt_day_shift_3_div_shop_id_total_item_cnt_day_shift_3'] = (
            base_agg_comp['item_cnt_day_shift_3'] /
            base_agg_comp['shop_id_total_item_cnt_day_shift_3']).fillna(0)
    base_agg_comp[
        'item_cnt_day_shift_4_div_shop_id_total_item_cnt_day_shift_4'] = (
            base_agg_comp['item_cnt_day_shift_4'] /
            base_agg_comp['shop_id_total_item_cnt_day_shift_4']).fillna(0)
    base_agg_comp[
        'item_cnt_day_shift_1_div_item_id_total_item_cnt_day_shift_1'] = (
            base_agg_comp['item_cnt_day_shift_1'] /
            base_agg_comp['item_id_total_item_cnt_day_shift_1']).fillna(0)
    base_agg_comp[
        'item_cnt_day_shift_2_div_item_id_total_item_cnt_day_shift_2'] = (
            base_agg_comp['item_cnt_day_shift_2'] /
            base_agg_comp['item_id_total_item_cnt_day_shift_2']).fillna(0)
    base_agg_comp[
        'item_cnt_day_shift_3_div_item_id_total_item_cnt_day_shift_3'] = (
            base_agg_comp['item_cnt_day_shift_3'] /
            base_agg_comp['item_id_total_item_cnt_day_shift_3']).fillna(0)
    base_agg_comp[
        'item_cnt_day_shift_4_div_item_id_total_item_cnt_day_shift_4'] = (
            base_agg_comp['item_cnt_day_shift_4'] /
            base_agg_comp['item_id_total_item_cnt_day_shift_4']).fillna(0)

    print('Mean encoding data ...')

    base_agg_comp['date_block_num_mean_enc'] = mean_encode(
        dataset=base_agg_comp, attr=['date_block_num'], tar='item_cnt_day')
    base_agg_comp['shop_id_mean_enc'] = mean_encode(dataset=base_agg_comp,
                                                    attr=['shop_id'],
                                                    tar='item_cnt_day')
    base_agg_comp['item_id_mean_enc'] = mean_encode(dataset=base_agg_comp,
                                                    attr=['item_id'],
                                                    tar='item_cnt_day')
    base_agg_comp['shop_item_id_mean_enc'] = mean_encode(dataset=base_agg_comp,
                                                         attr=['shop_item_id'],
                                                         tar='item_cnt_day')
    base_agg_comp['item_category_id_mean_enc'] = mean_encode(
        dataset=base_agg_comp, attr=['item_category_id'], tar='item_cnt_day')
    base_agg_comp['item_cat_mean_enc'] = mean_encode(dataset=base_agg_comp,
                                                     attr=['item_cat'],
                                                     tar='item_cnt_day')
    base_agg_comp['item_cat_sub_mean_enc'] = mean_encode(dataset=base_agg_comp,
                                                         attr=['item_cat_sub'],
                                                         tar='item_cnt_day')
    base_agg_comp['city_mean_enc'] = mean_encode(dataset=base_agg_comp,
                                                 attr=['city'],
                                                 tar='item_cnt_day')
    base_agg_comp['year_mean_enc'] = mean_encode(dataset=base_agg_comp,
                                                 attr=['year'],
                                                 tar='item_cnt_day')
    base_agg_comp['month_mean_enc'] = mean_encode(dataset=base_agg_comp,
                                                  attr=['month'],
                                                  tar='item_cnt_day')
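
    # NOTE (illustrative assumption, not the project's helper): mean_encode
    # presumably maps each level of `attr` to the mean of the target, e.g.
    #   def mean_encode(dataset, attr, tar):
    #       return dataset.groupby(attr)[tar].transform('mean')
    # beware that a naive target mean encoding like this leaks the target
    # unless it is regularised or computed out-of-fold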
    """
    print('Remove all items with no historic sell price from training set ...')
    
    # load in pickled holdout item shop id combination
    #holdout_shop_item_id_comb = pk.load(open(cons.holdout_shop_item_id_comb, 'rb'))
    base_agg_comp['shop_item_id'].nunique()
    
    # create filters for item price and holdout shop item combination
    price_tab = pd.pivot_table(data = base_agg_comp,
                               index = 'date_block_num',
                               columns = ['shop_id', 'item_id'],
                               values = 'item_price'
                               )
    all_zero_price = (price_tab == 0).all()
    keep_shop_item_comb = all_zero_price[all_zero_price].reset_index().drop(columns = 0)
    keep_shop_item_comb_series = keep_shop_item_comb['shop_id'].astype(str) + '_' + keep_shop_item_comb['item_id'].astype(str)
    
    filt_shop_item_id = base_agg_comp['shop_item_id'].isin(keep_shop_item_comb_series)
    filt_default_price = base_agg_comp['item_price'] == 0
    
    base_agg_comp = base_agg_comp[~filt_default_price | filt_shop_item_id]
    """

    print('Subsetting required columns ...')

    # set columns to drop
    data_cols = base_agg_comp.columns
    drop_cols = [
        'shop_item_id',
        'item_cat',
        'item_cat_sub',
        'city',
        'revenue',
        # dodgy attributes:
        'delta_item_price',
        'shop_id_item_id_months_last_rec',
        'item_price',
        'n_price_changes',
        'shop_id_item_id_months_first_rec'
    ]
    sub_cols = data_cols.drop(drop_cols)
    model_data = base_agg_comp[sub_cols]
    model_data = model_data.reset_index(drop=True)
    """
    print('Normalise data ...')
    
    ignore_cols = clean_cons.norm_ign_cols
    norm_cols = [col for col in model_data.columns if col not in ignore_cols]
    scaler = StandardScaler()
    scaler.fit(model_data[norm_cols])
    model_data[norm_cols] = scaler.transform(X = model_data[norm_cols])
    """

    print('Recasting data ...')

    model_data = recast_df(dataset=model_data)

    shape = model_data.shape

    print('outputting model data {} ...'.format(shape))

    # output the model data
    model_data.to_feather(cons.model_data_fpath)

    return 0
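
For reference, gen_shift_attr is a project helper that is not shown in these examples. Below is a minimal sketch of how such a lag feature could be built with a pandas pivot table; the signature mirrors the calls above, but the internals and column naming are assumptions, not the project's actual code.

import pandas as pd

def gen_shift_attr_sketch(dataset, values, index, columns, lags, fill_value):
    """Illustrative only: add lagged copies of `values` at the index x columns grain."""
    for value in values:
        # wide table: one row per date_block_num, one column per shop / item pair
        pivot = pd.pivot_table(data=dataset, values=value, index=index,
                               columns=columns, fill_value=fill_value,
                               aggfunc='sum')
        for lag in lags:
            shifted = (pivot.shift(lag)              # move each month's value forward by `lag`
                            .fillna(fill_value)      # first `lag` months have no history
                            .stack(columns)          # back to long format
                            .rename('{}_shift_{}'.format(value, lag))
                            .reset_index())
            dataset = dataset.merge(shifted, on=index + columns, how='left')
    return dataset

Under these assumptions, a call like gen_shift_attr_sketch(df, ['item_cnt_day'], ['date_block_num'], ['shop_id', 'item_id'], [1, 2], 0) would append item_cnt_day_shift_1 and item_cnt_day_shift_2 columns, consistent with the names consumed by the delta attributes above.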
Example 2
import pandas as pd
from sklearn import preprocessing

# project helpers (load_files, recast_df) are assumed in scope
def prep_raw_data(cons):
    """
    
    Prepare Raw Data Documentation
    
    Function Overview
    
    This function prepares and cleans each individual raw dataset.
    The prepped / cleaned raw dataset is output as a feather file for the next step in the pipeline.
    
    Defaults
    
    prep_raw_data(cons)
    
    Parameters
    
    cons - Python Module, programme constants for competition
    
    Returns
    
    0 for successful execution
    
    Example
    
    prep_raw_data(cons = cons)
    
    """
    
    # load in the raw data
    item_categories, items, sales_train, sample_submission, shops, test = load_files('raw', cons)
    
    #-- Sales Data --#
    
    print('Preparing Sales Data ...')
    
    print('Extracting date information ...')
    # prep sales train data
    sales_train['date'] = pd.to_datetime(sales_train['date'], format = '%d.%m.%Y')
    sales_train['day'] = sales_train['date'].dt.day
    sales_train['month'] = sales_train['date'].dt.month
    sales_train['year'] = sales_train['date'].dt.year
    
    print('Capping price and sales count ...')
    sales_train['item_cnt_day'] = sales_train['item_cnt_day'].clip(upper = 1000)
    sales_train['item_price'] = sales_train['item_price'].clip(upper = 100000)
    
    print('Extracting sales and refund information ...')
    # split item_cnt_day into its sale and refund components
    sales_train['n_refund'] = (-sales_train['item_cnt_day']).clip(lower = 0)
    sales_train['n_sale'] = sales_train['item_cnt_day'].clip(lower = 0)
    
    print('Fill negative item price with median ...')    
    neg_item_price = sales_train['item_price'] < 0
    sales_train.loc[neg_item_price, 'item_price'] = sales_train['item_price'].median()
    
    #-- Item Categories --#
    
    print('Preparing Item Data ...')
    
    print('Extracting category information ...')
    # extract the item category and sub-category
    cat_split = item_categories['item_category_name'].str.split(' - ', expand = True)
    item_cat = cat_split[0]
    item_cat_sub = cat_split[1]
        
    # clean item categories
    item_cat = item_cat.str.replace('Payment.*', 'Payment Cards', regex = True)
    item_categories['item_cat'] = item_cat.str.replace('.*[Gg]ames.*', 'Games', regex = True)
        
    # clean item sub-categories
    item_cat_sub = item_cat_sub.fillna('')
    item_cat_sub = item_cat_sub.str.replace('^Audiobooks.*', 'Audiobooks', regex = True)
    item_cat_sub = item_cat_sub.str.replace('^CD.*', 'CD', regex = True)
    item_cat_sub = item_cat_sub.str.replace('^Live.*', 'Live', regex = True)
    item_categories['item_cat_sub'] = item_cat_sub.str.replace('^Teaching.*', 'Teaching', regex = True)
    
    # label encode item cat
    item_cat_label_enc = preprocessing.LabelEncoder()
    item_cat_label_enc.fit(item_categories['item_cat'].unique())
    item_categories['item_cat_id'] = item_cat_label_enc.transform(item_categories['item_cat'])
    
    # label encode item cat sub
    item_cat_sub_label_enc = preprocessing.LabelEncoder()
    item_cat_sub_label_enc.fit(item_categories['item_cat_sub'].unique())
    item_categories['item_cat_sub_id'] = item_cat_sub_label_enc.transform(item_categories['item_cat_sub'])
    
    #-- Shop Name --#
    
    print('Preparing Shop Data ...')
    
    shop_filt = shops['shop_name'] == 'Сергиев Посад ТЦ "7Я"'
    shops.loc[shop_filt, 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
    shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
    city_filt = shops['city'] == '!Якутск'
    shops.loc[city_filt, 'city'] = 'Якутск'

    # label encode city
    city_label_enc = preprocessing.LabelEncoder()
    city_label_enc.fit(shops['city'].unique())
    shops['city_enc'] = city_label_enc.transform(shops['city'])
    
    #-- Downcast Data --#
    
    # recast data
    sample_submission = recast_df(dataset = sample_submission)
    items = recast_df(dataset = items)
    shops = recast_df(dataset = shops)
    item_categories = recast_df(dataset = item_categories)
    sales_train = recast_df(dataset = sales_train)
    test = recast_df(dataset = test)
    
    #-- Output Files --#
    
    print('Outputting cleaned raw data ...')
    
    # output the data
    sample_submission.to_feather(cons.sample_submission_clean_fpath)
    items.to_feather(cons.items_clean_fpath)
    shops.to_feather(cons.shops_clean_fpath)
    item_categories.to_feather(cons.item_categories_clean_fpath)
    sales_train.to_feather(cons.sales_train_clean_fpath)
    test.to_feather(cons.test_clean_fpath)
    
    return 0
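
load_files is another project helper not shown here. A plausible sketch follows, assuming it returns the six competition datasets for a given pipeline stage; the cons attribute names below are hypothetical:

import pandas as pd

def load_files_sketch(stage, cons):
    """Illustrative only: load the six datasets for a pipeline stage."""
    if stage == 'raw':
        paths = cons.raw_fpaths      # hypothetical dict of dataset name -> csv path
        read = pd.read_csv
    else:
        paths = cons.clean_fpaths    # hypothetical dict of dataset name -> feather path
        read = pd.read_feather
    names = ['item_categories', 'items', 'sales_train',
             'sample_submission', 'shops', 'test']
    # return in the order the callers unpack them
    return tuple(read(paths[name]) for name in names)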
Example 3
import pandas as pd
import pickle as pk

# project helpers (load_files, gen_most_recent_item_price, recast_df) and the
# clean_cons constants module are assumed in scope
def agg_base_data(cons):
    """
    
    Aggregate Base Data Documentation
    
    Function Overview
    
    This function aggregates the raw data to a month, shop and item level, the same level as the test submission file.
    Additional processing steps are taken, such as removing duplicate shop ids, calculating total revenue and recasting data types.
    
    Defaults
    
    agg_base_data(cons)
    
    Parameters
    
    cons - Python Module, the programme constants for the competition
    
    Returns
    
    0 for successful execution
    
    Example
    
    agg_base_data(cons = cons)
    
    """

    print('Loading clean data ...')

    # load in the cleaned data
    item_categories, items, sales_train, sample_submission, shops, test = load_files(
        'clean', cons)

    print('aggregating base data ...')

    # want to aggregate to date_block_num, shop and product level
    sales_train = sales_train.sort_values(by=clean_cons.group_cols)
    agg_base = sales_train.groupby(clean_cons.group_cols,
                                   as_index=False).agg(clean_cons.agg_dict)

    # add ID column
    agg_base['ID'] = agg_base.index

    print('Create generalised test data ...')

    # create static columns
    test['year'] = 2015
    test['month'] = 11
    test['date_block_num'] = 34
    test['item_cnt_day'] = 0
    test['n_refund'] = 0
    test['n_sale'] = 0

    print('Pickling holdout shop_item_id combination...')

    test['shop_item_id'] = test['shop_id'].astype(
        str) + '_' + test['item_id'].astype(str)
    holdout_shop_item_id = test['shop_item_id'].unique()
    pk.dump(holdout_shop_item_id, open(cons.holdout_shop_item_id_comb, "wb"))

    print('Getting most recent sale price ...')

    # Generate most recent item price for test set
    recent_price = gen_most_recent_item_price(dataset=agg_base)
    join_cols = ['item_id']
    base_test_price = test.merge(recent_price, on=join_cols, how='left')

    # Fill in -999 default for missing prices
    base_test_price['item_price'] = base_test_price['item_price'].fillna(-999)
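
    # NOTE (illustrative assumption, not the project's helper):
    # gen_most_recent_item_price presumably returns one row per item_id with
    # the price from its latest date_block_num, e.g. something like
    #   agg_base.sort_values('date_block_num')
    #           .groupby('item_id', as_index = False)['item_price'].last()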

    print('Concatenate Base and Test data ...')

    base_concat = pd.concat(objs=[agg_base, base_test_price],
                            axis=0,
                            ignore_index=True)

    # note this impacts shop item id combination
    print('Removing duplicate shops ...')

    filt_shop_0 = base_concat['shop_id'] == 0
    filt_shop_1 = base_concat['shop_id'] == 1
    filt_shop_10 = base_concat['shop_id'] == 10

    base_concat.loc[filt_shop_0, 'shop_id'] = 57
    base_concat.loc[filt_shop_1, 'shop_id'] = 58
    base_concat.loc[filt_shop_10, 'shop_id'] = 11

    print('Calculate revenue ...')

    base_concat[
        'revenue'] = base_concat['item_price'] * base_concat['item_cnt_day']

    print('Clip item count day totals to [0, 20] interval ...')

    base_concat['item_cnt_day'] = base_concat['item_cnt_day'].clip(lower = 0, upper = 20)

    # data shape
    shape = base_concat.shape

    print('Recast data ...')

    base_concat = recast_df(dataset=base_concat)

    print('outputting aggregated base data {} ...'.format(shape))

    # output aggregated base data as feather file
    base_concat.to_feather(cons.base_agg_data_fpath)

    return 0
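
clean_cons.group_cols and clean_cons.agg_dict live in a project constants module that is not shown. Plausible values, consistent with the columns used above but assumptions nonetheless, might be:

# assumed constants module contents (not the project's actual file)
group_cols = ['date_block_num', 'shop_id', 'item_id', 'year', 'month']
agg_dict = {'item_cnt_day': 'sum',   # monthly units sold
            'item_price': 'mean',    # average sale price in the month
            'n_refund': 'sum',
            'n_sale': 'sum'}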
Example 4
import pandas as pd

# project helpers (load_files, backfill_attr, fill_id, recast_df) are assumed in scope
def back_fill_missing_items(cons):
    """
    
    Back Fill Missing Items Documentation
    
    Function Overview
    
    This function back fills items found in the holdout set that are not found in the train, valid and test sets.
    Additional processing steps include creating a primary key and creating indicators for missing data and irrelevant observations.
    
    Defaults
    
    back_fill_missing_items(cons)
    
    Parameters
    
    cons - Python Module, the programme constants for the competition
    
    Returns
    
    0 for successful execution
    
    Example
    
    back_fill_missing_items(cons)
    
    """

    print('working on item sells and price ...')

    # load in the cleaned data
    item_categories, items, sales_train, sample_submission, shops, test = load_files(
        'clean', cons)

    agg_base = pd.read_feather(cons.base_agg_data_fpath)

    del sales_train, sample_submission, test

    #-- Price --#

    price_unstack = backfill_attr(dataset=agg_base,
                                  pivot_values=['item_price'],
                                  fillna=-999,
                                  pivot_index=['date_block_num'],
                                  pivot_columns=['shop_id', 'item_id'],
                                  ffill=True)

    #-- Monthly Sales --#

    total_unstack = backfill_attr(dataset=agg_base,
                                  pivot_values=['item_cnt_day'],
                                  fillna=0,
                                  pivot_index=['date_block_num'],
                                  pivot_columns=['shop_id', 'item_id'],
                                  ffill=False)

    #-- Revenue --#

    revenue_unstack = backfill_attr(dataset=agg_base,
                                    pivot_values=['revenue'],
                                    fillna=0,
                                    pivot_index=['date_block_num'],
                                    pivot_columns=['shop_id', 'item_id'],
                                    ffill=False)

    #-- ID --#

    print('Subsetting the ID column ...')

    sub_cols = ['shop_id', 'item_id', 'date_block_num', 'ID']
    id_df = agg_base[sub_cols]

    del agg_base

    #-- Join Datasets --#

    print('Joining datasets ...')

    # create a key-only df to join the backfilled datasets onto
    join_cols = ['date_block_num', 'shop_id', 'item_id']
    join_df = price_unstack[join_cols]
    join_df = join_df.merge(price_unstack, on=join_cols, how='left')
    join_df = join_df.merge(total_unstack, on=join_cols, how='left')
    join_df = join_df.merge(revenue_unstack, on=join_cols, how='left')
    join_df = join_df.merge(id_df, on=join_cols, how='left')

    del price_unstack, total_unstack, id_df

    print('Adding data set splits ...')

    join_df['data_split'] = join_df['date_block_num'].apply(
        lambda x: 'train' if x <= 33 else 'holdout')
    join_df['meta_level'] = join_df['date_block_num'].apply(
        lambda x: 'level_1'
        if x <= 29 else ('level_2' if x >= 30 and x <= 33 else 'holdout'))

    print('Filling in ID ...')

    join_df = fill_id(dataset=join_df, fill_type='range', split='train')
    join_df = fill_id(dataset=join_df, fill_type='range', split='valid')
    join_df = fill_id(dataset=join_df, fill_type='range', split='test')
    join_df = fill_id(dataset=join_df,
                      fill_type='value',
                      split='holdout',
                      fillna=-999)

    print('Create primary key ...')

    join_df['primary_key'] = join_df.index

    print('Create holdout subset indicator ...')

    filt_holdout = join_df['data_split'] == 'holdout'
    filt_id = join_df['ID'] != -999
    join_df['holdout_subset_ind'] = (filt_id & filt_holdout).astype(int)

    print('Mapping missing holdout sales info ...')

    join_df.loc[filt_holdout, 'item_cnt_day'] = 0

    print('Create no sales history indicator ...')

    filt_default_price = join_df['item_price'] == -999
    join_df['no_sales_hist_ind'] = filt_default_price.astype(int)

    print('Create no sales history holdout set indicator ...')

    filt_no_sales_holdout = filt_default_price & filt_holdout
    join_df['no_holdout_sales_hist_ind'] = filt_no_sales_holdout.astype(int)
    join_df['item_price'] = join_df['item_price'].replace(-999, 0)

    print('Removing observations not in holdout set ...')

    # NOTE: this step drops a lot of information
    # need to filter out excess shop / item combinations not found in the holdout set
    # ideally this should save on runtime resources
    join_df['shop_item_id'] = join_df['shop_id'].astype(
        str) + '_' + join_df['item_id'].astype(str)
    holdout = join_df[join_df['data_split'] == 'holdout']
    id_null = holdout['ID'] == -999
    null_holdout = holdout[id_null]
    shop_item_id = null_holdout['shop_item_id'].unique()
    filt_no_test = ~join_df['shop_item_id'].isin(shop_item_id)
    join_df_filt = join_df.loc[filt_no_test, :].reset_index(drop=True)

    print('Recast data ...')

    join_df_filt = recast_df(dataset=join_df_filt)

    shape = join_df_filt.shape

    print('Outputting file {} ...'.format(shape))

    # output aggregated base data as feather file
    join_df_filt.to_feather(cons.base_agg_comp_fpath)

    return 0
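
backfill_attr is the project helper doing the heavy lifting above. Below is a minimal sketch of the idea, assuming it expands one attribute to the full month x shop x item grid via a pivot; the internals are assumptions:

import pandas as pd

def backfill_attr_sketch(dataset, pivot_values, fillna, pivot_index,
                         pivot_columns, ffill):
    """Illustrative only: expand one attribute to the full month x shop x item grid."""
    value = pivot_values[0]
    # wide table with a cell for every month x observed shop/item combination
    pivot = pd.pivot_table(data=dataset, values=value, index=pivot_index,
                           columns=pivot_columns, aggfunc='sum')
    if ffill:
        pivot = pivot.ffill()        # carry the last observed value forward in time
    pivot = pivot.fillna(fillna)     # default for never-observed cells
    # back to long format: one row per month x shop x item
    return (pivot.stack(pivot_columns)
                 .rename(value)
                 .reset_index())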
Example 5
import pandas as pd

# project helpers (gen_attr_agg_totals, months_since_purchase, n_price_changes,
# recast_df) are assumed in scope
def gen_shift_attrs(cons):
    """
    
    Generate Shift Attributes Documentation
    
    Function Overview
    
    This function generates the aggregate total attributes that are later shifted / lagged.
    This is achieved by creating pivot tables on shop, item, category and city levels by date_block_num.
    Additional attributes are also created, such as months since first and last purchase, and the number of item price changes.
    
    Defaults
    
    gen_shift_attrs(cons)
    
    Parameters
    
    cons - Python Module, the programme constants for the competition
    
    Returns
    
    0 for successful execution
    
    Example
    
    gen_shift_attrs(cons = cons)
    
    """

    # load in the aggregated supplementary data from feather file
    base_agg_comp = pd.read_feather(cons.base_agg_supp_fpath)

    shape = base_agg_comp.shape

    print(shape)

    # set function inputs for total aggregate attributes
    values_total = ['item_cnt_day']
    index_total = ['date_block_num']
    fill_na = 0

    # set additional function inputs for total shift attributes
    #columns_shift_shop_total = ['shop_id']
    #columns_shift_item_total = ['item_id']

    # TODO: aggregate total month sales and lag by one
    # TODO: aggregate by shop category, sub category and city

    ###############################
    #-- Mean / Total Aggregates --#
    ###############################

    print('Calculating sold item totals for shop id ...')

    # generate the shop sell totals
    base_agg_comp = gen_attr_agg_totals(dataset=base_agg_comp,
                                        values=values_total,
                                        index=index_total,
                                        columns=['shop_id'],
                                        fill_value=fill_na)

    print('Calculating sold item totals for item id ...')

    # generate item sell totals
    base_agg_comp = gen_attr_agg_totals(dataset=base_agg_comp,
                                        values=values_total,
                                        index=index_total,
                                        columns=['item_id'],
                                        fill_value=fill_na)

    print('Calculating sold item totals for item category id ...')

    # generate item sell totals
    base_agg_comp = gen_attr_agg_totals(dataset=base_agg_comp,
                                        values=values_total,
                                        index=index_total,
                                        columns=['item_category_id'],
                                        fill_value=fill_na)

    print('Calculating sold item totals for item category id and shop id ...')

    # generate item sell totals
    base_agg_comp = gen_attr_agg_totals(
        dataset=base_agg_comp,
        values=values_total,
        index=index_total,
        columns=['shop_id', 'item_category_id'],
        fill_value=fill_na)

    print('Calculating sold item totals for city ...')

    # generate item sell totals
    base_agg_comp = gen_attr_agg_totals(dataset=base_agg_comp,
                                        values=values_total,
                                        index=index_total,
                                        columns=['city_enc'],
                                        fill_value=fill_na)

    print('Calculating sold item totals for item id and city ...')

    # generate item sell totals
    base_agg_comp = gen_attr_agg_totals(dataset=base_agg_comp,
                                        values=values_total,
                                        index=index_total,
                                        columns=['item_id', 'city_enc'],
                                        fill_value=fill_na)

    print('Generating months since first and last purchases ...')

    print('shop_id & item_id ...')

    # generate item sell totals
    base_agg_comp = months_since_purchase(dataset=base_agg_comp,
                                          values=['item_cnt_day'],
                                          index=['date_block_num'],
                                          columns=['shop_id', 'item_id'])

    print('item_id ...')

    # generate item sell totals
    base_agg_comp = months_since_purchase(dataset=base_agg_comp,
                                          values=['item_cnt_day'],
                                          index=['date_block_num'],
                                          columns=['item_id'])

    print('Generating number of price changes ...')

    base_agg_comp = n_price_changes(dataset=base_agg_comp,
                                    values=['item_price'],
                                    index=['date_block_num'],
                                    columns=['shop_id', 'item_id'])

    print('Recast data ...')

    base_agg_comp = recast_df(dataset=base_agg_comp)

    # output file to feather file
    model_data = base_agg_comp.reset_index(drop=True)
    model_data.to_feather(cons.base_agg_shft_fpath)

    return 0
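
gen_attr_agg_totals is not shown in these examples. A minimal sketch follows, assuming it appends a group total named to match the columns consumed elsewhere (e.g. shop_id_total_item_cnt_day); the internals are assumptions:

def gen_attr_agg_totals_sketch(dataset, values, index, columns, fill_value):
    """Illustrative only: append the total of `values` per index x columns group."""
    group_cols = index + columns
    # e.g. columns=['shop_id'], values=['item_cnt_day'] -> 'shop_id_total_item_cnt_day'
    name = '_'.join(columns) + '_total_' + values[0]
    totals = (dataset.groupby(group_cols, as_index=False)[values[0]]
                     .sum()
                     .rename(columns={values[0]: name}))
    dataset = dataset.merge(totals, on=group_cols, how='left')
    dataset[name] = dataset[name].fillna(fill_value)
    return dataset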
Example 6
import pandas as pd

# project helpers (load_files, gen_retail_calender, recast_df) are assumed in scope
def append_supplement_attrs(cons):
    """
    
    Append Supplementary Attributes Documentation
    
    Function Overview
    
    This function appends supplementary attributes to the data.
    These attributes include retail calendar specific information and price information.
    
    Defaults
    
    append_supplement_attrs(cons)
    
    Parameters
    
    cons - Python Module, the programme constants for the competition
    
    Returns
    
    0 for successful execution
    
    Example
    
    append_supplement_attrs(cons = cons)
    
    """

    # load in the cleaned data
    item_categories, items, sales_train, sample_submission, shops, test = load_files(
        'clean', cons)

    # load in the aggregated base data from feather file
    base_agg_comp = pd.read_feather(cons.base_agg_comp_fpath)

    shape = base_agg_comp.shape
    print(shape)

    print('Extract the price info ...')

    # extract the digits after the decimal point of the item price
    base_agg_comp['price_decimal'] = base_agg_comp['item_price'].astype(
        str).str.extract(r'\d+\.(\d*)')[0].astype(float)
    base_agg_comp['price_decimal_len'] = base_agg_comp['item_price'].astype(
        str).str.extract(r'\d+\.(\d*)')[0].str.len()

    print('Joining clean data ...')

    # join all data sets together
    base_agg_comp = base_agg_comp.merge(items, on='item_id', how='left')
    base_agg_comp = base_agg_comp.merge(item_categories,
                                        on='item_category_id',
                                        how='left')
    base_agg_comp = base_agg_comp.merge(shops, on='shop_id', how='left')

    print('Subset required columns ...')

    base_cols = base_agg_comp.columns
    drop_cols = ['item_name', 'item_category_name', 'shop_name']
    sub_cols = base_cols[~base_cols.isin(drop_cols)]
    base_agg_comp = base_agg_comp[sub_cols]

    print('Generate calendar days ...')

    retail_calendar = gen_retail_calender()
    join_cols = ['date_block_num']
    base_agg_comp = base_agg_comp.merge(retail_calendar,
                                        on=join_cols,
                                        how='left')

    shape = base_agg_comp.shape

    print('Recast data ...')

    base_agg_comp = recast_df(dataset=base_agg_comp)

    print('Outputting supplementary data {} ...'.format(shape))

    # output file as a feather file
    base_agg_comp.to_feather(cons.base_agg_supp_fpath)

    return 0
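
gen_retail_calender (keeping the project's spelling of the identifier) is also a project helper. Below is a sketch of what it plausibly returns, one row per date_block_num with calendar attributes over the competition's Jan 2013 to Nov 2015 window; the choice of attributes is an assumption:

import pandas as pd

def gen_retail_calender_sketch():
    """Illustrative only: calendar attributes keyed by date_block_num."""
    # date_block_num 0 = Jan 2013 ... 34 = Nov 2015, matching the holdout month
    months = pd.date_range('2013-01-01', '2015-11-01', freq='MS')
    return pd.DataFrame({
        'date_block_num': range(len(months)),
        'n_days_in_month': months.days_in_month,
        'n_weekend_days': [sum(d.weekday() >= 5
                               for d in pd.date_range(m, m + pd.offsets.MonthEnd(0)))
                           for m in months]})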