def main():
    """Load train and test, map additional data, split validation and save as pickle."""
    print("Read train and test files")
    train, test = read_train_test()

    print("Read and map campaign start and end dates")
    kws = {
        "parse_dates": [FieldNames.campaign_start_date, FieldNames.campaign_end_date],
        "dayfirst": True,
    }
    campaign_data = read_csv(FileNames.campaign, **kws)
    train = pd.merge(train, campaign_data, on="campaign_id", how="left")
    test = pd.merge(test, campaign_data, on="campaign_id", how="left")

    print("Read and map demograhics data")
    demog_data = read_csv(FileNames.demogs)
    train = pd.merge(train, demog_data, on="customer_id", how="left")
    test = pd.merge(test, demog_data, on="customer_id", how="left")
    for col, mapping in [
        (FieldNames.age_range, AGE_MAP),
        (FieldNames.marital_status, MARITAL_STATUS),
        (FieldNames.family_size, FAMILY_SIZE),
        (FieldNames.no_of_children, NO_OF_CHILDREN),
        (FieldNames.campaign_type, CAMPAIGN_TYPE),
    ]:
        train[col] = map_to_float(train, col, mapping)
        test[col] = map_to_float(test, col, mapping)

    print("Read coupon and item details and merge them")
    coupon_data = read_csv(FileNames.coupon_item)
    item_data = read_csv(FileNames.item)
    coupon_data = pd.merge(coupon_data, item_data, on="item_id", how="left")

    print("Map coupon details to train")
    coupon_grouped = coupon_data.groupby("coupon_id").agg(
        {"item_id": list, "brand": list, "brand_type": list, "category": list}
    ).reset_index()  # keep coupon_id as a column so the merges below are explicit
    train = pd.merge(train, coupon_grouped, on="coupon_id", how="left")
    test = pd.merge(test, coupon_grouped, on="coupon_id", how="left")

    train = train.rename(columns={'item_id': FieldNames.item_set})
    test = test.rename(columns={'item_id': FieldNames.item_set})

    print("split train --> tr and val")
    tr = train.loc[~train[FieldNames.campaign_id].isin([11, 12, 13])]
    val = train.loc[train[FieldNames.campaign_id].isin([11, 12, 13])]

    print("save as pickle")
    save_pickle(train, FileNames.train_v2)
    save_pickle(test, FileNames.test_v2)
    save_pickle(tr, FileNames.tr_v2)
    save_pickle(val, FileNames.val_v2)
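# Hedged sketch of the `map_to_float` helper used above (its source is not in
# this excerpt); the assumption is that each mapping is a plain
# {category: float} dict and that unmapped values should become NaN.
def map_to_float(df, col, mapping):
    """Map a categorical column to floats via `mapping`; unknowns -> NaN."""
    return df[col].map(mapping).astype(float)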
def main():
    """Load train/test, merge demographics and campaign data, and write the v0/v1 CSV splits."""
    train, test = read_train_test()
    demog_df = read_csv(FileNames.demogs)
    demog_df[FieldNames.no_of_children] = demog_df[
        FieldNames.no_of_children].fillna(0)

    camp_data = read_csv(FileNames.campaign)
    camp_data = convert_to_datetime(camp_data, FieldNames.campaign_start_date,
                                    **{'dayfirst': True})
    camp_data = convert_to_datetime(camp_data, FieldNames.campaign_end_date,
                                    **{'dayfirst': True})
    # Encode the campaign end date as a coarse integer feature: nanoseconds
    # since epoch, floor-divided by 10**12 (~17-minute resolution).
    camp_data[FieldNames.date_int] = (
        camp_data[FieldNames.campaign_end_date].astype("int64") // 10**12)

    # Sanity check: inspect the raw demographic levels before mapping them.
    print([demog_df[col].unique() for col in demog_df.columns])
    train = pd.merge(train, demog_df, on='customer_id', how='left')
    test = pd.merge(test, demog_df, on='customer_id', how='left')

    train = pd.merge(train, camp_data, on='campaign_id', how='left')
    test = pd.merge(test, camp_data, on='campaign_id', how='left')

    for col, mapping in [(FieldNames.age_range, AGE_MAP),
                         (FieldNames.marital_status, MARITAL_STATUS),
                         (FieldNames.family_size, FAMILY_SIZE),
                         (FieldNames.no_of_children, NO_OF_CHILDREN)]:
        train[col] = map_to_float(train, col, mapping)
        test[col] = map_to_float(test, col, mapping)

    tr, val = split_train_validation(train, val_campaigns=(12, 13))
    write_csv(train, FileNames.train_v0)
    write_csv(test, FileNames.test_v0)
    write_csv(tr, FileNames.tr_v0)
    write_csv(val, FileNames.val_v0)

    tr, val = split_train_validation(train, val_campaigns=(26, 27, 28, 29, 30))
    write_csv(tr, FileNames.tr_v1)
    write_csv(val, FileNames.val_v1)

    cust_transactions = read_csv(FileNames.transaction)
    item_data = read_csv(FileNames.item)
    cust_transactions = pd.merge(cust_transactions,
                                 item_data,
                                 on=FieldNames.item_id,
                                 how='left')
    cust_transactions = convert_to_datetime(cust_transactions,
                                            FieldNames.transaction_date)
    # Keep the full history for test-time features; truncate at a fixed cutoff
    # for validation-time features so they never see future transactions.
    cust_transactions_v0 = cust_transactions.loc[
        cust_transactions[FieldNames.transaction_date] <= '2013-05-10']
    write_csv(cust_transactions, FileNames.transaction_test_v0)
    write_csv(cust_transactions_v0, FileNames.transaction_val_v0)
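# Hedged sketch of `split_train_validation` (assumed helper): hold out the
# given campaign ids as validation and keep the remaining campaigns for
# training, mirroring the isin() split in the pickle pipeline above.
def split_train_validation(train, val_campaigns):
    mask = train[FieldNames.campaign_id].isin(val_campaigns)
    return train.loc[~mask], train.loc[mask]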
def map_coupon_items(df):
    coupon_items = read_csv(FileNames.coupon_item)
    item_data = read_csv(FileNames.item)
    coupon_items = pd.merge(coupon_items,
                            item_data,
                            on=FieldNames.item_id,
                            how="left")
    coupon_items_map = (coupon_items.groupby(
        FieldNames.coupon_id)[FieldNames.item_id].apply(set).to_dict())
    coupon_brand_map = (coupon_items.groupby(
        FieldNames.coupon_id)[FieldNames.item_brand].apply(set).to_dict())
    coupon_category_map = (coupon_items.groupby(
        FieldNames.coupon_id)[FieldNames.item_category].apply(set).to_dict())

    df[FieldNames.item_set] = df[FieldNames.coupon_id].map(coupon_items_map)
    df[FieldNames.item_brand] = df[FieldNames.coupon_id].map(coupon_brand_map)
    df[FieldNames.item_category] = df[FieldNames.coupon_id].map(
        coupon_category_map)
    return df
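# Hedged sketch of the `read_csv` wrapper from mllib.utils (assumption: the
# FileNames attributes hold plain file paths and extra keyword arguments are
# forwarded to pandas).
def read_csv(fname, **kwargs):
    return pd.read_csv(fname, **kwargs)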
def prepare_transactions():
    """Create validation customer transaction data; Aggregate by date and user."""
    cust_transact = read_csv(FileNames.transaction,
                             **{"parse_dates": [FieldNames.transaction_date]})
    item_details = read_csv(FileNames.item)
    cust_transact = pd.merge(cust_transact,
                             item_details,
                             on=FieldNames.item_id,
                             how="left")
    cust_transact[FieldNames.pct_discount] = (
        cust_transact[FieldNames.coupon_discount] /
        cust_transact[FieldNames.selling_price])
    cust_transact[FieldNames.transaction_dayofweek] = cust_transact[
        FieldNames.transaction_date].dt.dayofweek
    cust_transact_tr = cust_transact.loc[
        cust_transact[FieldNames.transaction_date] <= "2013-05-10"]

    print("Saving to pickle")
    save_pickle(cust_transact, FileNames.transaction_test_v1)
    save_pickle(cust_transact_tr, FileNames.transaction_val_v1)
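# Hedged sketches of two assumed I/O helpers used throughout this file:
# `save_pickle` (pickling preserves dtypes such as datetimes and the list/set
# columns, unlike a CSV round-trip) and `convert_to_datetime`.
def save_pickle(df, fname):
    df.to_pickle(fname)


def convert_to_datetime(df, col, **kwargs):
    df[col] = pd.to_datetime(df[col], **kwargs)
    return df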
def brand_cat_maps():
    """Build integer-code maps for item brand_type and category."""
    item_data = read_csv(FileNames.item)
    brand_type_map = {
        v: i
        for i, v in enumerate(item_data[FieldNames.item_brand_type].unique())
    }
    category_map = {
        v: i
        for i, v in enumerate(item_data[FieldNames.item_category].unique())
    }
    return brand_type_map, category_map
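# Illustrative (hypothetical) helper showing how the maps above would be
# applied: encode brand_type and category as integer codes on item data.
def encode_item_codes(item_data):
    brand_type_map, category_map = brand_cat_maps()
    item_data[FieldNames.item_brand_type] = item_data[
        FieldNames.item_brand_type].map(brand_type_map)
    item_data[FieldNames.item_category] = item_data[
        FieldNames.item_category].map(category_map)
    return item_data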
def load_train_test(flag="test", version="v0"):
    """Load data."""
    tr_fname, te_fname = get_file_strings(flag, version)
    tr = read_csv(tr_fname)
    te = read_csv(te_fname)

    # The v0/v1 CSVs were written by write_csv with parsed dates, so default
    # parsing (no dayfirst flag) suffices here.
    for df in (tr, te):
        for col in (FieldNames.campaign_start_date,
                    FieldNames.campaign_end_date):
            df[col] = pd.to_datetime(df[col])

    tr = map_coupon_items(tr)
    te = map_coupon_items(te)

    # tr = map_transact_agg(tr, flag)
    # te = map_transact_agg(te, flag)

    return tr, te
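# Hedged sketch of `get_file_strings` (assumed helper): resolve the file pair
# for a run, e.g. flag="val" -> (tr_v0, val_v0), flag="test" -> (train_v0, test_v0).
def get_file_strings(flag="test", version="v0"):
    if flag == "val":
        return (getattr(FileNames, "tr_" + version),
                getattr(FileNames, "val_" + version))
    return (getattr(FileNames, "train_" + version),
            getattr(FileNames, "test_" + version))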
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

from mllib.params import FieldNames, FileNames
from mllib.utils import read_csv

if __name__ == '__main__':
    coupon_item = read_csv(FileNames.coupon_item)
    data = np.ones(len(coupon_item))
    # Coupon-item incidence matrix: rows indexed by coupon_id, columns by item_id.
    A_sparse = csr_matrix((data, (coupon_item[FieldNames.coupon_id].values,
                                  coupon_item[FieldNames.item_id].values)))
    nmf = NMF(n_components=16)
    coupon_vectors = nmf.fit_transform(A_sparse)
    print("Done fitting model.")
    item_vectors = nmf.components_.T
    name = 'nmf'
    np.save('data/coupon_vectors_{}.npy'.format(name), coupon_vectors)
    np.save('data/item_vectors_{}.npy'.format(name), item_vectors)
    print("Done saving nmf vectors.")
def load_artifacts(flag="test", version="v0"):
    """Load artifacts required for transformers."""
    tr_fname, _ = get_file_strings(flag, version)
    tr = read_csv(tr_fname)
    tr = convert_to_datetime(tr, FieldNames.campaign_start_date,
                             **{"dayfirst": True})
    tr_artifact = HistoricalArtifact(
        tr,
        user_field=FieldNames.customer_id,
        date_field=FieldNames.campaign_start_date,
        key_fields=[
            FieldNames.campaign_id, FieldNames.coupon_id, FieldNames.target
        ],
    )
    del tr
    transaction_file = "transaction_{flag}_v0".format(flag=flag)
    transaction_file = getattr(FileNames, transaction_file)
    transactions = read_csv(transaction_file)
    transactions = convert_to_datetime(transactions,
                                       col=FieldNames.transaction_date)
    transactions[FieldNames.transaction_dayofweek] = transactions[
        FieldNames.transaction_date].dt.dayofweek
    # Discount columns can carry negative signs in the raw data, so take
    # absolute values, then compute the discount fraction with +1 smoothing
    # on the denominator to avoid division by zero.
    transactions[FieldNames.coupon_discount] = np.abs(
        transactions[FieldNames.coupon_discount])
    transactions[FieldNames.other_discount] = np.abs(
        transactions[FieldNames.other_discount])
    transactions[FieldNames.pct_discount] = (
        transactions[FieldNames.coupon_discount] /
        (1 + transactions[FieldNames.selling_price]))

    # transactions2: only rows where a coupon was actually redeemed.
    transactions2 = transactions.loc[
        transactions[FieldNames.coupon_discount] > 0, :]
    transactions2 = group_transactions(transactions2)

    # transactions3: rows where the coupon discount exceeded the other discount.
    transactions3 = transactions.loc[transactions[FieldNames.coupon_discount] >
                                     transactions[FieldNames.other_discount]]
    transactions3 = group_transactions(transactions3)

    transactions = group_transactions(transactions)
    # Quick sanity check of the grouped frames.
    print(transactions.head(), transactions2.head())
    cust_artifact1 = HistoricalArtifact(
        transactions,
        user_field=FieldNames.customer_id,
        date_field=FieldNames.transaction_date,
        key_fields=[
            FieldNames.item_set,
            FieldNames.item_brand,
            FieldNames.item_category,
            FieldNames.pct_discount,
            FieldNames.selling_price,
            FieldNames.coupon_discount,
            FieldNames.other_discount,
            FieldNames.quantity,
            FieldNames.transaction_dayofweek,
        ],
    )

    cust_artifact2 = HistoricalArtifact(
        transactions2,
        user_field=FieldNames.customer_id,
        date_field=FieldNames.transaction_date,
        key_fields=[
            FieldNames.item_set,
            FieldNames.item_brand,
            FieldNames.item_category,
            FieldNames.pct_discount,
            FieldNames.selling_price,
            FieldNames.coupon_discount,
            FieldNames.other_discount,
            FieldNames.quantity,
        ],
    )

    cust_artifact3 = HistoricalArtifact(
        transactions3,
        user_field=FieldNames.customer_id,
        date_field=FieldNames.transaction_date,
        key_fields=[
            FieldNames.item_set,
            FieldNames.item_brand,
            FieldNames.item_category,
            FieldNames.pct_discount,
            FieldNames.selling_price,
            FieldNames.coupon_discount,
            FieldNames.other_discount,
            FieldNames.quantity,
        ],
    )

    return tr_artifact, cust_artifact1, cust_artifact2, cust_artifact3
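# Hedged sketch of `group_transactions` (assumed helper): collapse item-level
# rows into one row per customer per transaction date, collecting the fields
# that the HistoricalArtifact key_fields above expect.
def group_transactions(transactions):
    grouped = (transactions
               .groupby([FieldNames.customer_id, FieldNames.transaction_date])
               .agg({FieldNames.item_id: set,
                     FieldNames.item_brand: set,
                     FieldNames.item_category: set,
                     FieldNames.pct_discount: list,
                     FieldNames.selling_price: list,
                     FieldNames.coupon_discount: list,
                     FieldNames.other_discount: list,
                     FieldNames.quantity: list,
                     FieldNames.transaction_dayofweek: "first"})
               .rename(columns={FieldNames.item_id: FieldNames.item_set})
               .reset_index())
    return grouped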