def main(): """Load train and test, map additional data, split validation and save as pickle.""" print("Read train and test files") train, test = read_train_test() print("Read and map campaign start and end dates") kws = { "parse_dates": [FieldNames.campaign_start_date, FieldNames.campaign_end_date], "dayfirst": True, } campaign_data = read_csv(FileNames.campaign, **kws) train = pd.merge(train, campaign_data, on="campaign_id", how="left") test = pd.merge(test, campaign_data, on="campaign_id", how="left") print("Read and map demograhics data") demog_data = read_csv(FileNames.demogs) train = pd.merge(train, demog_data, on="customer_id", how="left") test = pd.merge(test, demog_data, on="customer_id", how="left") for col, mapping in [ (FieldNames.age_range, AGE_MAP), (FieldNames.marital_status, MARITAL_STATUS), (FieldNames.family_size, FAMILY_SIZE), (FieldNames.no_of_children, NO_OF_CHILDREN), (FieldNames.campaign_type, CAMPAIGN_TYPE), ]: train[col] = map_to_float(train, col, mapping) test[col] = map_to_float(test, col, mapping) print("Read coupon and item details and merge them") coupon_data = read_csv(FileNames.coupon_item) item_data = read_csv(FileNames.item) coupon_data = pd.merge(coupon_data, item_data, on="item_id", how="left") print("Map coupon details to train") coupon_grouped = coupon_data.groupby("coupon_id").agg( {"item_id": list, "brand": list, "brand_type": list, "category": list} ) train = pd.merge(train, coupon_grouped, on="coupon_id", how="left") test = pd.merge(test, coupon_grouped, on="coupon_id", how="left") train = train.rename(columns={'item_id': FieldNames.item_set}) test = test.rename(columns={'item_id': FieldNames.item_set}) print("split train --> tr and val") tr = train.loc[~train[FieldNames.campaign_id].isin([11, 12, 13])] val = train.loc[train[FieldNames.campaign_id].isin([11, 12, 13])] print("save as pickle") save_pickle(train, FileNames.train_v2) save_pickle(test, FileNames.test_v2) save_pickle(tr, FileNames.tr_v2) save_pickle(val, FileNames.val_v2)
def main():
    """Merge demographics and campaign data, build the v0/v1 splits and transaction files."""
    train, test = read_train_test()

    demog_df = read_csv(FileNames.demogs)
    demog_df[FieldNames.no_of_children] = demog_df[
        FieldNames.no_of_children].fillna(0)

    camp_data = read_csv(FileNames.campaign)
    camp_data = convert_to_datetime(camp_data,
                                    FieldNames.campaign_start_date,
                                    dayfirst=True)
    camp_data = convert_to_datetime(camp_data,
                                    FieldNames.campaign_end_date,
                                    dayfirst=True)
    # Integer key derived from the campaign-end timestamp (nanoseconds
    # scaled down), usable for sorting and comparisons.
    camp_data[FieldNames.date_int] = (
        camp_data[FieldNames.campaign_end_date].astype('int64') // 10**12)

    # Quick sanity check of the raw demographic levels.
    print([demog_df[col].unique() for col in demog_df.columns])

    train = pd.merge(train, demog_df, on='customer_id', how='left')
    test = pd.merge(test, demog_df, on='customer_id', how='left')
    train = pd.merge(train, camp_data, on='campaign_id', how='left')
    test = pd.merge(test, camp_data, on='campaign_id', how='left')

    for col, mapping in [(FieldNames.age_range, AGE_MAP),
                         (FieldNames.marital_status, MARITAL_STATUS),
                         (FieldNames.family_size, FAMILY_SIZE),
                         (FieldNames.no_of_children, NO_OF_CHILDREN)]:
        train[col] = map_to_float(train, col, mapping)
        test[col] = map_to_float(test, col, mapping)

    tr, val = split_train_validation(train, val_campaigns=(12, 13))
    write_csv(train, FileNames.train_v0)
    write_csv(test, FileNames.test_v0)
    write_csv(tr, FileNames.tr_v0)
    write_csv(val, FileNames.val_v0)

    tr, val = split_train_validation(train, val_campaigns=(26, 27, 28, 29, 30))
    write_csv(tr, FileNames.tr_v1)
    write_csv(val, FileNames.val_v1)

    cust_transactions = read_csv(FileNames.transaction)
    item_data = read_csv(FileNames.item)
    cust_transactions = pd.merge(cust_transactions,
                                 item_data,
                                 on=FieldNames.item_id,
                                 how='left')
    cust_transactions = convert_to_datetime(cust_transactions,
                                            FieldNames.transaction_date)
    # Transactions up to 2013-05-10 serve as history for the validation split.
    cust_transactions_v0 = cust_transactions.loc[
        cust_transactions[FieldNames.transaction_date] <= '2013-05-10']
    write_csv(cust_transactions, FileNames.transaction_test_v0)
    write_csv(cust_transactions_v0, FileNames.transaction_val_v0)
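# `split_train_validation` is not defined in this section; a sketch of the
# assumed behavior, mirroring the manual campaign-id split used in the v2
# pipeline above:
def split_train_validation(train, val_campaigns):
    # Rows whose campaign_id is in `val_campaigns` become the validation
    # fold; everything else stays in train.
    mask = train[FieldNames.campaign_id].isin(list(val_campaigns))
    return train.loc[~mask], train.loc[mask]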
def map_coupon_items(df):
    """Attach each coupon's item, brand and category sets to df."""
    coupon_items = read_csv(FileNames.coupon_item)
    item_data = read_csv(FileNames.item)
    coupon_items = pd.merge(coupon_items, item_data,
                            on=FieldNames.item_id, how="left")
    coupon_items_map = (coupon_items.groupby(
        FieldNames.coupon_id)[FieldNames.item_id].apply(set).to_dict())
    coupon_brand_map = (coupon_items.groupby(
        FieldNames.coupon_id)[FieldNames.item_brand].apply(set).to_dict())
    coupon_category_map = (coupon_items.groupby(
        FieldNames.coupon_id)[FieldNames.item_category].apply(set).to_dict())
    df[FieldNames.item_set] = df[FieldNames.coupon_id].map(coupon_items_map)
    df[FieldNames.item_brand] = df[FieldNames.coupon_id].map(coupon_brand_map)
    df[FieldNames.item_category] = df[FieldNames.coupon_id].map(
        coupon_category_map)
    return df
def prepare_transactions():
    """Create validation customer transaction data; aggregate by date and user."""
    cust_transact = read_csv(FileNames.transaction,
                             parse_dates=[FieldNames.transaction_date])
    item_details = read_csv(FileNames.item)
    cust_transact = pd.merge(cust_transact, item_details,
                             on=FieldNames.item_id, how="left")
    cust_transact[FieldNames.pct_discount] = (
        cust_transact[FieldNames.coupon_discount] /
        cust_transact[FieldNames.selling_price])
    cust_transact[FieldNames.transaction_dayofweek] = cust_transact[
        FieldNames.transaction_date].dt.dayofweek
    cust_transact_tr = cust_transact.loc[
        cust_transact[FieldNames.transaction_date] <= "2013-05-10"]
    print("Saving to pickle")
    save_pickle(cust_transact, FileNames.transaction_test_v1)
    save_pickle(cust_transact_tr, FileNames.transaction_val_v1)
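# `read_csv` and `save_pickle` come from mllib.utils and are not shown in
# this section; a plausible minimal sketch, assuming FileNames attributes
# resolve to file paths:
import pandas as pd


def read_csv(fname, **kwargs):
    # Thin wrapper so callers can pass pandas kwargs such as parse_dates.
    return pd.read_csv(fname, **kwargs)


def save_pickle(df, fname):
    # Persist the DataFrame with pandas' pickle writer.
    df.to_pickle(fname)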
def brand_cat_mapes():
    """Build integer label-encoding maps for item brand type and category."""
    item_data = read_csv(FileNames.item)
    brand_type_map = {
        v: i
        for i, v in enumerate(item_data[FieldNames.item_brand_type].unique())
    }
    category_map = {
        v: i
        for i, v in enumerate(item_data[FieldNames.item_category].unique())
    }
    return brand_type_map, category_map
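# Hypothetical usage of the maps above to label-encode the item table
# (not part of the original pipeline):
brand_type_map, category_map = brand_cat_mapes()
item_data = read_csv(FileNames.item)
item_data[FieldNames.item_brand_type] = item_data[
    FieldNames.item_brand_type].map(brand_type_map)
item_data[FieldNames.item_category] = item_data[
    FieldNames.item_category].map(category_map)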
def load_train_test(flag="test", version="v0"):
    """Load data."""
    tr_fname, te_fname = get_file_strings(flag, version)
    tr = read_csv(tr_fname)
    te = read_csv(te_fname)
    for df in (tr, te):
        df[FieldNames.campaign_start_date] = pd.to_datetime(
            df[FieldNames.campaign_start_date])
        df[FieldNames.campaign_end_date] = pd.to_datetime(
            df[FieldNames.campaign_end_date])
    tr = map_coupon_items(tr)
    te = map_coupon_items(te)
    # tr = map_transact_agg(tr, flag)
    # te = map_transact_agg(te, flag)
    return tr, te
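# Example usage (flag values inferred from the transaction_{flag}_v0 naming
# in load_artifacts below, so presumably "test" or "val"):
# tr, te = load_train_test(flag="val", version="v0")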
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

from mllib.params import FieldNames, FileNames
from mllib.utils import read_csv

if __name__ == '__main__':
    coupon_item = read_csv(FileNames.coupon_item)
    # Binary coupon x item incidence matrix.
    data = np.ones(len(coupon_item))
    A_sparse = csr_matrix((data, (coupon_item[FieldNames.coupon_id].values,
                                  coupon_item[FieldNames.item_id].values)))
    # Factorize into 16-dimensional coupon and item embeddings.
    nmf = NMF(16)
    coupon_vectors = nmf.fit_transform(A_sparse)
    print("Done fitting model.")
    item_vectors = nmf.components_.T
    name = 'nmf'
    np.save('data/coupon_vectors_{}.npy'.format(name), coupon_vectors)
    np.save('data/item_vectors_{}.npy'.format(name), item_vectors)
    print("Done saving nmf vectors.")
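# Hypothetical follow-up showing how the saved factors might be consumed:
# the reconstructed (coupon, item) entry of the incidence matrix acts as an
# affinity score. `nmf_affinity` is an illustrative helper, not part of the
# original pipeline; rows are indexed by the raw ids because the sparse
# matrix above was built directly from them.
import numpy as np

coupon_vectors = np.load('data/coupon_vectors_nmf.npy')
item_vectors = np.load('data/item_vectors_nmf.npy')


def nmf_affinity(coupon_id, item_id):
    # Dot product of the 16-dim embeddings; larger means the factorization
    # places the item closer to the coupon.
    return float(coupon_vectors[coupon_id] @ item_vectors[item_id])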
def load_artifacts(flag="test", version="v0"):
    """Load artifacts required for transformers."""
    tr_fname, _ = get_file_strings(flag, version)
    tr = read_csv(tr_fname)
    tr = convert_to_datetime(tr, FieldNames.campaign_start_date, dayfirst=True)
    tr_artifact = HistoricalArtifact(
        tr,
        user_field=FieldNames.customer_id,
        date_field=FieldNames.campaign_start_date,
        key_fields=[
            FieldNames.campaign_id, FieldNames.coupon_id, FieldNames.target
        ],
    )
    del tr

    transaction_file = "transaction_{flag}_v0".format(flag=flag)
    transaction_file = getattr(FileNames, transaction_file)
    transactions = read_csv(transaction_file)
    transactions = convert_to_datetime(transactions,
                                       col=FieldNames.transaction_date)
    transactions[FieldNames.transaction_dayofweek] = transactions[
        FieldNames.transaction_date].dt.dayofweek
    # Work with discount magnitudes (the raw amounts can be negative).
    transactions[FieldNames.coupon_discount] = np.abs(
        transactions[FieldNames.coupon_discount])
    transactions[FieldNames.other_discount] = np.abs(
        transactions[FieldNames.other_discount])
    transactions[FieldNames.pct_discount] = (
        transactions[FieldNames.coupon_discount] /
        (1 + transactions[FieldNames.selling_price]))

    # Three views of the purchase history: coupon-discounted transactions
    # only, transactions where the coupon discount beat the other discount,
    # and all transactions.
    transactions2 = transactions.loc[
        transactions[FieldNames.coupon_discount] > 0, :]
    transactions2 = group_transactions(transactions2)
    transactions3 = transactions.loc[
        transactions[FieldNames.coupon_discount] >
        transactions[FieldNames.other_discount]]
    transactions3 = group_transactions(transactions3)
    transactions = group_transactions(transactions)
    print(transactions.head(), transactions2.head())

    cust_artifact1 = HistoricalArtifact(
        transactions,
        user_field=FieldNames.customer_id,
        date_field=FieldNames.transaction_date,
        key_fields=[
            FieldNames.item_set,
            FieldNames.item_brand,
            FieldNames.item_category,
            FieldNames.pct_discount,
            FieldNames.selling_price,
            FieldNames.coupon_discount,
            FieldNames.other_discount,
            FieldNames.quantity,
            FieldNames.transaction_dayofweek,
        ],
    )
    cust_artifact2 = HistoricalArtifact(
        transactions2,
        user_field=FieldNames.customer_id,
        date_field=FieldNames.transaction_date,
        key_fields=[
            FieldNames.item_set,
            FieldNames.item_brand,
            FieldNames.item_category,
            FieldNames.pct_discount,
            FieldNames.selling_price,
            FieldNames.coupon_discount,
            FieldNames.other_discount,
            FieldNames.quantity,
        ],
    )
    cust_artifact3 = HistoricalArtifact(
        transactions3,
        user_field=FieldNames.customer_id,
        date_field=FieldNames.transaction_date,
        key_fields=[
            FieldNames.item_set,
            FieldNames.item_brand,
            FieldNames.item_category,
            FieldNames.pct_discount,
            FieldNames.selling_price,
            FieldNames.coupon_discount,
            FieldNames.other_discount,
            FieldNames.quantity,
        ],
    )
    return tr_artifact, cust_artifact1, cust_artifact2, cust_artifact3
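# `group_transactions` is not shown in this section. A sketch under
# assumptions: one aggregated row per customer per day, with items, brands
# and categories collected into sets (matching the item_set field consumed
# by the artifacts above) and the numeric fields reduced; the real
# reduction rules may differ.
def group_transactions(transactions):
    grouped = transactions.groupby(
        [FieldNames.customer_id, FieldNames.transaction_date]).agg({
            FieldNames.item_id: set,
            FieldNames.item_brand: set,
            FieldNames.item_category: set,
            FieldNames.pct_discount: 'mean',
            FieldNames.selling_price: 'sum',
            FieldNames.coupon_discount: 'sum',
            FieldNames.other_discount: 'sum',
            FieldNames.quantity: 'sum',
            FieldNames.transaction_dayofweek: 'first',
        }).reset_index()
    return grouped.rename(columns={FieldNames.item_id: FieldNames.item_set})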