def pnone_processing(path="../data/", train_data_name="train_data", test_data_name="test_data"):
    # load the data
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)
    print df_train.shape
    print df_test.shape

    # load prior orders to compute the per-user pnone feature
    priors, train, orders, products, aisles, departments = preprocessing_data()

    # pnone: fraction of a user's prior orders that contain no reordered item
    df_user_pnone = pd.DataFrame(
        priors.groupby('user_id').apply(
            lambda user_orders: sum(user_orders.groupby('order_id').reordered.sum() == 0)
            / float(user_orders.order_id.unique().shape[0])),
        columns=['pnone'])

    df_train = df_train.merge(df_user_pnone, how='left',
                              left_on=['user_id'], right_index=True)
    df_test = df_test.merge(df_user_pnone, how='left',
                            left_on=['user_id'], right_index=True)

    # save the df_train, df_test
    print "save the processed data"
    common.save_df(df_train, "../data/", train_data_name, index=False)
    common.save_df(df_test, "../data/", test_data_name, index=False)
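# Illustration only (not called by the pipeline): a minimal sketch of the pnone
# computation above on a toy priors frame, assuming the same user_id / order_id /
# reordered columns as the real data. The helper name `_pnone_example` and the toy
# values are hypothetical.
def _pnone_example():
    import pandas as pd
    toy_priors = pd.DataFrame({
        'user_id':   [1, 1, 1, 2],
        'order_id':  [10, 10, 11, 20],
        'reordered': [0, 0, 1, 1],
    })
    pnone = toy_priors.groupby('user_id').apply(
        lambda user_orders: sum(user_orders.groupby('order_id').reordered.sum() == 0)
        / float(user_orders.order_id.unique().shape[0]))
    # user 1: order 10 has no reorders, order 11 has one -> pnone = 0.5
    # user 2: order 20 has a reorder -> pnone = 0.0
    print(pnone)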
def extra_feature_processing(path="../data/", train_data_name="train_data", test_data_name="test_data"):
    # load the data
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)
    print df_train.shape
    print df_test.shape

    priors, train, orders, products, aisles, departments = preprocessing_data()

    # user-product mean of add_to_cart_order
    user_prod_avg_add_to_cart_order = pd.DataFrame(
        priors.groupby(['user_id', 'product_id']).add_to_cart_order.agg(np.mean))
    user_prod_avg_add_to_cart_order.rename(
        columns={'add_to_cart_order': 'user_prod_avg_add_to_cart_order'},
        inplace=True)

    df_train = df_train.merge(user_prod_avg_add_to_cart_order, how='left',
                              left_on=['user_id', 'product_id'], right_index=True)
    df_test = df_test.merge(user_prod_avg_add_to_cart_order, how='left',
                            left_on=['user_id', 'product_id'], right_index=True)

    # order_dow processing (not enabled)
    # priors.groupby(['user_id','product_id']).apply(get_p_dow)

    # load timeline related data (not enabled)
    # timeline_data = get_timeline_data()

    # save the df_train, df_test
    print "save the processed data"
    common.save_df(df_train, "../data/", train_data_name, index=False)
    common.save_df(df_test, "../data/", test_data_name, index=False)
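# Illustration only (not called by the pipeline): a minimal sketch of the
# user_prod_avg_add_to_cart_order feature above on a toy priors frame. The column
# names mirror the real data; the helper name and toy values are hypothetical.
def _avg_add_to_cart_order_example():
    import numpy as np
    import pandas as pd
    toy_priors = pd.DataFrame({
        'user_id':           [1, 1, 1, 2],
        'product_id':        [7, 7, 9, 7],
        'add_to_cart_order': [1, 3, 2, 5],
    })
    avg_pos = toy_priors.groupby(['user_id', 'product_id']).add_to_cart_order.agg(np.mean)
    # (1, 7) -> 2.0 (mean of cart positions 1 and 3), (1, 9) -> 2.0, (2, 7) -> 5.0
    print(avg_pos)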
def get_processed_data(path="../data/", train_data_name="train_data", test_data_name="test_data"):
    if os.path.isfile(path + train_data_name + ".csv.gz") and \
            os.path.isfile(path + test_data_name + ".csv.gz"):
        # load the cached data
        df_train = common.load_df(path, train_data_name)
        df_test = common.load_df(path, test_data_name)
    else:
        print "no data, start processing"
        df_train, df_test = processing_data()
        # save the df_train, df_test
        common.save_df(df_train, "../data/", train_data_name, index=False)
        common.save_df(df_test, "../data/", test_data_name, index=False)
    return df_train, df_test
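# Usage sketch (assumption: later calls reuse the cached csv.gz files instead of
# re-running processing_data):
#
#   df_train, df_test = get_processed_data()
#   df_train, df_test = get_processed_data(path="../data/",
#                                          train_data_name="train_data",
#                                          test_data_name="test_data")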
def extra_reorder_rate_processing(path="../data/", train_data_name="train_data", test_data_name="test_data"):
    # load the data
    df_train = common.load_df(path, train_data_name)
    df_test = common.load_df(path, test_data_name)
    df_all = pd.concat([df_train, df_test])
    assert (df_all.shape[0] == df_train.shape[0] + df_test.shape[0])
    print df_all.shape
    print df_all.keys()

    # the existing cat 50 / 100 columns are wrong, so re-process them here
    # drop the old cat columns first
    drop_cols = [
        'aisle_reorder_rate', 'department_reorder_rate', 'prod_cat_20',
        'prod_cat_50', 'prod_cat_100', 'user_cat_20', 'user_cat_50',
        'user_cat_100', 'user_prod_match_20', 'user_prod_match_50',
        'user_prod_match_100', 'prod_cat_20_reorder_rate',
        'user_cat_20_prod_reorder_rate',
        'user_cat_20_prod_cat_20_reorder_rate', 'prod_cat_50_reorder_rate',
        'user_cat_50_prod_reorder_rate',
        'user_cat_50_prod_cat_50_reorder_rate', 'prod_cat_100_reorder_rate',
        'user_cat_100_prod_reorder_rate',
        'user_cat_100_prod_cat_100_reorder_rate'
    ]
    df_all.drop(drop_cols, axis=1, inplace=True, errors='ignore')
    print df_all.shape

    # load user and product category data
    user_cat_data, prod_cat_data, user_prod_cat_match_data = get_user_prod_cat_data()

    # merge category data
    df_all = df_all.merge(user_cat_data, how='left', on='user_id')
    df_all = df_all.merge(prod_cat_data, how='left', on='product_id')
    df_all = df_all.merge(user_prod_cat_match_data, how='left',
                          on=['user_id', 'product_id'])

    # reorder_rate processing
    # aisle reorder rate
    df_all = df_all.merge(
        pd.DataFrame(df_all.groupby('aisle_id').apply(
            lambda orders: sum(orders.user_prod_no_of_orders - 1)
            / float(sum(orders.user_prod_orders_since_first_ordered))
            if sum(orders.user_prod_orders_since_first_ordered) > 0 else 0.0),
            columns=['aisle_reorder_rate']),
        how='left', left_on='aisle_id', right_index=True)
    assert (df_all.groupby('aisle_id').aisle_reorder_rate.apply(
        lambda x: x.unique().shape[0] == 1).sum() ==
        df_all.aisle_id.unique().shape[0])

    # department reorder rate
    df_all = df_all.merge(
        pd.DataFrame(df_all.groupby('department_id').apply(
            lambda orders: sum(orders.user_prod_no_of_orders - 1)
            / float(sum(orders.user_prod_orders_since_first_ordered))
            if sum(orders.user_prod_orders_since_first_ordered) > 0 else 0.0),
            columns=['department_reorder_rate']),
        how='left', left_on='department_id', right_index=True)
    assert (df_all.groupby('department_id').department_reorder_rate.apply(
        lambda x: x.unique().shape[0] == 1).sum() ==
        df_all.department_id.unique().shape[0])

    nt_list = [20, 50, 100]
    for num_topics in nt_list:
        # prod_cat_reorder_rate
        prod_cat_str = 'prod_cat_' + str(num_topics)
        df_all = df_all.merge(
            pd.DataFrame(df_all.groupby(prod_cat_str).apply(
                lambda orders: sum(orders.user_prod_no_of_orders - 1)
                / float(sum(orders.user_prod_orders_since_first_ordered))
                if sum(orders.user_prod_orders_since_first_ordered) > 0 else 0.0),
                columns=[prod_cat_str + '_reorder_rate']),
            how='left', left_on=prod_cat_str, right_index=True)
        assert (df_all.groupby(prod_cat_str)[prod_cat_str + '_reorder_rate'].apply(
            lambda x: x.unique().shape[0] == 1).sum() ==
            df_all[prod_cat_str].unique().shape[0])

        # user_cat_prod_reorder_rate:
        # for a given product, the reorder rate over all users who belong to a particular category
        user_cat_str = 'user_cat_' + str(num_topics)
        df_all = df_all.merge(
            pd.DataFrame(df_all.groupby([user_cat_str, 'product_id']).apply(
                lambda orders: sum(orders.user_prod_no_of_orders - 1)
                / float(sum(orders.user_prod_orders_since_first_ordered))
                if sum(orders.user_prod_orders_since_first_ordered) > 0 else 0.0),
                columns=[user_cat_str + '_prod_reorder_rate']),
            how='left', left_on=[user_cat_str, 'product_id'], right_index=True)
        assert (df_all.groupby([user_cat_str, 'product_id'])[user_cat_str + '_prod_reorder_rate'].apply(
            lambda x: x.unique().shape[0] == 1).sum() ==
            len(df_all.groupby([user_cat_str, 'product_id']).groups.keys()))

        # user_cat_prod_cat_reorder_rate
        df_all = df_all.merge(
            pd.DataFrame(df_all.groupby([user_cat_str, prod_cat_str]).apply(
                lambda orders: sum(orders.user_prod_no_of_orders - 1)
                / float(sum(orders.user_prod_orders_since_first_ordered))
                if sum(orders.user_prod_orders_since_first_ordered) > 0 else 0.0),
                columns=[user_cat_str + '_' + prod_cat_str + '_reorder_rate']),
            how='left', left_on=[user_cat_str, prod_cat_str], right_index=True)
        assert (df_all.groupby([user_cat_str, prod_cat_str])[user_cat_str + '_' + prod_cat_str + '_reorder_rate'].apply(
            lambda x: x.unique().shape[0] == 1).sum() ==
            len(df_all.groupby([user_cat_str, prod_cat_str]).groups.keys()))

    # set dtypes
    category_cols = [
        'eval_set', 'prod_cat_20', 'prod_cat_50', 'prod_cat_100',
        'user_cat_20', 'user_cat_50', 'user_cat_100'
    ]
    for col in category_cols:
        df_all[col] = df_all[col].astype('category')

    # use float32 to save memory
    rate_cols = [
        'aisle_reorder_rate', 'department_reorder_rate',
        'prod_cat_20_reorder_rate', 'user_cat_20_prod_reorder_rate',
        'user_cat_20_prod_cat_20_reorder_rate', 'prod_cat_50_reorder_rate',
        'user_cat_50_prod_reorder_rate',
        'user_cat_50_prod_cat_50_reorder_rate', 'prod_cat_100_reorder_rate',
        'user_cat_100_prod_reorder_rate',
        'user_cat_100_prod_cat_100_reorder_rate'
    ]
    for col in rate_cols:
        df_all[col] = df_all[col].astype('float32')
    print df_all.shape

    # split df_all back into train and test
    # df_train = df_all[df_all.eval_set=='train'].drop(['add_to_cart_order'], axis=1)
    # df_test = (df_all[df_all.eval_set=='test']).drop(['add_to_cart_order','reordered'], axis=1)
    # copy to avoid SettingWithCopyWarning on the astype assignment below
    df_train = df_all[df_all.eval_set == 'train'].copy()
    df_train['reordered'] = df_train['reordered'].astype(np.uint8)
    df_test = (df_all[df_all.eval_set == 'test']).drop(['reordered'], axis=1)
    print df_train.shape
    print df_test.shape

    # save the df_train, df_test
    print "save the processed data"
    common.save_df(df_train, "../data/", train_data_name, index=False)
    common.save_df(df_test, "../data/", test_data_name, index=False)
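# Illustration only (not called by the pipeline): a minimal sketch of the group
# reorder rate used repeatedly above, i.e.
#   sum(user_prod_no_of_orders - 1) / sum(user_prod_orders_since_first_ordered)
# computed per group (aisle, department, or LDA category) with a zero-denominator
# guard. The toy frame and the helper name are hypothetical.
def _group_reorder_rate_example():
    import pandas as pd
    toy = pd.DataFrame({
        'aisle_id':                             [1, 1, 2],
        'user_prod_no_of_orders':               [3, 1, 2],  # times the user bought the product
        'user_prod_orders_since_first_ordered': [4, 2, 0],  # chances the user had to reorder it
    })
    rate = toy.groupby('aisle_id').apply(
        lambda orders: sum(orders.user_prod_no_of_orders - 1)
        / float(sum(orders.user_prod_orders_since_first_ordered))
        if sum(orders.user_prod_orders_since_first_ordered) > 0 else 0.0)
    # aisle 1: (2 + 0) / (4 + 2) = 0.333..., aisle 2: zero denominator -> 0.0
    print(rate)  # Series indexed by aisle_id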
up_lda = LdaMulticore(corpus=user_prods, id2word=id2prod, workers=2,
                      num_topics=num_topics, minimum_probability=1e-8,
                      chunksize=2000, passes=1)
model_fn = model_fn_prefix + str(num_topics) + ".lda"
print "save model into " + model_fn
up_lda.save(model_fn)
# debug: reload the model
# up_lda = LdaModel.load(model_fn)

df_user_cat = get_user_cat(up_lda, df_user_prods)
user_cat_name = user_cat_name_prefix + str(num_topics)
print "save " + user_cat_name
common.save_df(df_user_cat, "../data/", user_cat_name, index=False)

df_prod_cat = get_prod_cat(up_lda, products)
prod_cat_name = prod_cat_name_prefix + str(num_topics)
print "save " + prod_cat_name
common.save_df(df_prod_cat, "../data/", prod_cat_name, index=False)

print time.ctime()

print "finished"
print time.ctime()
print time.time() - start_time
print "In debug mode, None-debug mode command : python -O " + __file__ + "\n\n" # the following is self test code for 10 users # load the data ten_user_orders = pd.read_csv("/tmp/ten_user_orders.csv.gz", compression='gzip') # test for get_user_corpus ten_user_corpus = get_user_corpus(ten_user_orders) # check get_user_corpus result print "compare user_corpus" print ten_user_corpus[ten_user_corpus.user_id==202277].user_corpus.apply(lambda row: list(zip(*row)[0])) print ten_user_orders[ten_user_orders.user_id==202277].product_id.sort_values().tolist() # save corpus common.save_df(ten_user_corpus,"/tmp/", "ten_user_corpus", index=False) # load corpus back load_ten_user_corpus = common.load_df("/tmp/", "ten_user_corpus") print load_ten_user_corpus else: ### formal code IDIR = "../input/" priors, train, orders, products, aisles, departments = common.load_raw_data(IDIR) # only build corpus for priors which is used for cross-validation print('add order info to priors') orders = orders.set_index('order_id', inplace=True, drop=False) priors = priors.join(orders, on='order_id', rsuffix='_') priors.drop('order_id_', inplace=True, axis=1) if "priors" in objects: