def attach_user_act(X, month):
    """Append per-user activity counts aggregated over months 1..month-1 to X.

    Loads ``user_act_counts_month_1<month-1>.csv`` (and its ``_col_names``
    companion) from ``paths.my_path``, matches rows of ``X`` on "uid" and
    concatenates every column after the first to ``X`` under the name
    ``<original>_user_all_month``.  Users missing from the file receive
    all-zero features.  ``X`` is modified in place; nothing is returned.
    """
    col_names = np.genfromtxt(
        paths.my_path + 'user_act_counts_month_{0}{1}_col_names.csv'.format(1, month - 1),
        delimiter=',', dtype=str)
    # Number of columns after the first one (the first is presumably the uid).
    n_features = col_names.shape[1] - 1

    full_act = Matrix(
        np.genfromtxt(
            paths.my_path + 'user_act_counts_month_{0}{1}.csv'.format(1, month - 1),
            delimiter=',', dtype=float),
        col_names[0, :],
        col_formats=["%s"] * n_features)

    # uid -> feature vector
    act_by_uid = HashSet()
    for row in xrange(full_act.ndata):
        if row % 100000 == 0:
            print(row)
        act_by_uid.set(full_act.get_cell(row, "uid"), full_act.matrix[row, 1:])

    X_full_act = np.zeros((X.ndata, n_features))
    for row in xrange(X.ndata):
        if row % 100000 == 0:
            print(row)
        X_full_act[row, :] = act_by_uid.get(
            X.get_cell(row, "uid"), np.zeros(n_features))

    new_names = ['{0}_user_all_month'.format(name) for name in col_names[0, 1:]]
    X.cat_col(X_full_act, new_names, ["%s"] * len(new_names))
def regen_feature_by_month(func, month, feature_name):
    """Reload a month's checkpoint, drop one feature and hand it to ``func``.

    Reads ``col_names<month>.csv`` and ``X<month>.csv`` from
    ``paths.my_path``, removes ``feature_name`` from the loaded matrix and
    then calls ``func(X, month, feature_name)``.
    """
    col_names = np.genfromtxt(
        paths.my_path + 'col_names{0}.csv'.format(month),
        delimiter=',', dtype=str)
    col_number = col_names.shape[1]
    raw = np.genfromtxt(
        paths.my_path + 'X{0}.csv'.format(month),
        delimiter=',', dtype=str)
    X = Matrix(raw, list(col_names[0, :]), ["%s"] * col_number)
    X.drop(feature_name)
    func(X, month, feature_name)
def attach_full_act_history(X, month):
    """Append per-(user, shop) activity counts of every earlier month to X.

    For each history month ``k`` in ``month-1 .. 1`` the file
    ``act_counts_month_<k>.csv`` is loaded, matched against ``X`` on
    ("uid", "mid") and its columns from index 3 onwards are concatenated to
    ``X`` as ``<name>_month_<month - k>`` (i.e. suffixed with the distance in
    months).  Pairs missing from a history file get zeros.

    Afterwards ``7 - month`` all-zero groups are appended so that every month
    ends up with the same number of history groups.  They are named with the
    distances ``month .. 6``.  The previous code used ``month - k`` here,
    which duplicated the names of the real history columns.

    Relies on ``HashSet.get(key, default)`` storing the default on a miss
    (otherwise the nested ``set`` calls below would be lost) - presumably
    setdefault-like; confirm against HashSet.

    Raises:
        ValueError: padding is required but no history month exists
            (``month <= 1``), so there are no column names to pad with.
    """
    col_names_history = None
    for k in xrange(month - 1, 0, -1):
        print(k)
        col_names_history = np.genfromtxt(
            paths.my_path + 'act_counts_month_{0}_col_names.csv'.format(k),
            delimiter=',', dtype=str)
        history = Matrix(
            np.genfromtxt(
                paths.my_path + 'act_counts_month_{0}.csv'.format(k),
                delimiter=',', dtype=float),
            col_names_history[0, :],
            ["%s"] * col_names_history.shape[1])
        n_features = col_names_history.shape[1] - 3

        # uid -> mid -> feature vector for month k
        history_hash = HashSet()
        for i in xrange(history.ndata):
            if i % 100000 == 0:
                print(i)
            uid = history.get_cell(i, "uid")
            mid = history.get_cell(i, "mid")
            history_hash.get(uid, HashSet()).set(mid, history.matrix[i, 3:])

        X_history = np.zeros((X.ndata, n_features))
        for i in xrange(X.ndata):
            if i % 100000 == 0:
                print(i)
            uid = X.get_cell(i, "uid")
            mid = X.get_cell(i, "mid")
            X_history[i, :] = history_hash.get(uid, HashSet()).get(
                mid, np.zeros(n_features))

        col_names = ['{0}_month_{1}'.format(name, month - k)
                     for name in col_names_history[0, 3:]]
        X.cat_col(X_history, col_names, ["%s"] * len(col_names))

    padding_groups = 7 - month
    if padding_groups > 0 and col_names_history is None:
        raise ValueError(
            "month {0} has no history month to take column names from".format(month))

    for k in xrange(padding_groups):
        # Distances month, month + 1, ... continue where the real history
        # (distances 1 .. month - 1) stopped, so names never collide.
        col_names = ['{0}_month_{1}'.format(name, month + k)
                     for name in col_names_history[0, 3:]]
        X.cat_col(np.zeros((X.ndata, len(col_names))), col_names,
                  ["%s"] * len(col_names))
def append_feature_by_month(func, month):
    """Load a month's shop-month checkpoint and pass it to ``func(X, month)``.

    The checkpoint is ``X_<month>_1<month-1>_shop_month.csv`` with its
    ``_col_names`` companion, both read from ``paths.my_path``.
    """
    col_names = np.genfromtxt(
        paths.my_path + 'X_{0}_{1}{2}_shop_month_col_names.csv'.format(month, 1, month - 1),
        delimiter=',', dtype=str)
    col_number = col_names.shape[1]
    data = np.genfromtxt(
        paths.my_path + 'X_{0}_{1}{2}_shop_month.csv'.format(month, 1, month - 1),
        delimiter=',', dtype=str)
    X = Matrix(data, list(col_names[0, :]), ["%s"] * col_number)
    func(X, month)
def drop_multiple(feature_names, month):
    """Drop several features from a month's checkpoint and save it again.

    Loads ``col_names<month>.csv`` / ``X<month>.csv`` from ``paths.my_path``,
    drops every name in ``feature_names`` in order and writes the result back
    with ``X.check_point(month)``.
    """
    col_names = np.genfromtxt(
        paths.my_path + 'col_names{0}.csv'.format(month),
        delimiter=',', dtype=str)
    col_number = col_names.shape[1]
    data = np.genfromtxt(
        paths.my_path + 'X{0}.csv'.format(month),
        delimiter=',', dtype=str)
    X = Matrix(data, list(col_names[0, :]), ["%s"] * col_number)
    for feature in feature_names:
        X.drop(feature)
    X.check_point(month)
def drop(feature_name, month, checkpoint=True):
    """Drop a single feature from a month's checkpoint.

    Loads ``col_names<month>.csv`` / ``X<month>.csv`` from ``paths.my_path``
    and removes ``feature_name``.  The result is only written back (via
    ``X.check_point(month)``) when ``checkpoint`` is true.
    """
    col_names = np.genfromtxt(
        paths.my_path + 'col_names{0}.csv'.format(month),
        delimiter=',', dtype=str)
    col_number = col_names.shape[1]
    data = np.genfromtxt(
        paths.my_path + 'X{0}.csv'.format(month),
        delimiter=',', dtype=str)
    X = Matrix(data, list(col_names[0, :]), ["%s"] * col_number)
    X.drop(feature_name)
    if not checkpoint:
        return
    X.check_point(month)
def gen_by_month(target_file, month, since=None):
    """Build the full feature matrix for one month and checkpoint it.

    Reads ``offline_train_test_<month>.csv`` from ``paths.ccf_path``, runs
    every feature generator on it in a fixed order and saves the result with
    ``X.check_point(target_file)``.  Months below 7 carry an extra "date"
    column and additionally get a label via ``gen_label``.

    ``since`` is accepted but not used.
    """
    has_date = month < 7
    columns = ["uid", "mid", "cid", "dis_rate", "dist", "date_rec"]
    if has_date:
        columns.append("date")

    X = Matrix(
        np.genfromtxt(
            paths.ccf_path + 'offline_train_test_' + str(month) + '.csv',
            delimiter=',', dtype=str),
        columns,
        ["%s"] * len(columns))

    print("generating month {0}".format(month))

    print("gen_user_shop_features")
    gen_user_shop_features(X, month)

    print("gen_no_penalty_user_shop_features")
    gen_no_penalty_user_shop_features(X, month)

    print("gen_basic_features")
    gen_basic_features(X)

    print("attach_full_act_history")
    attach_full_act_history(X, month)

    print("attach_user_act_history")
    attach_user_act_history(X, month)

    print("attach_full_act")
    attach_full_act(X, month)

    print("attach_user_act")
    attach_user_act(X, month)

    if has_date:
        print("gen_label")
        gen_label(X)

    X.check_point(target_file)
def gen_act_counts_by_month(month):
    """Count coupon/purchase activity per (user, shop) pair for one month.

    Reads ``offline_1_month<month>.csv`` and ``online_1_month<month>.csv``
    from ``paths.ccf_path`` and tallies three counters per (uid, mid) pair and
    per uid:

        index 0 - coupon received but no purchase date ("unused_coupon")
        index 1 - purchase without a coupon id         ("buy_without_coupon")
        index 2 - purchase with a coupon id            ("use_coupon")

    Online rows only count as a purchase when "act" is '1'.  The counters and
    the ratios derived from them are written, one row per pair, to the
    checkpoint ``act_counts_month_<month>``.

    Relies on ``HashSet.get(key, default)`` storing the default on a miss so
    the in-place increments persist - presumably setdefault-like; confirm
    against HashSet.
    """
    UNUSED, BUY, USE = 0, 1, 2

    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'offline_1_month{0}.csv'.format(month),
                      delimiter=',', dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s"] * 7)
    online = Matrix(
        np.genfromtxt(paths.ccf_path + 'online_1_month{0}.csv'.format(month),
                      delimiter=',', dtype=str),
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s"] * 7)

    user_hash_set = HashSet()   # uid -> 3 counters
    full_hash_set = HashSet()   # uid -> mid -> 3 counters

    def record(uid, mid, slot):
        """Increment counter ``slot``; return True the first time a pair is seen."""
        pair_counts = full_hash_set.get(uid, HashSet()).get(
            mid, np.zeros(3, dtype=float))
        user_counts = user_hash_set.get(uid, np.zeros(3, dtype=float))
        first_seen = (pair_counts[0] == 0 and pair_counts[1] == 0
                      and pair_counts[2] == 0)
        pair_counts[slot] += 1
        user_counts[slot] += 1
        return first_seen

    def divide(x, y):
        if y == 0:
            return 0
        else:
            return x * 1.0 / y

    ndata = 0  # number of distinct (uid, mid) pairs == rows of the output

    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        cid_str = offline.get_cell(i, "cid")
        date_str = offline.get_cell(i, "date")
        date_rec_str = offline.get_cell(i, "date_rec")
        if date_str != 'null':
            slot = USE if cid_str != 'null' else BUY
        elif date_rec_str != 'null':
            slot = UNUSED
        else:
            continue
        if record(uid_str, mid_str, slot):
            ndata += 1

    for i in xrange(online.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = online.get_cell(i, "uid")
        mid_str = online.get_cell(i, "mid")
        cid_str = online.get_cell(i, "cid")
        date_str = online.get_cell(i, "date")
        date_rec_str = online.get_cell(i, "date_rec")
        act_str = online.get_cell(i, "act")
        if date_str != 'null' and act_str == '1':
            slot = USE if cid_str != 'null' else BUY
        elif date_rec_str != 'null':
            assert act_str != '0'
            slot = UNUSED
        else:
            continue
        if record(uid_str, mid_str, slot):
            ndata += 1

    col_names = [
        "uid", "mid",
        "unused_coupon", "buy_without_coupon", "use_coupon",
        "total_coupon", "total_buy",
        "act_ratio_0", "act_ratio_1", "act_ratio_2",
        "used_ratio", "unused_ratio", "buy_with_coupon_ratio", "buy_without_coupon_ratio",
        "unused_coupon_shop_ratio", "buy_without_coupon_shop_ratio", "use_coupon_shop_ratio"]
    full_table = Matrix(np.zeros((ndata, len(col_names))), col_names,
                        col_formats=["%s"] * len(col_names))

    row_index = 0
    for uid in full_hash_set.get_keys():
        user_act_counts = user_hash_set.get(uid)
        user_total_unused_coupon = user_act_counts[UNUSED]
        user_total_buy_without_coupon = user_act_counts[BUY]
        user_total_use_coupon = user_act_counts[USE]
        shops = full_hash_set.get(uid)
        for mid in shops.get_keys():
            if row_index % 100000 == 0:
                print(row_index)
            acts = shops.get(mid)
            use_coupon = acts[USE]
            unused_coupon = acts[UNUSED]
            buy_without_coupon = acts[BUY]
            total_acts = use_coupon + unused_coupon + buy_without_coupon
            total_coupon = use_coupon + unused_coupon
            total_buy = use_coupon + buy_without_coupon
            row = [
                uid, mid, unused_coupon, buy_without_coupon, use_coupon,
                total_coupon, total_buy,
                divide(unused_coupon, total_acts),
                divide(buy_without_coupon, total_acts),
                divide(use_coupon, total_acts),
                divide(use_coupon, total_coupon),
                divide(unused_coupon, total_coupon),
                divide(use_coupon, total_buy),
                divide(buy_without_coupon, total_buy),
                # share of the user's own totals that falls on this shop
                divide(unused_coupon, user_total_unused_coupon),
                divide(buy_without_coupon, user_total_buy_without_coupon),
                divide(use_coupon, user_total_use_coupon),
            ]
            full_table.set_row(row_index, np.array(row))
            row_index += 1

    full_table.check_point("act_counts_month_{0}".format(month))
def calc_give_after_purchase():
    """Attach per-shop "coupon given after purchase" counters to the offline set.

    Reads ``ccf_offline_stage1_train.csv`` from ``paths.ccf_path``.  A coupon
    row counts as "given after purchase" when its receive date equals one of
    the dates on which the same user bought at the same shop.  Four per-shop
    counters are appended to every row and the result is checkpointed as
    ``give_after_purchase_statistics``:

        shop_coupon                          - coupon rows of the shop
        shop_coupon_used                     - ... that have a purchase date
        shop_coupon_give_after_purchase      - ... given after a purchase
        shop_coupon_give_after_purchase_used - ... given after a purchase and used

    Per-user and per-(user, shop) variants of these counters were sketched in
    earlier commented-out code but are not computed here.

    Relies on ``HashSet.get(key, default)`` storing the default on a miss so
    the appended dates persist - presumably setdefault-like; confirm against
    HashSet.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'ccf_offline_stage1_train.csv',
                      delimiter=',', dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s"] * 7)

    # uid -> mid -> list of purchase dates
    user_shop_coupon_buy = HashSet()
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_str = offline.get_cell(i, "date")
        if date_str == 'null':
            continue
        user_shop_coupon_buy.get(uid_str, HashSet()).get(mid_str, []).append(date_str)

    shop_coupon = {}
    shop_give_after_purchase = {}
    shop_give_after_purchase_used = {}
    shop_coupon_used = {}

    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_str = offline.get_cell(i, "date")
        cid_str = offline.get_cell(i, "cid")
        date_rec_str = offline.get_cell(i, "date_rec")
        date_list = user_shop_coupon_buy.get(uid_str, HashSet()).get(mid_str, [])
        if cid_str == 'null':
            continue

        was_used = date_str != 'null'
        shop_coupon[mid_str] = shop_coupon.get(mid_str, 0) + 1
        if date_rec_str in date_list:
            shop_give_after_purchase[mid_str] = \
                shop_give_after_purchase.get(mid_str, 0) + 1
            if was_used:
                shop_give_after_purchase_used[mid_str] = \
                    shop_give_after_purchase_used.get(mid_str, 0) + 1
        if was_used:
            shop_coupon_used[mid_str] = shop_coupon_used.get(mid_str, 0) + 1

    give_after_purchase_statistics_names = [
        "shop_coupon", "shop_coupon_used",
        "shop_coupon_give_after_purchase", "shop_coupon_give_after_purchase_used"]
    give_after_purchase_statistics = np.zeros(
        (offline.ndata, len(give_after_purchase_statistics_names)))

    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        mid_str = offline.get_cell(i, "mid")
        give_after_purchase_statistics[i, :] = np.array([
            shop_coupon.get(mid_str, 0),
            shop_coupon_used.get(mid_str, 0),
            shop_give_after_purchase.get(mid_str, 0),
            shop_give_after_purchase_used.get(mid_str, 0)])

    offline.cat_col(give_after_purchase_statistics,
                    give_after_purchase_statistics_names,
                    ["%s"] * len(give_after_purchase_statistics_names))
    offline.check_point("give_after_purchase_statistics")
def aggregate_shops_months(since, to):
    """Aggregate per-user activity counts over the months ``since..to``.

    For every month ``k`` in the inclusive range the per-user table
    ``user_act_counts_month_<k>.csv`` is read from ``paths.my_path`` and its
    "unused_coupon", "buy_without_coupon" and "use_coupon" cells are added
    into a (months x 3) array per user.  One output row per user holds the
    totals, the derived ratios and whatever ``calc_trend_and_max_avg_var``
    returns for the monthly array; the table is checkpointed as
    ``user_act_counts_month_<since><to>``.

    Uses the module-level helpers ``divide`` and
    ``calc_trend_and_max_avg_var`` (defined outside this block).  Relies on
    ``HashSet.get(key, default)`` storing the default on a miss - presumably
    setdefault-like; confirm against HashSet.
    """
    n_months = to + 1 - since

    # uid -> array[month offset, (unused, buy_without, use)]
    user_act_counts = HashSet()
    for k in xrange(since, to + 1):
        col_names = np.genfromtxt(
            paths.my_path + 'user_act_counts_month_{0}_col_names.csv'.format(k),
            delimiter=',', dtype=str)
        user_table = Matrix(
            np.genfromtxt(paths.my_path + 'user_act_counts_month_{0}.csv'.format(k),
                          delimiter=',', dtype=float),
            col_names[0, :], 0.0)
        offset = k - since
        for i in xrange(user_table.ndata):
            uid = user_table.get_cell(i, "uid")
            acts = user_act_counts.get(uid, np.zeros((n_months, 3)))
            acts[offset, 0] += user_table.get_cell(i, "unused_coupon")
            acts[offset, 1] += user_table.get_cell(i, "buy_without_coupon")
            acts[offset, 2] += user_table.get_cell(i, "use_coupon")

    user_col_names = [
        "uid", "unused_coupon", "buy_without_coupon", "use_coupon", "total_coupon", "total_buy",
        "act_ratio_0", "act_ratio_1", "act_ratio_2",
        "used_ratio", "unused_ratio", "buy_with_coupon_ratio", "buy_without_coupon_ratio",
        "trend_unused_coupon", "trend_buy_without_coupon", "trend_use_coupon", "trend_total_coupon", "trend_total_buy",
        "trend_act_ratio_0", "trend_act_ratio_1", "trend_act_ratio_2",
        "trend_used_ratio", "trend_unused_ratio", "trend_buy_with_coupon_ratio", "trend_buy_without_coupon_ratio",
        "max_unused_coupon_month", "max_buy_without_coupon_month", "max_use_coupon_month", "max_total_coupon_month", "max_total_buy_month",
        "avg_unused_coupon_month", "avg_buy_without_coupon_month", "avg_use_coupon_month", "avg_total_coupon_month", "avg_total_buy_month",
        "var_unused_coupon_month", "var_buy_without_coupon_month", "var_use_coupon_month", "var_total_coupon_month", "var_total_buy_month"]

    ndata = len(user_act_counts.get_keys())
    user_act_table = Matrix(
        np.zeros((ndata, len(user_col_names))),
        user_col_names,
        ["%s"] * len(user_col_names))

    row_index = 0
    for uid in user_act_counts.get_keys():
        acts = user_act_counts.get(uid)
        unused_coupon = np.sum(acts[:, 0])
        buy_without_coupon = np.sum(acts[:, 1])
        use_coupon = np.sum(acts[:, 2])
        total_coupon = unused_coupon + use_coupon
        total_buy = buy_without_coupon + use_coupon
        total_acts = unused_coupon + buy_without_coupon + use_coupon

        totals_and_ratios = np.array([
            uid, unused_coupon, buy_without_coupon, use_coupon, total_coupon, total_buy,
            divide(unused_coupon, total_acts),        # act_ratio_0
            divide(buy_without_coupon, total_acts),   # act_ratio_1
            divide(use_coupon, total_acts),           # act_ratio_2
            divide(use_coupon, total_coupon),         # used_ratio
            divide(unused_coupon, total_coupon),      # unused_ratio
            divide(use_coupon, total_buy),            # buy_with_coupon_ratio
            divide(buy_without_coupon, total_buy)])   # buy_without_coupon_ratio
        user_act_table.set_row(
            row_index,
            np.hstack((totals_and_ratios, calc_trend_and_max_avg_var(acts))))
        row_index += 1

    user_act_table.check_point("user_act_counts_month_{0}{1}".format(since, to))
def aggregate_shops_by_month(month):
    """Collapse one month's per-(user, shop) counts into per-user statistics.

    Reads ``act_counts_month_<month>.csv`` (plus its ``_col_names`` file)
    from ``paths.my_path``, groups the five count columns by "uid" and writes
    one row per user to the checkpoint ``user_act_counts_month_<month>``.
    Each row holds the totals, activity ratios, shop counts and the
    max / mean / variance over the user's shops, plus mean / variance /
    median / min restricted to shops with a nonzero count.

    Fix: ``max_total_buy_shop`` is now the maximum over the per-shop
    ``total_buy`` values.  It used to be ``np.max`` of the already-summed
    scalar, which simply repeated the ``total_buy`` column.

    Relies on ``HashSet.get(key, default)`` storing the default on a miss so
    the appended values persist - presumably setdefault-like; confirm against
    HashSet.
    """
    def divide(x, y):
        if y == 0:
            return 0
        else:
            return x * 1.0 / y

    def nonzero_stat(reducer, values, total):
        """Apply ``reducer`` to the nonzero entries, or give 0 if ``total`` <= 0."""
        if total > 0:
            return reducer(values[np.nonzero(values)])
        return 0

    count_fields = ("unused_coupon", "buy_without_coupon", "use_coupon",
                    "total_coupon", "total_buy")

    col_names = np.genfromtxt(
        paths.my_path + 'act_counts_month_{0}_col_names.csv'.format(month),
        delimiter=',', dtype=str)
    full_act_counts = Matrix(
        np.genfromtxt(paths.my_path + 'act_counts_month_{0}.csv'.format(month),
                      delimiter=',', dtype=float),
        col_names[0, :],
        col_formats=["%s"] * col_names.shape[1])

    # uid -> {"<field>s": [one value per shop row]}
    user_act_counts = HashSet()
    for i in xrange(full_act_counts.ndata):
        uid_str = full_act_counts.get_cell(i, "uid")
        user_acts = user_act_counts.get(
            uid_str, {
                "unused_coupons": [],
                "buy_without_coupons": [],
                "use_coupons": [],
                "total_coupons": [],
                "total_buys": []
            })
        for field in count_fields:
            user_acts[field + "s"].append(full_act_counts.get_cell(i, field))

    all_users = user_act_counts.get_keys()

    # NOTE(review): "act_ratio2", "avg_total_coupon_nonzero" and
    # "avg_total_buy_nonzero" break the naming pattern of their neighbours
    # ("act_ratio_2", "..._shop_nonzero").  They are left unchanged because
    # existing checkpoints / downstream lookups may depend on these names.
    user_col_names = [
        "uid",
        "unused_coupon", "buy_without_coupon", "use_coupon", "total_coupon", "total_buy",
        "act_ratio_0", "act_ratio_1", "act_ratio2",
        "used_ratio", "unused_ratio", "buy_with_coupon_ratio", "buy_without_coupon_ratio",
        "unused_coupon_shop_number", "buy_without_coupon_shop_number", "use_coupon_shop_number", "pure_unused_coupon_shop_number", "pure_no_unused_coupon_shop_number",
        "max_unused_coupon_shop", "max_buy_without_coupon_shop", "max_use_coupon_shop", "max_total_coupon_shop", "max_total_buy_shop",
        "avg_unused_coupon_shop", "avg_buy_without_coupon_shop", "avg_use_coupon_shop", "avg_total_coupon_shop", "avg_total_buy_shop",
        "avg_unused_coupon_shop_nonzero", "avg_buy_without_coupon_shop_nonzero", "avg_use_coupon_shop_nonzero", "avg_total_coupon_nonzero", "avg_total_buy_nonzero",
        "var_unused_coupon_shop", "var_buy_without_coupon_shop", "var_use_coupon_shop", "var_total_coupon_shop", "var_total_buy_shop",
        "var_unused_coupon_shop_nonzero", "var_buy_without_coupon_shop_nonzero", "var_use_coupon_shop_nonzero", "var_total_coupon_shop_nonzero", "var_total_buy_shop_nonzero",
        "mid_unused_coupon_shop_nonzero", "mid_buy_without_coupon_shop_nonzero", "mid_use_coupon_shop_nonzero", "mid_total_coupon_shop_nonzero", "mid_total_buy_shop_nonzero",
        "min_unused_coupon_shop_nonzero", "min_buy_without_coupon_shop_nonzero", "min_use_coupon_shop_nonzero", "min_total_coupon_shop_nonzero", "min_total_buy_shop_nonzero"]
    user_table = Matrix(
        np.zeros((len(all_users), len(user_col_names))),
        user_col_names,
        col_formats=["%s"] * len(user_col_names))

    row_index = 0
    for uid in user_act_counts.get_keys():
        if row_index % 10000 == 0:
            print(row_index)
        user_acts = user_act_counts.get(uid)

        # One array per count field, in the order of ``count_fields``.
        per_shop = [np.array(user_acts[field + "s"]) for field in count_fields]
        unused_coupons, buy_without_coupons, use_coupons = per_shop[:3]
        totals = [np.sum(values) for values in per_shop]
        unused_coupon, buy_without_coupon, use_coupon, total_coupon, total_buy = totals
        total_acts = unused_coupon + buy_without_coupon + use_coupon

        ratios = [
            divide(unused_coupon, total_acts),        # act_ratio_0
            divide(buy_without_coupon, total_acts),   # act_ratio_1
            divide(use_coupon, total_acts),           # act_ratio_2
            divide(use_coupon, total_coupon),         # used_ratio
            divide(unused_coupon, total_coupon),      # unused_ratio
            divide(use_coupon, total_buy),            # buy_with_coupon_ratio
            divide(buy_without_coupon, total_buy)]    # buy_without_coupon_ratio

        n_shops = len(unused_coupons)
        unused_coupon_shop_number = np.sum(unused_coupons > 0)
        buy_without_coupon_shop_number = np.sum(buy_without_coupons > 0)
        use_coupon_shop_number = np.sum(use_coupons > 0)
        shop_numbers = [
            unused_coupon_shop_number,
            buy_without_coupon_shop_number,
            use_coupon_shop_number,
            n_shops - use_coupon_shop_number,       # pure_unused_coupon_shop_number
            n_shops - unused_coupon_shop_number]    # pure_no_unused_coupon_shop_number

        row = [uid] + totals + ratios + shop_numbers
        # Plain statistics over all of the user's shops.
        row += [np.max(values) for values in per_shop]
        row += [np.mean(values) for values in per_shop]
        # Mean over shops with a nonzero count sits between mean and variance.
        row += [nonzero_stat(np.mean, values, total)
                for values, total in zip(per_shop, totals)]
        row += [np.var(values) for values in per_shop]
        for reducer in (np.var, np.median, np.min):
            row += [nonzero_stat(reducer, values, total)
                    for values, total in zip(per_shop, totals)]

        user_table.set_row(row_index, np.array(row))
        row_index += 1

    assert row_index == len(all_users)
    user_table.check_point("user_act_counts_month_{0}".format(month))
def calc_first_used(month):
    """Attach per-user coupon usage and "first coupon per shop" statistics.

    Reads ``offline_train_test_<month>.csv`` from ``paths.ccf_path``.  Rows
    without a coupon id are ignored when counting.  For every user it counts
    the coupons received, the coupons that have a purchase date, the number
    of shops the user got a coupon from, and for how many of those shops the
    earliest-received coupon (smallest "date_rec" string) was used.  Six
    columns are appended to every row and the result is checkpointed as
    ``fisrt_use_<month>`` (the spelling of the checkpoint name is kept
    because existing files use it).

    Fix: ``user_first_coupon_used_ratio`` is now 0 for users without any
    coupon.  Previously the variable was left unassigned in that case, so the
    row reused the previous row's ratio, or raised NameError on the first row.

    Relies on ``HashSet.get(key, default)`` storing the default on a miss -
    presumably setdefault-like; confirm against HashSet.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'offline_train_test_{0}.csv'.format(month),
                      delimiter=',', dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s"] * 7)

    user_coupon = HashSet()        # uid -> coupons received
    user_total_used = HashSet()    # uid -> coupons with a purchase date
    user_first_coupon = HashSet()  # uid -> mid -> (earliest date_rec, used flag)

    for i in xrange(offline.ndata):
        if i % 10000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_str = offline.get_cell(i, "date")
        date_rec_str = offline.get_cell(i, "date_rec")
        cid_str = offline.get_cell(i, "cid")
        if cid_str == 'null':
            continue

        used = 1 if date_str != 'null' else 0
        user_coupon.add_one(uid_str)

        first_by_shop = user_first_coupon.get(uid_str, HashSet())
        if not first_by_shop.has(mid_str):
            first_by_shop.set(mid_str, (date_rec_str, used))
        else:
            old_first_date_rec, _ = first_by_shop.get(mid_str)
            # Dates are compared as strings, exactly as stored in the file.
            if date_rec_str < old_first_date_rec:
                first_by_shop.set(mid_str, (date_rec_str, used))

        if used:
            user_total_used.add_one(uid_str)

    col_names = [
        "user_coupon", "user_coupon_used", "user_first_coupon",
        "user_first_coupon_used", "user_coupon_used_ratio",
        "user_first_coupon_used_ratio"
    ]
    statistics = np.zeros((offline.ndata, len(col_names)))

    for i in xrange(offline.ndata):
        if i % 10000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")

        user_first_counter = 0
        user_first_used_counter = 0
        first_by_shop = user_first_coupon.get(uid_str, HashSet())
        for mid in first_by_shop.get_keys():
            user_first_counter += 1
            _, used = first_by_shop.get(mid)
            if used == 1:
                user_first_used_counter += 1

        user_coupon_counter = user_coupon.get(uid_str, 0)
        user_total_used_counter = user_total_used.get(uid_str, 0)

        if user_coupon_counter > 0:
            ratio = user_total_used_counter * 1.0 / user_coupon_counter
        else:
            ratio = 0
        if user_first_counter > 0:
            first_ratio = user_first_used_counter * 1.0 / user_first_counter
        else:
            first_ratio = 0

        statistics[i, :] = [
            user_coupon_counter, user_total_used_counter, user_first_counter,
            user_first_used_counter, ratio, first_ratio
        ]

    offline.cat_col(statistics, col_names, ["%s"] * len(col_names))
    offline.check_point("fisrt_use_{0}".format(month))
def calc_user_shop_month():
    """Attach "coupons per user, shop and month" statistics to the offline set.

    Reads ``ccf_offline_stage1_train.csv`` from ``paths.ccf_path``.  Rows
    whose "date_rec" is 'null' are skipped when counting.  Five columns are
    appended to every row and the result is checkpointed as
    ``shop_month_coupon``:

        user_shop_month        - coupons this user received from this shop in
                                 the month of the row's "date_rec" (0 when the
                                 row has no "date_rec")
        shop_coupon            - coupon rows of the shop
        shop_coupon_used       - ... that have a purchase date
        shop_month_coupon      - coupon rows of the shop whose user received
                                 more than one coupon from it in that month
        shop_month_coupon_used - ... that have a purchase date

    Fix: ``user_shop_month`` is looked up with the month of the current row.
    The previous code reused the ``month`` variable left over from the last
    row of the counting loop, so every row was looked up under one month.

    The month is extracted with ``get_month`` (defined outside this block).
    Relies on ``HashSet.get(key, default)`` storing the default on a miss -
    presumably setdefault-like; confirm against HashSet.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'ccf_offline_stage1_train.csv',
                      delimiter=',', dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s"] * 7)

    # uid -> mid -> {month: coupons received}
    user_shop_month = HashSet()
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_rec_str = offline.get_cell(i, "date_rec")
        if date_rec_str == 'null':
            continue
        month = get_month(date_rec_str)
        per_month = user_shop_month.get(uid_str, HashSet()).get(mid_str, {})
        per_month[month] = per_month.get(month, 0) + 1

    shop_month_coupon = {}
    shop_month_coupon_used = {}
    shop_coupon = {}
    shop_coupon_used = {}
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_rec_str = offline.get_cell(i, "date_rec")
        date_str = offline.get_cell(i, "date")
        cid_str = offline.get_cell(i, "cid")
        if date_rec_str == 'null':
            continue
        assert cid_str != 'null'
        month = get_month(date_rec_str)
        per_month = user_shop_month.get(uid_str, HashSet()).get(mid_str, {})
        was_used = date_str != 'null'

        shop_coupon[mid_str] = shop_coupon.get(mid_str, 0) + 1
        if was_used:
            shop_coupon_used[mid_str] = shop_coupon_used.get(mid_str, 0) + 1
        # The user received several coupons from this shop within the month.
        if per_month.get(month, 0) > 1:
            shop_month_coupon[mid_str] = shop_month_coupon.get(mid_str, 0) + 1
            if was_used:
                shop_month_coupon_used[mid_str] = \
                    shop_month_coupon_used.get(mid_str, 0) + 1

    col_names = ["user_shop_month", "shop_coupon", "shop_coupon_used",
                 "shop_month_coupon", "shop_month_coupon_used"]
    statistics = np.zeros((offline.ndata, 5), dtype=float)
    for i in xrange(offline.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = offline.get_cell(i, "uid")
        mid_str = offline.get_cell(i, "mid")
        date_rec_str = offline.get_cell(i, "date_rec")
        if date_rec_str == 'null':
            # No receive date on this row, hence no month to look up.
            month_count = 0
        else:
            per_month = user_shop_month.get(uid_str, HashSet()).get(mid_str, {})
            month_count = per_month.get(get_month(date_rec_str), 0)
        statistics[i, :] = np.array([
            month_count,
            shop_coupon.get(mid_str, 0),
            shop_coupon_used.get(mid_str, 0),
            shop_month_coupon.get(mid_str, 0),
            shop_month_coupon_used.get(mid_str, 0)])

    offline.cat_col(statistics, col_names, ["%s"] * len(col_names))
    offline.check_point("shop_month_coupon")