def gen_user_shop_tags(path_to_user_tags_csv, path_to_shop_tags_csv, matrix):
    """Join user-tag and shop-tag columns onto ``matrix``.

    Both CSV files are parsed as comma-separated floats and wrapped in a
    ``HashSet``.  For each table, one column per index in
    ``range(row_dim)`` is joined: ``u_tag_<i>`` keyed on ``"uid"`` and
    ``m_tag_<i>`` keyed on ``"mid"``, every column using the ``"%s"`` format.
    """
    user_tags = HashSet(
        np.genfromtxt(path_to_user_tags_csv, delimiter=',', dtype=float))
    shop_tags = HashSet(
        np.genfromtxt(path_to_shop_tags_csv, delimiter=',', dtype=float))

    user_columns = [
        "u_tag_{0}".format(idx) for idx in xrange(user_tags.row_dim)
    ]
    matrix.join("uid", user_columns, user_tags,
                ["%s"] * user_tags.row_dim)

    shop_columns = [
        "m_tag_{0}".format(idx) for idx in xrange(shop_tags.row_dim)
    ]
    matrix.join("mid", shop_columns, shop_tags,
                ["%s"] * shop_tags.row_dim)
def gen_user_shop_features(path_to_user_features_csv,
                           path_to_shop_features_csv, matrix):
    """Join latent user/shop feature columns and their product score.

    The two CSV files are parsed as comma-separated floats and wrapped in
    ``HashSet`` tables.  Three things are added to ``matrix``:

    * ``score2`` -- sum of the element-wise product of the user vector and
      the shop vector, produced through ``matrix.join_op``;
    * ``user_cofi_features_auto<i>`` columns keyed on ``"uid"``;
    * ``shop_cofi_features_auto<i>`` columns keyed on ``"mid"``.
    """
    user_table = HashSet(
        np.genfromtxt(path_to_user_features_csv, delimiter=',', dtype=float))
    shop_table = HashSet(
        np.genfromtxt(path_to_shop_features_csv, delimiter=',', dtype=float))

    def inner_product(x, y):
        # Same expression as the original lambda (sum of element-wise product).
        return np.sum(np.array(x) * np.array(y))

    matrix.join_op("uid", "mid", "score2", user_table, shop_table,
                   inner_product, "%s")

    user_columns = [
        "user_cofi_features_auto{0}".format(idx)
        for idx in xrange(user_table.row_dim)
    ]
    matrix.join("uid", user_columns, user_table,
                ["%s"] * user_table.row_dim)

    shop_columns = [
        "shop_cofi_features_auto{0}".format(idx)
        for idx in xrange(shop_table.row_dim)
    ]
    matrix.join("mid", shop_columns, shop_table,
                ["%s"] * shop_table.row_dim)
def gen_user_buy_shop(since, to, month):
    """Load the log tables and the feature matrix used for ``month``.

    NOTE(review): only the beginning of this function is present in the
    source -- the trailing ``for`` loop has no body -- so what it ultimately
    computes cannot be documented from this text.

    Args:
        since: Used (with ``to``) to build the input file names.
        to: Used to build the input file names; asserted to be ``month - 1``.
        month: Used to build the ``col_names_...`` and ``X_...`` file names.
    """
    assert to == month - 1
    # File name is 'offline_<to - since + 1>_month_<since><to>.csv'.
    offline = Matrix(np.genfromtxt('offline_{0}_month_{1}{2}.csv'.format(to - since + 1, since, to), delimiter=',', dtype=str), ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"], ["%s" for i in xrange(7)])
    # NOTE(review): this reads the very same 'offline_...' file as above but
    # labels the columns with the online schema ("act" instead of "dist").
    # Looks like a copy-paste slip; confirm whether an 'online_...' file was
    # intended.
    online = Matrix(np.genfromtxt('offline_{0}_month_{1}{2}.csv'.format(to - since + 1, since, to), delimiter=',', dtype=str), ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"], ["%s" for i in xrange(7)])
    # The first row of the col_names file supplies the column names of X.
    col_names = np.genfromtxt(paths.my_path + 'col_names_{0}_{1}{2}.csv'.format(month, since, to), delimiter=',', dtype=str)
    # Not used anywhere in the visible part of the function.
    col_number = col_names.shape[1]
    X = Matrix(np.genfromtxt(paths.my_path + 'X_{0}_{1}{2}.csv'.format(month, since, to), delimiter=',', dtype=str), list(col_names[0, :]), ["%s" for i in xrange(col_names.shape[1])])
    user_buy_shop = HashSet()
    # NOTE(review): the loop body is missing from the source at this point.
    for i in xrange(offline.ndata):
def gen_no_penalty_user_shop_features(X, month):
    """Join the "no penalty" latent features for ``month`` onto ``X``.

    Reads ``u_features_no_penalty_10_<month>.csv`` and
    ``i_features_no_penalty_10_<month>.csv`` from ``paths.my_path`` as
    comma-separated floats, then adds:

    * ``score1`` -- sum of the element-wise product of the user and shop
      vectors, produced through ``X.join_op``;
    * ``user_cofi_features_no_penalty<i>`` columns keyed on ``"uid"``;
    * ``shop_cofi_features_no_penalty<i>`` columns keyed on ``"mid"``.
    """
    user_path = paths.my_path + 'u_features_no_penalty_10_{0}.csv'.format(
        month)
    user_table = HashSet(np.genfromtxt(user_path, delimiter=',', dtype=float))

    shop_path = paths.my_path + 'i_features_no_penalty_10_{0}.csv'.format(
        month)
    shop_table = HashSet(np.genfromtxt(shop_path, delimiter=',', dtype=float))

    def inner_product(x, y):
        # Same expression as the original lambda (sum of element-wise product).
        return np.sum(np.array(x) * np.array(y))

    X.join_op("uid", "mid", "score1", user_table, shop_table, inner_product,
              "%s")

    user_columns = [
        "user_cofi_features_no_penalty{0}".format(idx)
        for idx in xrange(user_table.row_dim)
    ]
    X.join("uid", user_columns, user_table, ["%s"] * user_table.row_dim)

    shop_columns = [
        "shop_cofi_features_no_penalty{0}".format(idx)
        for idx in xrange(shop_table.row_dim)
    ]
    X.join("mid", shop_columns, shop_table, ["%s"] * shop_table.row_dim)
def gen_user_get_coupon(offline_source, online_source, X, month):
    """Add ``user_get_coupon`` and ``user_use_coupon_freq`` columns to ``X``.

    ``user_get_coupon`` receives one ``add_one(uid)`` call for every offline
    or online row whose ``cid`` is not ``'null'``.  ``user_use_coupon_freq``
    is then derived from the existing ``user_buy_with_coupon`` column divided
    by ``user_get_coupon``.

    Args:
        offline_source: File name (relative to ``paths.ccf_path``) of the
            offline log.
        online_source: File name (relative to ``paths.ccf_path``) of the
            online log.
        X: Feature matrix that is extended in place.
        month: Not used by this function.
    """
    offline_table = np.genfromtxt(paths.ccf_path + offline_source,
                                  delimiter=',', dtype=str)
    offline = Matrix(
        offline_table,
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s"] * 7)

    online_table = np.genfromtxt(paths.ccf_path + online_source,
                                 delimiter=',', dtype=str)
    online = Matrix(
        online_table,
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s"] * 7)

    user_get_coupon = HashSet()

    for row in xrange(offline.ndata):
        uid_str = offline.get_cell(row, "uid")
        cid_str = offline.get_cell(row, "cid")
        if cid_str == 'null':
            continue
        user_get_coupon.add_one(uid_str)

    for row in xrange(online.ndata):
        uid_str = online.get_cell(row, "uid")
        cid_str = online.get_cell(row, "cid")
        act_str = online.get_cell(row, "act")
        if cid_str == 'null':
            continue
        # A coupon id is not expected on rows with act == '0'.
        assert act_str != '0'
        user_get_coupon.add_one(uid_str)

    X.join("uid", ["user_get_coupon"], user_get_coupon, ["%s"], dft=0.0)

    def divide(x, y):
        # x / y as floats; 0.0 when y is zero.  -9999 is rejected on either
        # side, and x may never exceed y.
        numerator = float(x)
        assert numerator != -9999
        denominator = float(y)
        assert denominator != -9999
        assert numerator <= denominator, "{0} {1}".format(x, y)
        if denominator == 0:
            return 0.0
        return numerator * 1.0 / denominator

    X.gen_arith_feature("user_buy_with_coupon", "user_get_coupon",
                        "user_use_coupon_freq", divide, "%s", dft=0.0)
def gen_merchant_share(X, month):
    """Add per-merchant first-sighting user counters to ``X``.

    Reads ``offline_train_<month>.csv`` and ``online_train_<month>.csv`` from
    ``paths.ccf_path``.  A purchase is an offline row whose ``date`` is not
    ``'null'`` or an online row whose ``act`` is ``'1'``.  For every purchase:

    * ``merchant_user_buy`` gets ``add_one(mid)`` the first time a given
      ``(mid, uid)`` pair is seen;
    * ``merchant_user_use_coupon`` gets the same treatment, restricted to
      purchases whose ``cid`` is not ``'null'``.

    Both counters are joined on ``"mid"`` with default ``0.0`` and ``X`` is
    checkpointed with ``month``.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'offline_train_{0}.csv'.format(month),
                      delimiter=',', dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s"] * 7)
    online = Matrix(
        np.genfromtxt(paths.ccf_path + 'online_train_{0}.csv'.format(month),
                      delimiter=',', dtype=str),
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s"] * 7)

    merchant_user_buy = HashSet()
    merchant_user_use_coupon = HashSet()
    merchant_user_buy_counter = HashSet()
    merchant_user_use_coupon_counter = HashSet()

    def count_first_sighting(users_by_merchant, counter, mid_str, uid_str):
        # Bump the merchant counter only when this user is new for the
        # merchant; the nested HashSet remembers who has been seen.
        if not users_by_merchant.has(mid_str):
            users_by_merchant.set(mid_str, HashSet())
        if not users_by_merchant.get(mid_str).has(uid_str):
            users_by_merchant.get(mid_str).add_one(uid_str)
            counter.add_one(mid_str)

    def record_purchase(mid_str, uid_str, cid_str):
        count_first_sighting(merchant_user_buy, merchant_user_buy_counter,
                             mid_str, uid_str)
        if cid_str != 'null':
            count_first_sighting(merchant_user_use_coupon,
                                 merchant_user_use_coupon_counter,
                                 mid_str, uid_str)

    for row in xrange(offline.ndata):
        if row % 100000 == 0:
            print(row)
        mid_str = offline.get_cell(row, "mid")
        uid_str = offline.get_cell(row, "uid")
        date_str = offline.get_cell(row, "date")
        cid_str = offline.get_cell(row, "cid")
        if date_str != 'null':
            record_purchase(mid_str, uid_str, cid_str)

    for row in xrange(online.ndata):
        if row % 100000 == 0:
            print(row)
        mid_str = online.get_cell(row, "mid")
        uid_str = online.get_cell(row, "uid")
        act_str = online.get_cell(row, "act")
        cid_str = online.get_cell(row, "cid")
        if act_str == '1':
            record_purchase(mid_str, uid_str, cid_str)

    X.join("mid", ["merchant_user_buy"], merchant_user_buy_counter, ["%s"],
           dft=0.0)
    X.join("mid", ["merchant_user_use_coupon"],
           merchant_user_use_coupon_counter, ["%s"], dft=0.0)
    X.check_point(month)
def gen_merchant_buy(X, month):
    """Add per-merchant purchase and coupon counters plus two ratios to ``X``.

    Reads ``offline_train_<month>.csv`` and ``online_train_<month>.csv`` from
    ``paths.ccf_path`` and accumulates, per ``mid``:

    * ``merchant_buy`` -- rows that are purchases (offline ``date`` not
      ``'null'``, online ``act == '1'``);
    * ``merchant_buy_with_coupon`` -- purchases whose ``cid`` is not
      ``'null'``;
    * ``merchant_distribute_coupon`` -- every row whose ``cid`` is not
      ``'null'``.

    The counters are joined on ``"mid"`` with default ``0.0``; then
    ``merchant_buy_with_coupon_ratio`` and ``merchant_coupon_ratio`` are
    derived and ``X`` is checkpointed with ``month``.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'offline_train_{0}.csv'.format(month),
                      delimiter=',', dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s"] * 7)
    online = Matrix(
        np.genfromtxt(paths.ccf_path + 'online_train_{0}.csv'.format(month),
                      delimiter=',', dtype=str),
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s"] * 7)

    merchant_buy = HashSet()
    merchant_buy_with_coupon = HashSet()
    merchant_distribute_coupon = HashSet()

    for row in xrange(offline.ndata):
        if row % 100000 == 0:
            print(row)
        mid_str = offline.get_cell(row, "mid")
        date_str = offline.get_cell(row, "date")
        cid_str = offline.get_cell(row, "cid")
        has_coupon = cid_str != 'null'
        if date_str != 'null':
            merchant_buy.add_one(mid_str)
            if has_coupon:
                merchant_buy_with_coupon.add_one(mid_str)
        if has_coupon:
            merchant_distribute_coupon.add_one(mid_str)

    for row in xrange(online.ndata):
        if row % 100000 == 0:
            print(row)
        mid_str = online.get_cell(row, "mid")
        act_str = online.get_cell(row, "act")
        cid_str = online.get_cell(row, "cid")
        has_coupon = cid_str != 'null'
        if act_str == '1':
            merchant_buy.add_one(mid_str)
            if has_coupon:
                merchant_buy_with_coupon.add_one(mid_str)
        if has_coupon:
            # A coupon id is not expected on rows with act == '0'.
            assert act_str != '0'
            merchant_distribute_coupon.add_one(mid_str)

    counters = (
        ("merchant_buy", merchant_buy),
        ("merchant_buy_with_coupon", merchant_buy_with_coupon),
        ("merchant_distribute_coupon", merchant_distribute_coupon),
    )
    for column, table in counters:
        X.join("mid", [column], table, ["%s"], 0.0)

    def divide(x, y):
        # x / y as floats; 0.0 when y is zero.  x may never exceed y.
        numerator = float(x)
        denominator = float(y)
        assert numerator <= denominator
        if denominator == 0:
            return 0.0
        return numerator * 1.0 / denominator

    X.gen_arith_feature("merchant_buy_with_coupon", "merchant_buy",
                        "merchant_buy_with_coupon_ratio", divide, "%s",
                        dft=0.0)
    X.gen_arith_feature("merchant_buy_with_coupon",
                        "merchant_distribute_coupon", "merchant_coupon_ratio",
                        divide, "%s", dft=0.0)
    X.check_point(month)
def gen_user_recent_behavior(X, month):
    """Add 12 ``recent_user_*`` columns describing the four preceding months.

    Reads ``offline_train_<month>.csv`` and ``online_train_<month>.csv`` from
    ``paths.ccf_path`` and keeps, for each of the months 1..6 and each user,
    a 3-slot counter vector:

    * slot 0 -- rows with a ``'null'`` ``date`` (bucketed by the month of
      ``date_rec``);
    * slot 1 -- rows with a ``date`` and a ``'null'`` ``cid``;
    * slot 2 -- rows with a ``date`` and a non-null ``cid``.

    Online rows with ``act == '0'`` are ignored.  For each row of ``X`` the
    vectors of the four months before the month of ``date_rec`` are
    concatenated; a month that does not exist (index <= 0) or a ``'null'``
    ``date_rec`` yields ``-9999`` in all three slots, a user with no record
    in an existing month yields zeros.  ``X`` is checkpointed with ``month``.
    """
    offline = Matrix(
        np.genfromtxt(paths.ccf_path + 'offline_train_{0}.csv'.format(month),
                      delimiter=',', dtype=str),
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s"] * 7)
    online = Matrix(
        np.genfromtxt(paths.ccf_path + 'online_train_{0}.csv'.format(month),
                      delimiter=',', dtype=str),
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s"] * 7)

    behaviors = {month_no: HashSet() for month_no in xrange(1, 7)}

    def bump(act_month, uid_str, slot):
        # Create the user's counter vector on first use, then increment.
        if not behaviors[act_month].has(uid_str):
            behaviors[act_month].set(uid_str, np.zeros(3))
        behaviors[act_month].get(uid_str)[slot] += 1

    for row in xrange(offline.ndata):
        if row % 100000 == 0:
            print(row)
        uid_str = offline.get_cell(row, "uid")
        date_str = offline.get_cell(row, "date")
        cid_str = offline.get_cell(row, "cid")
        date_rec_str = offline.get_cell(row, "date_rec")
        if date_str != 'null':
            bump(get_month(date_str), uid_str, 2 if cid_str != 'null' else 1)
        else:
            bump(get_month(date_rec_str), uid_str, 0)

    for row in xrange(online.ndata):
        if row % 100000 == 0:
            print(row)
        uid_str = online.get_cell(row, "uid")
        date_str = online.get_cell(row, "date")
        cid_str = online.get_cell(row, "cid")
        date_rec_str = online.get_cell(row, "date_rec")
        act_str = online.get_cell(row, "act")
        if act_str == '0':
            continue
        if date_str != 'null':
            assert act_str == '1'
            bump(get_month(date_str), uid_str, 2 if cid_str != 'null' else 1)
        else:
            assert act_str == '2'
            bump(get_month(date_rec_str), uid_str, 0)

    def gen_user_recent_behavior_func(row):
        date_rec_str = row['date_rec']
        uid_str = row['uid']
        act_month = get_month(date_rec_str)
        has_date = date_rec_str != 'null'
        # Start from an empty float array so the stacked result is float.
        pieces = [np.zeros(0)]
        for back in xrange(4):
            past_month = act_month - 1 - back
            if has_date and past_month > 0:
                if behaviors[past_month].has(uid_str):
                    vector = behaviors[past_month].get(uid_str)
                else:
                    vector = np.array([0, 0, 0])
            else:
                vector = np.array([-9999, -9999, -9999])
            pieces.append(vector)
        return np.hstack(pieces)

    templates = ("recent_user_unuse{0}", "recent_user_buy{0}",
                 "recent_user_use{0}")
    names = [
        template.format(back) for back in xrange(4) for template in templates
    ]
    X.gen_features(names, gen_user_recent_behavior_func, ["%s"] * 12)
    X.check_point(month)
def gen_user_buy_in_shop(matrix_offline, matrix_online, X):
    """Add ``user_buy_in_shop`` and ``user_buy_in_shop_ratio`` to ``X``.

    A nested ``HashSet`` (user -> shop) receives ``add_one(mid)`` for every
    offline row whose ``date`` is not ``'null'`` and every online row whose
    ``act`` is ``'1'``.  The result is joined on ``("uid", "mid")`` with
    default ``0.0``; the ratio column divides it by the existing ``user_buy``
    column.
    """
    user_buy_in_shop = HashSet(default=matrix_offline.default)

    def record(uid_str, mid_str):
        # Lazily create the per-user table before counting the shop.
        if not user_buy_in_shop.has(uid_str):
            user_buy_in_shop.set(uid_str, HashSet())
        user_buy_in_shop.get(uid_str).add_one(mid_str)

    for row in xrange(matrix_offline.ndata):
        date_str = matrix_offline.get_cell(row, 'date')
        uid_str = matrix_offline.get_cell(row, 'uid')
        mid_str = matrix_offline.get_cell(row, "mid")
        if date_str != 'null':
            record(uid_str, mid_str)

    for row in xrange(matrix_online.ndata):
        act_str = matrix_online.get_cell(row, 'act')
        uid_str = matrix_online.get_cell(row, 'uid')
        mid_str = matrix_online.get_cell(row, "mid")
        if act_str == '1':
            record(uid_str, mid_str)

    X.join_by_double_key("uid", "mid", "user_buy_in_shop", user_buy_in_shop,
                         "%s", dft=0.0)

    def divide(x, y):
        # x / y as floats; integer 0 when y is zero.  x may never exceed y.
        numerator = float(x)
        denominator = float(y)
        assert numerator <= denominator
        if denominator == 0:
            return 0
        return numerator * 1.0 / denominator

    X.gen_arith_feature("user_buy_in_shop", "user_buy",
                        "user_buy_in_shop_ratio", divide, fmt="%s", dft=0.0)
def gen_user_buy_with_coupon(matrix_offline, matrix_online, X):
    """Add ``user_buy``, ``user_buy_with_coupon`` and their ratio to ``X``.

    ``user_buy`` receives ``add_one(uid)`` for every purchase (offline rows
    whose ``date`` is not ``'null'``, online rows whose ``act`` is ``'1'``);
    ``user_buy_with_coupon`` additionally requires a non-null ``cid``.
    ``user_buy_with_coupon_freq`` is ``user_buy_with_coupon / user_buy``,
    computed through ``HashSet.merge_op``.  The three tables are merged and
    joined on ``"uid"`` with default ``0.0``.

    Args:
        matrix_offline: Offline log matrix (``cid``, ``date``, ``uid`` are
            read); its ``default`` seeds both counters.
        matrix_online: Online log matrix (``act``, ``cid``, ``uid`` are read).
        X: Feature matrix that is extended in place.
    """
    user_buy = HashSet(default=matrix_offline.default)
    user_buy_with_coupon = HashSet(default=matrix_offline.default)

    for i in xrange(matrix_offline.ndata):
        cid_str = matrix_offline.get_cell(i, 'cid')
        date_str = matrix_offline.get_cell(i, 'date')
        uid_str = matrix_offline.get_cell(i, 'uid')
        if date_str != 'null':
            user_buy.add_one(uid_str)
            if cid_str != 'null':
                user_buy_with_coupon.add_one(uid_str)

    for i in xrange(matrix_online.ndata):
        act_str = matrix_online.get_cell(i, 'act')
        cid_str = matrix_online.get_cell(i, 'cid')
        uid_str = matrix_online.get_cell(i, 'uid')
        if act_str == '1':
            user_buy.add_one(uid_str)
            if cid_str != 'null':
                user_buy_with_coupon.add_one(uid_str)

    # NOTE(review): divides by the user_buy value; assumes merge_op never
    # hands the lambda a zero x -- confirm against HashSet.merge_op.
    user_buy_with_coupon_freq = user_buy.merge_op(
        user_buy_with_coupon,
        lambda x, y: float(y) * 1.0 / float(x),
        dft=0.0)

    merged = user_buy.merge(user_buy_with_coupon, dft=0.0).merge(
        user_buy_with_coupon_freq, dft=0.0)

    # The formats are passed as a real list, like every other join call in
    # this file.  The original passed a generator expression, which has no
    # len()/indexing and is exhausted after a single pass.
    X.join("uid",
           ["user_buy", "user_buy_with_coupon", "user_buy_with_coupon_freq"],
           merged,
           ["%s" for _ in xrange(3)],
           dft=0.0)
def gen_user_get_shop_coupon(offline_source, online_source, X):
    """Add ``user_get_shop_coupon`` and ``user_use_shop_coupon_freq`` to ``X``.

    A nested ``HashSet`` (user -> shop) receives ``add_one(mid)`` for every
    offline row whose ``cid`` is not ``'null'`` and every online row whose
    ``cid`` is not ``'null'`` and whose ``act`` is not ``'0'``.  The table is
    joined on ``("uid", "mid")`` with default ``0.0``; the frequency column
    divides the existing ``user_buy_with_coupon_in_shop`` column by it.

    Args:
        offline_source: File name (relative to ``paths.ccf_path``) of the
            offline log.
        online_source: File name (relative to ``paths.ccf_path``) of the
            online log.
        X: Feature matrix that is extended in place.
    """
    offline_table = np.genfromtxt(paths.ccf_path + offline_source,
                                  delimiter=',', dtype=str)
    offline = Matrix(
        offline_table,
        ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"],
        ["%s"] * 7)

    online_table = np.genfromtxt(paths.ccf_path + online_source,
                                 delimiter=',', dtype=str)
    online = Matrix(
        online_table,
        ["uid", "mid", "act", "cid", "dis_rate", "date_rec", "date"],
        ["%s"] * 7)

    user_get_shop_coupon = HashSet()

    def record(uid_str, mid_str):
        # Lazily create the per-user table before counting the shop.
        if not user_get_shop_coupon.has(uid_str):
            user_get_shop_coupon.set(uid_str, HashSet())
        user_get_shop_coupon.get(uid_str).add_one(mid_str)

    for row in xrange(offline.ndata):
        uid_str = offline.get_cell(row, "uid")
        cid_str = offline.get_cell(row, "cid")
        mid_str = offline.get_cell(row, "mid")
        if cid_str != 'null':
            record(uid_str, mid_str)

    for row in xrange(online.ndata):
        uid_str = online.get_cell(row, "uid")
        cid_str = online.get_cell(row, "cid")
        mid_str = online.get_cell(row, "mid")
        act_str = online.get_cell(row, "act")
        if cid_str == 'null' or act_str == '0':
            continue
        record(uid_str, mid_str)

    X.join_by_double_key("uid", "mid", "user_get_shop_coupon",
                         user_get_shop_coupon, "%s", dft=0.0)

    def divide(x, y):
        # x / y as floats; 0.0 when y is zero.  x may never exceed y.
        numerator = float(x)
        denominator = float(y)
        assert numerator <= denominator
        if denominator == 0:
            return 0.0
        return numerator * 1.0 / denominator

    X.gen_arith_feature("user_buy_with_coupon_in_shop",
                        "user_get_shop_coupon", "user_use_shop_coupon_freq",
                        divide, "%s", dft=0.0)
def gen_user_buy_coupon_in_shop(matrix_offline, matrix_online, X):
    """Add per-(user, shop) purchase counters split by coupon usage to ``X``.

    For every purchase (offline rows whose ``date`` is not ``'null'``, online
    rows whose ``act`` is ``'1'``) the shop is counted under the user in
    ``user_buy_with_coupon_in_shop`` when ``cid`` is not ``'null'`` and in
    ``user_buy_without_coupon_in_shop`` otherwise.  Both tables are joined on
    ``("uid", "mid")`` with default ``0.0`` and each is divided by the
    existing ``user_buy_in_shop`` column to produce a ``*_ratio`` column.
    """
    user_buy_with_coupon_in_shop = HashSet()
    user_buy_without_coupon_in_shop = HashSet()

    def record(uid_str, mid_str, cid_str):
        # Both per-user tables are created together, keyed off the
        # "with coupon" table.
        if not user_buy_with_coupon_in_shop.has(uid_str):
            user_buy_with_coupon_in_shop.set(uid_str, HashSet())
            user_buy_without_coupon_in_shop.set(uid_str, HashSet())
        if cid_str != 'null':
            user_buy_with_coupon_in_shop.get(uid_str).add_one(mid_str)
        else:
            user_buy_without_coupon_in_shop.get(uid_str).add_one(mid_str)

    for row in xrange(matrix_offline.ndata):
        uid_str = matrix_offline.get_cell(row, "uid")
        cid_str = matrix_offline.get_cell(row, "cid")
        date_str = matrix_offline.get_cell(row, "date")
        mid_str = matrix_offline.get_cell(row, "mid")
        if date_str != 'null':
            record(uid_str, mid_str, cid_str)

    for row in xrange(matrix_online.ndata):
        uid_str = matrix_online.get_cell(row, "uid")
        cid_str = matrix_online.get_cell(row, "cid")
        mid_str = matrix_online.get_cell(row, "mid")
        act_str = matrix_online.get_cell(row, "act")
        if act_str == '1':
            record(uid_str, mid_str, cid_str)

    X.join_by_double_key("uid", "mid", "user_buy_with_coupon_in_shop",
                         user_buy_with_coupon_in_shop, "%s", dft=0.0)
    X.join_by_double_key("uid", "mid", "user_buy_without_coupon_in_shop",
                         user_buy_without_coupon_in_shop, "%s", dft=0.0)

    def divide(x, y):
        # x / y as floats; integer 0 when y is zero.  x may never exceed y.
        numerator = float(x)
        denominator = float(y)
        assert numerator <= denominator
        if denominator == 0:
            return 0
        return numerator * 1.0 / denominator

    X.gen_arith_feature("user_buy_with_coupon_in_shop", "user_buy_in_shop",
                        "user_buy_with_coupon_in_shop_ratio", divide, "%s",
                        dft=0.0)
    X.gen_arith_feature("user_buy_without_coupon_in_shop", "user_buy_in_shop",
                        "user_buy_without_coupon_in_shop_ratio", divide, "%s",
                        dft=0.0)
def _record_coupon_dates(matrix, get_coupon_history, get_coupon_history_user):
    """Append every non-null ``date_rec`` of ``matrix`` to both histories."""
    for i in xrange(matrix.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = matrix.get_cell(i, "uid")
        mid_str = matrix.get_cell(i, "mid")
        date_rec_str = matrix.get_cell(i, "date_rec")
        if date_rec_str != 'null':
            # NOTE(review): relies on HashSet.get(key, default) storing and
            # returning the default (setdefault-like); otherwise these
            # appends would be lost -- confirm against HashSet.
            history_list = get_coupon_history.get(uid_str, HashSet()).get(
                mid_str, [])
            history_list.append(date_rec_str)
            user_history_list = get_coupon_history_user.get(uid_str, [])
            user_history_list.append(date_rec_str)


def _scan_past_date(history, date_rec_str):
    """Return the index just before the last entry not after the date.

    Walks ``history`` until the next entry is greater than ``date_rec_str``
    (or the end is reached) and returns that stop index minus one; ``-1``
    for an empty history.
    """
    k = -1
    for j in xrange(len(history)):
        if j == len(history) - 1 or history[j + 1] > date_rec_str:
            break
        k += 1
    return k


def _scan_to_date(history, date_rec_str):
    """Return the index just before the first entry equal to the date.

    If no entry matches, every element is skipped and ``len(history) - 1``
    is returned.
    """
    k = -1
    for j in xrange(len(history)):
        if history[j] == date_rec_str:
            break
        k += 1
    return k


def gen_coupon_gap(X, month):
    """Append coupon-gap columns to ``X`` and checkpoint it.

    The ``date_rec`` values of ``X`` -- plus those of the previous
    (``month > 1``) and next (``month < 7``) month files
    ``offline_1_month<m>.csv`` under ``paths.ccf_path`` -- are collected per
    (user, shop) and per user, then sorted.  For every row of ``X`` the
    following columns are computed with ``days_dis``:

    * ``prev_gap`` / ``next_gap`` -- neighbours of the row's date in the
      (user, shop) history, located after the last entry that is not later
      than the date;
    * ``user_prev_gap`` / ``user_next_gap`` -- the same over the user-wide
      history;
    * ``prev_gap_prev`` / ``next_gap_prev`` -- neighbours located around the
      first entry equal to the date in the (user, shop) history;
    * ``in_between`` -- 1 only when ``prev_gap`` and ``next_gap`` both have a
      neighbour and ``next_gap_prev`` has one too, else 0.

    A missing neighbour yields 0.

    Bug fix: the user-wide scan used to test ``history_list`` (the per-shop
    list) while iterating over the length of ``user_history_list``, so the
    user gaps were positioned by the wrong list and an ``IndexError`` was
    possible when the per-shop list was empty.

    Args:
        X: Feature matrix with ``uid``, ``mid`` and ``date_rec`` columns;
            extended in place.
        month: Month number used to pick the neighbouring month files and to
            name the checkpoint.
    """
    get_coupon_history = HashSet()
    get_coupon_history_user = HashSet()
    log_columns = ["uid", "mid", "cid", "dis_rate", "dist", "date_rec", "date"]

    _record_coupon_dates(X, get_coupon_history, get_coupon_history_user)

    if month > 1:
        last_X = Matrix(
            np.genfromtxt(
                paths.ccf_path + 'offline_1_month{0}.csv'.format(month - 1),
                delimiter=',', dtype=str),
            list(log_columns), ["%s" for _ in xrange(7)])
        _record_coupon_dates(last_X, get_coupon_history,
                             get_coupon_history_user)

    if month < 7:
        next_X = Matrix(
            np.genfromtxt(
                paths.ccf_path + 'offline_1_month{0}.csv'.format(month + 1),
                delimiter=',', dtype=str),
            list(log_columns), ["%s" for _ in xrange(7)])
        _record_coupon_dates(next_X, get_coupon_history,
                             get_coupon_history_user)

    # Replace every history touched by a row of X with its sorted version.
    for i in xrange(X.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = X.get_cell(i, "uid")
        mid_str = X.get_cell(i, "mid")
        history_list = get_coupon_history.get(uid_str, HashSet()).get(
            mid_str, [])
        get_coupon_history.get(uid_str).set(mid_str, np.sort(history_list))
        user_history_list = get_coupon_history_user.get(uid_str, [])
        get_coupon_history_user.set(uid_str, np.sort(user_history_list))

    col_names = [
        "prev_gap", "next_gap", "user_prev_gap", "user_next_gap",
        "in_between", "prev_gap_prev", "next_gap_prev"
    ]
    gaps = np.zeros((X.ndata, len(col_names)))

    for i in xrange(X.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = X.get_cell(i, "uid")
        mid_str = X.get_cell(i, "mid")
        date_rec_str = X.get_cell(i, "date_rec")
        history_list = get_coupon_history.get(uid_str).get(mid_str)
        user_history_list = get_coupon_history_user.get(uid_str)

        # NOTE(review): the ``k > 0`` tests below ignore a neighbour at
        # index 0; kept as in the original -- confirm whether ``k >= 0``
        # was intended.
        k = _scan_past_date(history_list, date_rec_str)
        in_between = 1
        if k > 0:
            prev_gap = days_dis(history_list[k], date_rec_str)
        else:
            prev_gap = 0
            in_between = 0
        if k + 2 < len(history_list):
            next_gap = days_dis(date_rec_str, history_list[k + 2])
        else:
            next_gap = 0
            in_between = 0

        # Fixed: scan the user-wide history, not the per-shop one.
        k = _scan_past_date(user_history_list, date_rec_str)
        if k > 0:
            user_prev_gap = days_dis(user_history_list[k], date_rec_str)
        else:
            user_prev_gap = 0
        if k + 2 < len(user_history_list):
            user_next_gap = days_dis(date_rec_str, user_history_list[k + 2])
        else:
            user_next_gap = 0

        k = _scan_to_date(history_list, date_rec_str)
        if k > 0:
            prev_gap_prev = days_dis(history_list[k], date_rec_str)
        else:
            prev_gap_prev = 0
        if k + 2 < len(history_list):
            next_gap_prev = days_dis(date_rec_str, history_list[k + 2])
        else:
            next_gap_prev = 0
            in_between = 0

        gaps[i, :] = np.array([
            prev_gap, next_gap, user_prev_gap, user_next_gap, in_between,
            prev_gap_prev, next_gap_prev
        ])

    X.cat_col(gaps, col_names, ["%s" for _ in xrange(len(col_names))])
    X.check_point("X_{0}_{1}{2}_gap".format(month, 1, month - 1))
def gen_user_shop_month(X, month):
    """Append per-(user, shop) coupon counts for ``month`` to ``X``.

    The first pass counts, for every (user, shop) pair and every month
    returned by ``get_month(date_rec)``, how many rows of ``X`` have a
    non-null ``date_rec``.  The second pass writes two columns for each row:

    * ``user_shop_month_11_2`` -- the count for the row's (user, shop) pair
      in ``month``;
    * ``is_shop_coupon_11_2`` -- 1 when that count is greater than 1, else 0.

    ``X`` is then checkpointed as ``X_<month>_1<month - 1>_shop_month``.

    Bug fix: the counting loop used to assign its per-row month to ``month``
    itself, overwriting the argument.  The statistics lookup and the
    checkpoint name therefore used the month of the last non-null row
    instead of the month that was requested.

    Args:
        X: Feature matrix with ``uid``, ``mid`` and ``date_rec`` columns;
            extended in place.
        month: Month whose counts are reported; presumably the same kind of
            value ``get_month`` returns -- verify against the caller.
    """
    user_shop_month = HashSet()

    for i in xrange(X.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = X.get_cell(i, "uid")
        mid_str = X.get_cell(i, "mid")
        date_rec_str = X.get_cell(i, "date_rec")
        if date_rec_str == 'null':
            continue
        # Separate name so the ``month`` argument is not overwritten.
        rec_month = get_month(date_rec_str)
        # NOTE(review): relies on HashSet.get(key, default) storing and
        # returning the default (setdefault-like) -- confirm against HashSet.
        user_shop_month_set = user_shop_month.get(uid_str, HashSet()).get(
            mid_str, {})
        user_shop_month_set[rec_month] = (
            user_shop_month_set.get(rec_month, 0) + 1)

    col_names = ["user_shop_month_11_2", "is_shop_coupon_11_2"]
    statistics = np.zeros((X.ndata, len(col_names)), dtype=float)

    for i in xrange(X.ndata):
        if i % 100000 == 0:
            print(i)
        uid_str = X.get_cell(i, "uid")
        mid_str = X.get_cell(i, "mid")
        user_shop_month_set = user_shop_month.get(uid_str, HashSet()).get(
            mid_str, {})
        coupon_count = user_shop_month_set.get(month, 0)
        is_shop_coupon = 1 if coupon_count > 1 else 0
        statistics[i, :] = np.array([coupon_count, is_shop_coupon])

    X.cat_col(statistics, col_names, ["%s" for _ in xrange(len(col_names))])
    X.check_point("X_{0}_{1}{2}_shop_month".format(month, 1, month - 1))