def getUserPreference1(click_score, collect_score, buy_score):
    '''Rank each user's products by a segment-weighted behaviour score.

    Reads settings["TAR_DATA_FILE"] as CSV, skips the header row and parses
    every remaining row as integers ``uid, pid, action_type, month, day``.
    Each row adds ``factor[seg_num - 1] * <score of its action>`` to the
    running score of its (uid, pid) pair, where ``seg_num`` comes from
    ``calSegNum(month, day)``.

    Args:
        click_score: score added for a settings["ACTION_CLICK"] row.
        collect_score: score added for a settings["ACTION_COLLECT"] or
            settings["ACTION_SHOPPING_CHART"] row.
        buy_score: score added for a settings["ACTION_BUY"] row.

    Returns:
        defaultdict(list) mapping uid -> list of (pid, score) tuples sorted
        by score, highest first.
    '''
    # Close the input file deterministically instead of leaking the handle.
    with open(settings["TAR_DATA_FILE"]) as data_file:
        rows = list(csv.reader(data_file))
    data = [[int(value) for value in row] for row in rows[1:]]

    # Weight per segment, indexed by seg_num - 1; higher segment numbers
    # carry larger weights.
    factor = [0.1, 0.15, 0.25, 0.5]

    user_preference_score = defaultdict(dict)
    for uid, pid, action_type, month, day in data:
        weight = factor[calSegNum(month, day) - 1]
        scores = user_preference_score[uid]
        # Every seen (uid, pid) gets an entry, even for unknown action types.
        scores.setdefault(pid, 0)
        if action_type == settings["ACTION_BUY"]:
            scores[pid] += weight * buy_score
        elif (action_type == settings["ACTION_COLLECT"]
              or action_type == settings["ACTION_SHOPPING_CHART"]):
            scores[pid] += weight * collect_score
        elif action_type == settings["ACTION_CLICK"]:
            scores[pid] += weight * click_score

    user_sorted_result = defaultdict(list)
    for uid in user_preference_score:
        user_sorted_result[uid] = sorted(user_preference_score[uid].items(),
                                         key=lambda item: item[1],
                                         reverse=True)
    return user_sorted_result
def getUserMultipleBehavior(data, month_num):
    '''Gather per-user purchases by segment and log-scaled click counts.

    Only rows with action_type 0 or 1 are considered, and rows that
    ``calSegNum(month, day)`` places in segment 1 are skipped.

    Args:
        data: iterable of ``uid, pid, action_type, month, day`` records.
        month_num: number of per-segment sets allocated for each buyer.

    Returns:
        (user_buy, user_click) where user_buy maps uid -> list of
        ``month_num`` sets of pids (index seg_num - 1) built from
        action_type 1 rows, and user_click maps uid -> {pid: np.log(count)}
        built from action_type 0 rows.
    '''
    user_buy = {}
    user_click = {}
    for uid, pid, action_type, month, day in data:
        if action_type not in (0, 1):
            continue
        seg_num = calSegNum(month, day)
        if seg_num == 1:
            continue
        if action_type == 0:
            clicks = user_click.setdefault(uid, {})
            clicks[pid] = clicks.get(pid, 0) + 1
        else:
            if uid not in user_buy:
                user_buy[uid] = [set() for _ in range(month_num)]
            user_buy[uid][seg_num - 1].add(pid)

    # Replace the raw click counts with their natural logarithm.
    for clicks in user_click.values():
        for pid in clicks:
            clicks[pid] = np.log(clicks[pid])
    return user_buy, user_click
def basicGenTrainPair(data, ratio):
    '''Build labelled (uid, pid, month, day) pairs with sampled negatives.

    For every action_type 1 row whose ``calSegNum(month, day)`` is above 1,
    one positive pair is emitted, followed by ``ratio`` negative pairs whose
    pids are drawn with ``random.sample`` from all known pids minus
    ``getHistorySet(<user's action_type 1 records>, month, day)``.

    Args:
        data: re-iterable sequence of records indexed as
            ``uid, pid, action_type, month, day``.
        ratio: number of negatives sampled per positive.

    Returns:
        (pairs, targets): parallel lists; targets holds ``[1]`` for a
        positive pair and ``[0]`` for a negative one.
    '''
    all_pids = set()
    bought_by_user = defaultdict(list)
    for row in data:
        uid, pid, action_type, month, day = row[0], row[1], row[2], row[3], row[4]
        all_pids.add(pid)
        if action_type == 1:
            bought_by_user[uid].append([pid, month, day])

    pairs = []
    targets = []
    for row in data:
        uid, pid, action_type, month, day = row[0], row[1], row[2], row[3], row[4]
        seg_num = calSegNum(month, day)
        if action_type != 1 or seg_num <= 1:
            continue
        pairs.append([uid, pid, month, day])
        targets.append([1])
        excluded = getHistorySet(bought_by_user[uid], month, day)
        negatives = random.sample(all_pids - excluded, ratio)
        pairs.extend([uid, neg_pid, month, day] for neg_pid in negatives)
        targets.extend([0] for _ in negatives)
    return pairs, targets
def basicGenTrainPair(data, ratio):
    '''Build labelled (uid, pid, month, day) pairs with sampled negatives.

    For every action_type 1 row whose ``calSegNum(month, day)`` is above 1,
    one positive pair is emitted, followed by ``ratio`` negative pairs whose
    pids are drawn with ``random.sample`` from all known pids minus
    ``getHistorySet(<user's action_type 1 records>, month, day)``.

    Args:
        data: re-iterable sequence of records indexed as
            ``uid, pid, action_type, month, day``.
        ratio: number of negatives sampled per positive.

    Returns:
        (pairs, targets): parallel lists; targets holds ``[1]`` for a
        positive pair and ``[0]`` for a negative one.
    '''
    all_pids = set()
    bought_by_user = defaultdict(list)
    for row in data:
        uid, pid, action_type, month, day = row[0], row[1], row[2], row[3], row[4]
        all_pids.add(pid)
        if action_type == 1:
            bought_by_user[uid].append([pid, month, day])

    pairs = []
    targets = []
    for row in data:
        uid, pid, action_type, month, day = row[0], row[1], row[2], row[3], row[4]
        seg_num = calSegNum(month, day)
        if action_type != 1 or seg_num <= 1:
            continue
        pairs.append([uid, pid, month, day])
        targets.append([1])
        excluded = getHistorySet(bought_by_user[uid], month, day)
        negatives = random.sample(all_pids - excluded, ratio)
        pairs.extend([uid, neg_pid, month, day] for neg_pid in negatives)
        targets.extend([0] for _ in negatives)
    return pairs, targets
def getUserPreference1(click_score, collect_score, buy_score):
    '''Rank each user's products by a segment-weighted behaviour score.

    Reads settings["TAR_DATA_FILE"] as CSV, skips the header row and parses
    every remaining row as integers ``uid, pid, action_type, month, day``.
    Each row adds ``factor[seg_num - 1] * <score of its action>`` to the
    running score of its (uid, pid) pair, where ``seg_num`` comes from
    ``calSegNum(month, day)``.

    Args:
        click_score: score added for a settings["ACTION_CLICK"] row.
        collect_score: score added for a settings["ACTION_COLLECT"] or
            settings["ACTION_SHOPPING_CHART"] row.
        buy_score: score added for a settings["ACTION_BUY"] row.

    Returns:
        defaultdict(list) mapping uid -> list of (pid, score) tuples sorted
        by score, highest first.
    '''
    # Close the input file deterministically instead of leaking the handle.
    with open(settings["TAR_DATA_FILE"]) as data_file:
        rows = list(csv.reader(data_file))
    data = [[int(value) for value in row] for row in rows[1:]]

    # Weight per segment, indexed by seg_num - 1; higher segment numbers
    # carry larger weights.
    factor = [0.1, 0.15, 0.25, 0.5]

    user_preference_score = defaultdict(dict)
    for uid, pid, action_type, month, day in data:
        weight = factor[calSegNum(month, day) - 1]
        scores = user_preference_score[uid]
        # Every seen (uid, pid) gets an entry, even for unknown action types.
        scores.setdefault(pid, 0)
        if action_type == settings["ACTION_BUY"]:
            scores[pid] += weight * buy_score
        elif (action_type == settings["ACTION_COLLECT"]
              or action_type == settings["ACTION_SHOPPING_CHART"]):
            scores[pid] += weight * collect_score
        elif action_type == settings["ACTION_CLICK"]:
            scores[pid] += weight * click_score

    user_sorted_result = defaultdict(list)
    for uid in user_preference_score:
        user_sorted_result[uid] = sorted(user_preference_score[uid].items(),
                                         key=lambda item: item[1],
                                         reverse=True)
    return user_sorted_result
def genTrainData(data, time_alpha, user_buy, user_click, pid_set, ratio,
                 month_num, src_month, src_day):
    '''Assemble purchase statistics plus buy and click training pairs.

    Args:
        data: iterable of ``uid, pid, action_type, month, day`` records; only
            action_type 1 rows are used.
        time_alpha, src_month, src_day: forwarded to ``getTimePenality``
            together with each row's month and day.
        user_buy: uid -> list of per-segment pid sets (index seg_num - 1),
            excluded when sampling negatives.
        user_click: uid -> {pid: value}; flattened into the click pairs.
        pid_set: set of candidate pids for negative sampling.
        ratio: number of negatives sampled per purchase row.
        month_num: number of per-segment dicts allocated for each buyer.

    Returns:
        (user_buy_num, train_buy_pair, train_click_pair) where
        user_buy_num[uid][seg_num - 1][pid] is ``[count, [penalties...]]``,
        train_buy_pair rows are ``[uid, pid, neg_pid, seg_num]`` (segment 1
        purchases produce none) and train_click_pair rows are
        ``[uid, pid, user_click[uid][pid]]``.
    '''
    user_buy_num = {}
    train_buy_pair = []
    for uid, pid, action_type, month, day in data:
        if action_type != 1:
            continue
        seg_num = calSegNum(month, day)
        seg_idx = seg_num - 1
        if uid not in user_buy_num:
            user_buy_num[uid] = [{} for _ in range(month_num)]
        bucket = user_buy_num[uid][seg_idx]
        penalty = getTimePenality(time_alpha, src_month, src_day, month, day)
        record = bucket.setdefault(pid, [0, []])
        record[0] += 1
        record[1].append(penalty)
        if seg_num == 1:
            continue
        for neg_pid in random.sample(pid_set - user_buy[uid][seg_idx], ratio):
            train_buy_pair.append([uid, pid, neg_pid, seg_num])

    train_click_pair = [[uid, pid, user_click[uid][pid]]
                        for uid in user_click
                        for pid in user_click[uid]]
    return user_buy_num, train_buy_pair, train_click_pair
def getUserMultipleBehavior(data, month_num):
    '''Gather per-user purchases by segment and log-scaled click counts.

    Only rows with action_type 0 or 1 are considered, and rows that
    ``calSegNum(month, day)`` places in segment 1 are skipped.

    Args:
        data: iterable of ``uid, pid, action_type, month, day`` records.
        month_num: number of per-segment sets allocated for each buyer.

    Returns:
        (user_buy, user_click) where user_buy maps uid -> list of
        ``month_num`` sets of pids (index seg_num - 1) built from
        action_type 1 rows, and user_click maps uid -> {pid: np.log(count)}
        built from action_type 0 rows.
    '''
    user_buy = {}
    user_click = {}
    for uid, pid, action_type, month, day in data:
        if action_type not in (0, 1):
            continue
        seg_num = calSegNum(month, day)
        if seg_num == 1:
            continue
        if action_type == 0:
            clicks = user_click.setdefault(uid, {})
            clicks[pid] = clicks.get(pid, 0) + 1
        else:
            if uid not in user_buy:
                user_buy[uid] = [set() for _ in range(month_num)]
            user_buy[uid][seg_num - 1].add(pid)

    # Replace the raw click counts with their natural logarithm.
    for clicks in user_click.values():
        for pid in clicks:
            clicks[pid] = np.log(clicks[pid])
    return user_buy, user_click
def getHistorySet1(one_user_history, src_seg_num):
    '''Return the product ids of one user's history that fall in a segment.

    Args:
        one_user_history: iterable of records whose first three fields are
            ``pid, month, day`` (any further fields are ignored).
        src_seg_num: segment number compared against
            ``calSegNum(month, day)`` of each record.

    Returns:
        set of pids whose record maps to ``src_seg_num``.
    '''
    history_set = set()
    for entry in one_user_history:
        pid, month, day = entry[:3]
        if calSegNum(month, day) == src_seg_num:
            # Collect the product id. The previous code added seg_num itself,
            # so the result could only ever be empty or {src_seg_num}.
            history_set.add(pid)
    return history_set
def getProductSellNum():
    '''Compute a per-product sales figure from settings["TAR_DATA_FILE"].

    The CSV header row is skipped and every remaining row is parsed as
    integers; fields 0-4 are read as ``uid, pid, action_type, month, day``.
    For action_type 1 rows the buyer is recorded in the set belonging to
    the row's segment (``calSegNum(month, day) - 1``).

    Returns:
        dict mapping pid -> (sum of distinct buyers over the three segment
        sets) floor-divided by 3, plus 1.
    '''
    # Close the input file deterministically instead of leaking the handle.
    with open(settings["TAR_DATA_FILE"]) as data_file:
        rows = list(csv.reader(data_file))

    product_selluser = {}
    for row in rows[1:]:
        entry = [int(value) for value in row]
        uid, pid, action_type, month, day = entry[0], entry[1], entry[2], entry[3], entry[4]
        if action_type == 1:
            seg_num = calSegNum(month, day)
            # NOTE(review): exactly three segments are allocated here; a
            # seg_num above 3 would raise IndexError - confirm against
            # calSegNum's range.
            buyers = product_selluser.setdefault(pid, [set(), set(), set()])
            buyers[seg_num - 1].add(uid)

    product_sellnum = {}
    for pid, buyers in product_selluser.items():
        total_num = sum(len(segment_buyers) for segment_buyers in buyers)
        # Explicit floor division keeps the integer result the original
        # Python 2 "/" produced.
        product_sellnum[pid] = total_num // 3 + 1
    return product_sellnum
def getProductSellNum():
    '''Compute a per-product sales figure from settings["TRAIN_DATA_FILE"].

    The CSV header row is skipped and every remaining row is parsed as
    integers; fields 0-4 are read as ``uid, pid, action_type, month, day``.
    For action_type 1 rows the buyer is recorded in the set belonging to
    the row's segment (``calSegNum(month, day) - 1``).

    Returns:
        dict mapping pid -> (sum of distinct buyers over the three segment
        sets) floor-divided by 3, plus 1.
    '''
    # Close the input file deterministically instead of leaking the handle.
    with open(settings["TRAIN_DATA_FILE"]) as data_file:
        rows = list(csv.reader(data_file))

    product_selluser = {}
    for row in rows[1:]:
        entry = [int(value) for value in row]
        uid, pid, action_type, month, day = entry[0], entry[1], entry[2], entry[3], entry[4]
        if action_type == 1:
            seg_num = calSegNum(month, day)
            # NOTE(review): exactly three segments are allocated here; a
            # seg_num above 3 would raise IndexError - confirm against
            # calSegNum's range.
            buyers = product_selluser.setdefault(pid, [set(), set(), set()])
            buyers[seg_num - 1].add(uid)

    product_sellnum = {}
    for pid, buyers in product_selluser.items():
        total_num = sum(len(segment_buyers) for segment_buyers in buyers)
        # Explicit floor division keeps the integer result the original
        # Python 2 "/" produced.
        product_sellnum[pid] = total_num // 3 + 1
    return product_sellnum
def genTrainData(data, time_alpha, user_buy, user_click, pid_set, ratio,
                 month_num, src_month, src_day):
    '''Assemble purchase statistics plus buy and click training pairs.

    Args:
        data: iterable of ``uid, pid, action_type, month, day`` records; only
            action_type 1 rows are used.
        time_alpha, src_month, src_day: forwarded to ``getTimePenality``
            together with each row's month and day.
        user_buy: uid -> list of per-segment pid sets (index seg_num - 1),
            excluded when sampling negatives.
        user_click: uid -> {pid: value}; flattened into the click pairs.
        pid_set: set of candidate pids for negative sampling.
        ratio: number of negatives sampled per purchase row.
        month_num: number of per-segment dicts allocated for each buyer.

    Returns:
        (user_buy_num, train_buy_pair, train_click_pair) where
        user_buy_num[uid][seg_num - 1][pid] is ``[count, [penalties...]]``,
        train_buy_pair rows are ``[uid, pid, neg_pid, seg_num]`` (segment 1
        purchases produce none) and train_click_pair rows are
        ``[uid, pid, user_click[uid][pid]]``.
    '''
    user_buy_num = {}
    train_buy_pair = []
    for uid, pid, action_type, month, day in data:
        if action_type != 1:
            continue
        seg_num = calSegNum(month, day)
        seg_idx = seg_num - 1
        if uid not in user_buy_num:
            user_buy_num[uid] = [{} for _ in range(month_num)]
        bucket = user_buy_num[uid][seg_idx]
        penalty = getTimePenality(time_alpha, src_month, src_day, month, day)
        record = bucket.setdefault(pid, [0, []])
        record[0] += 1
        record[1].append(penalty)
        if seg_num == 1:
            continue
        for neg_pid in random.sample(pid_set - user_buy[uid][seg_idx], ratio):
            train_buy_pair.append([uid, pid, neg_pid, seg_num])

    train_click_pair = [[uid, pid, user_click[uid][pid]]
                        for uid in user_click
                        for pid in user_click[uid]]
    return user_buy_num, train_buy_pair, train_click_pair
def genTrainPairForBuy(ratio, data):
    '''Build labelled purchase pairs with negatives the user never touched.

    For every action_type 1 row whose ``calSegNum(month, day)`` is above 1,
    one positive pair is emitted, followed by ``ratio`` negative pairs whose
    pids are drawn with ``random.sample`` from all known pids minus every
    pid that user appears with anywhere in ``data`` (any action type).

    Args:
        ratio: number of negatives sampled per positive.
        data: re-iterable sequence of records indexed as
            ``uid, pid, action_type, month, day``.

    Returns:
        (train_pairs, targets): parallel lists of ``[uid, pid, month, day]``
        and ``[1]`` / ``[0]`` labels.
    '''
    all_pids = set()
    touched_by_user = defaultdict(set)
    for row in data:
        all_pids.add(row[1])
        touched_by_user[row[0]].add(row[1])

    train_pairs = []
    targets = []
    for row in data:
        uid, pid, action_type, month, day = row[0], row[1], row[2], row[3], row[4]
        seg_num = calSegNum(month, day)
        if action_type != 1 or seg_num <= 1:
            continue
        train_pairs.append([uid, pid, month, day])
        targets.append([1])
        for neg_pid in random.sample(all_pids - touched_by_user[uid], ratio):
            train_pairs.append([uid, neg_pid, month, day])
            targets.append([0])
    return train_pairs, targets
def genTrainPairForBuy(ratio, data):
    '''Build labelled purchase pairs with negatives the user never touched.

    For every action_type 1 row whose ``calSegNum(month, day)`` is above 1,
    one positive pair is emitted, followed by ``ratio`` negative pairs whose
    pids are drawn with ``random.sample`` from all known pids minus every
    pid that user appears with anywhere in ``data`` (any action type).

    Args:
        ratio: number of negatives sampled per positive.
        data: re-iterable sequence of records indexed as
            ``uid, pid, action_type, month, day``.

    Returns:
        (train_pairs, targets): parallel lists of ``[uid, pid, month, day]``
        and ``[1]`` / ``[0]`` labels.
    '''
    all_pids = set()
    touched_by_user = defaultdict(set)
    for row in data:
        all_pids.add(row[1])
        touched_by_user[row[0]].add(row[1])

    train_pairs = []
    targets = []
    for row in data:
        uid, pid, action_type, month, day = row[0], row[1], row[2], row[3], row[4]
        seg_num = calSegNum(month, day)
        if action_type != 1 or seg_num <= 1:
            continue
        train_pairs.append([uid, pid, month, day])
        targets.append([1])
        for neg_pid in random.sample(all_pids - touched_by_user[uid], ratio):
            train_pairs.append([uid, neg_pid, month, day])
            targets.append([0])
    return train_pairs, targets
def _loadFactorFile(path):
    '''Read a CSV of ``id, f1, f2, ...`` rows into {int id: numpy float vector}.'''
    factors = {}
    with open(path) as factor_file:
        for entry in csv.reader(factor_file):
            factors[int(entry[0])] = np.array([float(v) for v in entry[1:]])
    return factors


def _loadActionRows(path):
    '''Read a CSV action log, skip its header row and parse every field as int.'''
    with open(path) as data_file:
        rows = list(csv.reader(data_file))
    return [[int(v) for v in row] for row in rows[1:]]


def _monthlyBuyStats(behavior):
    '''Summarise action_type 1 entries of a two-level behaviour mapping.

    ``behavior`` maps outer id -> inner id -> list of entries indexed as
    ``action_type, month, day``. For each outer id the result is
    ``[distinct inner ids in the last segment,
       average distinct inner ids per segment,
       entries in the last segment,
       average entries per segment]``
    over the TOTAL_MONTH segments given by ``calSegNum(month, day)``.
    '''
    stats = {}
    for outer_id in behavior:
        distinct = [set() for _ in range(TOTAL_MONTH)]
        totals = [0 for _ in range(TOTAL_MONTH)]
        for inner_id in behavior[outer_id]:
            for entry in behavior[outer_id][inner_id]:
                if entry[0] == 1:
                    seg_idx = calSegNum(entry[1], entry[2]) - 1
                    distinct[seg_idx].add(inner_id)
                    totals[seg_idx] += 1
        distinct_sum = sum(len(ids) for ids in distinct)
        stats[outer_id] = [
            len(distinct[TOTAL_MONTH - 1]),
            float(distinct_sum) / TOTAL_MONTH,
            totals[TOTAL_MONTH - 1],
            float(sum(totals)) / TOTAL_MONTH,
        ]
    return stats


def _markWindow(output_feature, tmp_cnt, window, action_type):
    '''Set the flag slot, bump the count slot and the counter of one window.'''
    base = 3 + 12 * window + action_type * 3
    output_feature[base] = 1
    output_feature[base + 1] += 1
    tmp_cnt[4 * window + action_type] += 1


def main():
    '''Generate the 59-column feature file settings["GBT_TEST_FILE"].

    Command-line flags (all int): ``-r`` ratio (parsed but not used here),
    ``-cf``, ``-up``, ``-u``, ``-p``; a feature group is filled only when
    its flag equals 1.

    Column layout of each written row:
        0-1    uid, pid
        2      dot product of the user and product factor vectors (``-cf``)
        3-50   user/product interaction slots (``-up``): four windows with
               bases 3, 15, 27, 39, three slots per action type
        51-54  user purchase statistics from the training file (``-u``)
        55-58  product purchase statistics from the training file (``-p``)
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', type=int, action='store',
                        dest='ratio', help='number of negative to positive')
    parser.add_argument('-cf', type=int, action='store',
                        dest='tCF', help='whether use collaborative feature')
    parser.add_argument('-up', type=int, action='store',
                        dest='tUP',
                        help='whether use user product interaction feature')
    parser.add_argument('-u', type=int, action='store',
                        dest='tU', help='whether use user feature')
    parser.add_argument('-p', type=int, action='store',
                        dest='tP', help='whether use product feature')

    # The usage hint is informational only: parsing continues, and flags that
    # were not supplied stay None so their feature groups are simply skipped.
    if len(sys.argv) != 11:
        print('Command e.g.: python makeFeature.py -r 5 -cf 1(0) -up 1(0) '
              '-u 1(0) -p 1(0)')
    para = parser.parse_args()

    user_factor = _loadFactorFile(settings["MODEL_USER_FILE"])
    product_factor = _loadFactorFile(settings["MODEL_PRODUCT_FILE"])

    # Purchase statistics are computed from the training log. The product
    # side previously assigned ("=") instead of accumulating ("+=") the
    # per-segment totals, so its average used only the last segment; both
    # sides now share one implementation.
    data = _loadActionRows(settings["TRAIN_DATA_FILE"])
    user_bought = _monthlyBuyStats(getUserAction(data))
    product_bought = _monthlyBuyStats(getProductAction(data))

    # The pairs to score and the interaction history come from the target log.
    data = _loadActionRows(settings["TAR_DATA_FILE"])
    test_pairs = genTestPairForBuy(data)
    user_behavior = getUserAction(data)

    # Reference dates that delimit the interaction windows.
    d_day = 14
    d_month = 7
    w_day = 8
    m_day = 14
    m_month = 6

    output_feature = [0 for i in range(59)]
    tmp_cnt = np.array([0 for i in range(16)])

    with open(settings["GBT_TEST_FILE"], "w") as out_file:
        writer = csv.writer(out_file, lineterminator="\n")
        print("Start generating features....")
        a = time.clock()
        for ii, pair in enumerate(test_pairs):
            uid = pair[0]
            pid = pair[1]
            output_feature[0] = uid
            output_feature[1] = pid

            if para.tCF == 1:
                if pid in product_factor and uid in user_factor:
                    score = np.dot(user_factor[uid], product_factor[pid])
                else:
                    score = 0.0
                output_feature[2] = score

            if para.tUP == 1:
                # NOTE(review): the slot arithmetic leaves room for action
                # types 0..3 per window - confirm against the data.
                for entry in user_behavior[uid][pid]:
                    action_type = entry[0]
                    src_month = entry[1]
                    src_day = entry[2]
                    if src_month == d_month:
                        if src_day == d_day:
                            _markWindow(output_feature, tmp_cnt, 0, action_type)
                            _markWindow(output_feature, tmp_cnt, 1, action_type)
                        elif w_day <= src_day:
                            _markWindow(output_feature, tmp_cnt, 1, action_type)
                            _markWindow(output_feature, tmp_cnt, 2, action_type)
                    elif src_month == m_month and src_day > m_day:
                        _markWindow(output_feature, tmp_cnt, 2, action_type)
                        _markWindow(output_feature, tmp_cnt, 3, action_type)
                # NOTE(review): slot 5 + i * 3 is never written before this
                # division, so the ratio is always 0 - confirm which slot was
                # meant to be normalised.
                for i in range(16):
                    if tmp_cnt[i] == 0:
                        output_feature[5 + i * 3] = 0
                    else:
                        output_feature[5 + i * 3] = \
                            float(output_feature[5 + i * 3]) / tmp_cnt[i]
                    tmp_cnt[i] = 0

            if para.tU == 1:
                output_feature[51:55] = user_bought.get(uid, [0, 0, 0, 0])

            if para.tP == 1:
                output_feature[55:59] = product_bought.get(pid, [0, 0, 0, 0])

            writer.writerow(output_feature)
            # NOTE(review): the first row is a list of ints while every later
            # row is a float array, so later uid/pid values are written as
            # floats - kept as in the original; confirm this is intended.
            output_feature = np.zeros(59)
            if ii % 10000 == 0:
                print("\r%d, cost time: %.1f seconds" % (ii, time.clock() - a))
                a = time.clock()
def _loadFactorFile(path):
    '''Read a CSV of ``id, f1, f2, ...`` rows into {int id: numpy float vector}.'''
    factors = {}
    with open(path) as factor_file:
        for entry in csv.reader(factor_file):
            factors[int(entry[0])] = np.array([float(v) for v in entry[1:]])
    return factors


def _loadActionRows(path):
    '''Read a CSV action log, skip its header row and parse every field as int.'''
    with open(path) as data_file:
        rows = list(csv.reader(data_file))
    return [[int(v) for v in row] for row in rows[1:]]


def _monthlyBuyStats(behavior):
    '''Summarise action_type 1 entries of a two-level behaviour mapping.

    ``behavior`` maps outer id -> inner id -> list of entries indexed as
    ``action_type, month, day``. For each outer id the result is
    ``[distinct inner ids in the last segment,
       average distinct inner ids per segment,
       entries in the last segment,
       average entries per segment]``
    over the TOTAL_MONTH segments given by ``calSegNum(month, day)``.
    '''
    stats = {}
    for outer_id in behavior:
        distinct = [set() for _ in range(TOTAL_MONTH)]
        totals = [0 for _ in range(TOTAL_MONTH)]
        for inner_id in behavior[outer_id]:
            for entry in behavior[outer_id][inner_id]:
                if entry[0] == 1:
                    seg_idx = calSegNum(entry[1], entry[2]) - 1
                    distinct[seg_idx].add(inner_id)
                    totals[seg_idx] += 1
        distinct_sum = sum(len(ids) for ids in distinct)
        stats[outer_id] = [
            len(distinct[TOTAL_MONTH - 1]),
            float(distinct_sum) / TOTAL_MONTH,
            totals[TOTAL_MONTH - 1],
            float(sum(totals)) / TOTAL_MONTH,
        ]
    return stats


def _markWindow(output_feature, tmp_cnt, window, action_type):
    '''Set the flag slot, bump the count slot and the counter of one window.'''
    base = 3 + 12 * window + action_type * 3
    output_feature[base] = 1
    output_feature[base + 1] += 1
    tmp_cnt[4 * window + action_type] += 1


def main():
    '''Generate the 59-column feature file settings["GBT_TEST_FILE"].

    Command-line flags (all int): ``-r`` ratio (parsed but not used here),
    ``-cf``, ``-up``, ``-u``, ``-p``; a feature group is filled only when
    its flag equals 1.

    Column layout of each written row:
        0-1    uid, pid
        2      dot product of the user and product factor vectors (``-cf``)
        3-50   user/product interaction slots (``-up``): four windows with
               bases 3, 15, 27, 39, three slots per action type
        51-54  user purchase statistics from the training file (``-u``)
        55-58  product purchase statistics from the training file (``-p``)
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', type=int, action='store',
                        dest='ratio', help='number of negative to positive')
    parser.add_argument('-cf', type=int, action='store',
                        dest='tCF', help='whether use collaborative feature')
    parser.add_argument('-up', type=int, action='store',
                        dest='tUP',
                        help='whether use user product interaction feature')
    parser.add_argument('-u', type=int, action='store',
                        dest='tU', help='whether use user feature')
    parser.add_argument('-p', type=int, action='store',
                        dest='tP', help='whether use product feature')

    # The usage hint is informational only: parsing continues, and flags that
    # were not supplied stay None so their feature groups are simply skipped.
    if len(sys.argv) != 11:
        print('Command e.g.: python makeFeature.py -r 5 -cf 1(0) -up 1(0) '
              '-u 1(0) -p 1(0)')
    para = parser.parse_args()

    user_factor = _loadFactorFile(settings["MODEL_USER_FILE"])
    product_factor = _loadFactorFile(settings["MODEL_PRODUCT_FILE"])

    # Purchase statistics are computed from the training log. The product
    # side previously assigned ("=") instead of accumulating ("+=") the
    # per-segment totals, so its average used only the last segment; both
    # sides now share one implementation.
    data = _loadActionRows(settings["TRAIN_DATA_FILE"])
    user_bought = _monthlyBuyStats(getUserAction(data))
    product_bought = _monthlyBuyStats(getProductAction(data))

    # The pairs to score and the interaction history come from the target log.
    data = _loadActionRows(settings["TAR_DATA_FILE"])
    test_pairs = genTestPairForBuy(data)
    user_behavior = getUserAction(data)

    # Reference dates that delimit the interaction windows.
    d_day = 14
    d_month = 7
    w_day = 8
    m_day = 14
    m_month = 6

    output_feature = [0 for i in range(59)]
    tmp_cnt = np.array([0 for i in range(16)])

    with open(settings["GBT_TEST_FILE"], "w") as out_file:
        writer = csv.writer(out_file, lineterminator="\n")
        print("Start generating features....")
        a = time.clock()
        for ii, pair in enumerate(test_pairs):
            uid = pair[0]
            pid = pair[1]
            output_feature[0] = uid
            output_feature[1] = pid

            if para.tCF == 1:
                if pid in product_factor and uid in user_factor:
                    score = np.dot(user_factor[uid], product_factor[pid])
                else:
                    score = 0.0
                output_feature[2] = score

            if para.tUP == 1:
                # NOTE(review): the slot arithmetic leaves room for action
                # types 0..3 per window - confirm against the data.
                for entry in user_behavior[uid][pid]:
                    action_type = entry[0]
                    src_month = entry[1]
                    src_day = entry[2]
                    if src_month == d_month:
                        if src_day == d_day:
                            _markWindow(output_feature, tmp_cnt, 0, action_type)
                            _markWindow(output_feature, tmp_cnt, 1, action_type)
                        elif w_day <= src_day:
                            _markWindow(output_feature, tmp_cnt, 1, action_type)
                            _markWindow(output_feature, tmp_cnt, 2, action_type)
                    elif src_month == m_month and src_day > m_day:
                        _markWindow(output_feature, tmp_cnt, 2, action_type)
                        _markWindow(output_feature, tmp_cnt, 3, action_type)
                # NOTE(review): slot 5 + i * 3 is never written before this
                # division, so the ratio is always 0 - confirm which slot was
                # meant to be normalised.
                for i in range(16):
                    if tmp_cnt[i] == 0:
                        output_feature[5 + i * 3] = 0
                    else:
                        output_feature[5 + i * 3] = \
                            float(output_feature[5 + i * 3]) / tmp_cnt[i]
                    tmp_cnt[i] = 0

            if para.tU == 1:
                output_feature[51:55] = user_bought.get(uid, [0, 0, 0, 0])

            if para.tP == 1:
                output_feature[55:59] = product_bought.get(pid, [0, 0, 0, 0])

            writer.writerow(output_feature)
            # NOTE(review): the first row is a list of ints while every later
            # row is a float array, so later uid/pid values are written as
            # floats - kept as in the original; confirm this is intended.
            output_feature = np.zeros(59)
            if ii % 10000 == 0:
                print("\r%d, cost time: %.1f seconds" % (ii, time.clock() - a))
                a = time.clock()