def __init__(self, p=None):
    """
    Build the hyper-parameter dictionary, load the data set and cache
    everything on the instance.

    Per the original author's notes: in 'valid' mode the first 80% of the
    data is split 6:2 into train/valid for hyper-parameter selection and the
    remaining 20% is ignored; in 'test' mode the first 80% is the training
    set and the rest is the test set, reusing the settings chosen during
    validation. Both modes run the same code and differ only in the data
    that is fed in (the split itself happens inside ``load_data``).

    :param p: optional mapping of parameters; when falsy, the defaults
        defined below are used.
    :return: None
    """
    global PATH

    # 1. Parameters. Everything tunable lives in ``p``.
    if not p:
        v = 1  # 1 selects 'valid', 0 selects 'test'
        assert v in (0, 1)  # no other case
        mode = 'valid' if 1 == v else 'test'
        epochs = 30 if 'taobao' in PATH else 50
        defaults = [
            ('dataset', 'user_buys.txt'),
            ('fea_image', 'normalized_features_image/'),
            ('fea_text', 'normalized_features_text/'),
            ('mode', mode),
            ('split', 0.8),  # valid: 6/2/2, test: 8/2
            ('at_nums', [10, 20, 30, 50]),
            # Original note: step of 2 purchases, 10 buckets; used for the
            # per-interval AUC / Recall@30 figures.
            ('intervals', [2, 10, 30]),
            ('epochs', epochs),
            ('fea_random_zero', 0.0),  # alternatives noted: 0.2 / 0.4
            ('latent_size', [20, 1024, 100]),
            ('alpha', 0.1),
            # Open question from the author: should self.lt and
            # self.ux/wh/bi use different lambdas?
            ('lambda', 0.0),
            # For the image/text dimensionality-reduction matrices.
            ('lambda_ev', 0.0),
            ('lambda_ae', None),  # for the reconstruction error
            ('mini_batch', None),  # 0: one_by_one, 1: mini_batch
            ('mvgru', 0),  # 0: bpr, 1: vbpr
            # Performance degrades badly with a larger training batch.
            ('batch_size_train', 1),
            # The user*item matrix is too large and is computed in several
            # passes; 768 was measured as the fastest on the author's machine.
            ('batch_size_test', 768),
        ]
        p = OrderedDict(defaults)
    for pair in p.items():
        print(pair)
    assert p['mode'] in ('valid', 'test')

    # 2. Load the data.
    # Sequences have unequal lengths, so they cannot be turned into a dense
    # (n, m) matrix directly (the author notes that ``shared`` would fail).
    data_path = os.path.join(PATH, p['dataset'])
    loaded = load_data(data_path, p['mode'], p['split'], p['intervals'])
    (user_num, item_num), aliases_dict, test_stats, (tra_buys, tes_buys) = loaded
    test_i_cou, test_i_intervals_cumsum, test_i_cold_active = test_stats

    # Positive samples, padded with ``item_num`` and accompanied by masks.
    # The training pair is used to build user representations at prediction
    # time; the test pair is used for prediction itself.
    tra_buys_masks, tra_masks = fun_data_buys_masks(tra_buys, tail=[item_num])
    tes_buys_masks, tes_masks = fun_data_buys_masks(tes_buys, tail=[item_num])

    # Negative samples. Training negatives work for both one-by-one and
    # mini-batch training; test negatives are used at prediction time.
    tra_buys_neg_masks = fun_random_neg_tra(item_num, tra_buys_masks)
    tes_buys_neg_masks = fun_random_neg_tes(item_num, tra_buys_masks, tes_buys_masks)

    # 3. Store everything on the instance.
    self.p = p
    self.user_num = user_num
    self.item_num = item_num
    self.aliases_dict = aliases_dict
    self.tic = test_i_cou
    self.tiic = test_i_intervals_cumsum
    self.tica = test_i_cold_active
    self.tra_buys = tra_buys
    self.tes_buys = tes_buys
    self.tra_buys_masks = tra_buys_masks
    self.tra_masks = tra_masks
    self.tra_buys_neg_masks = tra_buys_neg_masks
    self.tes_buys_masks = tes_buys_masks
    self.tes_masks = tes_masks
    self.tes_buys_neg_masks = tes_buys_neg_masks
def train_valid_or_test(p=None):
    """
    Evaluate the two non-learned baselines, Pop and Random, on the data set.

    Per the original author's notes: in 'valid' mode the first 80% of the
    data is split 6:2 into train/valid and the remaining 20% is ignored; in
    'test' mode the first 80% is the training set and the rest is the test
    set. Both modes run the same code and differ only in the data fed in
    (the split itself happens inside ``load_data``).

    Pop recommends the same ``p['at_nums'][-1]`` most-purchased training
    items to every user; Random recommends that many randomly drawn items to
    each user. Results are reported through ``GlobalBest.fun_print_best``.

    :param p: optional mapping of parameters; when falsy, the defaults
        defined below are used.
    :return: None
    """
    # 1. Parameters. Everything tunable lives in ``p``.
    if not p:
        v = 1  # 1 selects 'valid', 0 selects 'test'
        assert v in (0, 1)  # no other case
        p = OrderedDict([
            ('dataset', 'user_buys.txt'),
            ('fea_image', 'normalized_features_image/'),
            ('fea_text', 'normalized_features_text/'),
            ('mode', 'valid' if 1 == v else 'test'),
            ('split', 0.8),  # valid: 6/2/2, test: 8/2
            ('at_nums', [10, 20, 30, 50]),  # alternatives noted: 5, 15
            # Original note: step of 2 purchases, 10 buckets; used for the
            # per-interval AUC / Recall@30 figures.
            ('intervals', [2, 10, 30]),
            # Performance degrades badly with a larger training batch.
            ('batch_size_train', 4),
            # The user*item matrix is too large and is computed in several
            # passes; 768 was measured as the fastest on the author's machine.
            ('batch_size_test', 768),
        ])
    for e in p.items():
        print(e)
    assert p['mode'] in ('valid', 'test')

    # 2. Load the data.
    # Sequences have unequal lengths, so they cannot be turned into a dense
    # (n, m) matrix directly (the author notes that ``shared`` would fail).
    [(user_num, item_num), _,
     (test_i_cou, test_i_intervals_cumsum, test_i_cold_active),
     (tra_buys, tes_buys)] = load_data(
        os.path.join(PATH, p['dataset']), p['mode'], p['split'], p['intervals'])

    # Positive samples, padded with ``item_num`` and accompanied by masks.
    tra_buys_masks, _ = fun_data_buys_masks(tra_buys, tail=[item_num])
    tes_buys_masks, tes_masks = fun_data_buys_masks(tes_buys, tail=[item_num])
    # Only test-time negatives are needed: these baselines are not trained.
    tes_buys_neg_masks = fun_random_neg_tes(item_num, tra_buys_masks, tes_buys_masks)

    # ----------------------------------------------------------------------
    # Items ordered from most to least purchased in the training data; items
    # with the same purchase count are placed in random order.
    train_i_cou = Counter(item for buy in tra_buys for item in buy)
    items_by_count = defaultdict(list)  # {purchase count: [item, ...]}
    for item, count in train_i_cou.items():
        items_by_count[count].append(item)
    sequence = []
    for count in sorted(items_by_count, reverse=True):
        tied = items_by_count[count]
        sequence.extend(random.sample(tied, len(tied)))

    def fun_judge_tes_and_neg(tes, mark, tes_neg):
        """Return a 0/1 list as long as ``mark``.

        A position is 1 when it is unmasked and the purchased item was bought
        strictly more often in training than its negative sample. Items that
        never appear in training count as 0 purchases, so an unseen purchased
        item never wins and a seen one always beats an unseen negative.
        """
        return [
            1 if flag != 0 and train_i_cou.get(i, 0) > train_i_cou.get(j, 0) else 0
            for i, j, flag in zip(tes, tes_neg, mark)
        ]

    # ----------------------------------------------------------------------
    print("\tPop ...")
    # Row-wise evaluation with a plain comprehension: np.array(zip(...)) is a
    # 0-d object array on Python 3, and the ragged-array trick the original
    # relied on is rejected by recent NumPy releases.
    all_upqs = np.array([
        fun_judge_tes_and_neg(tes, mark, tes_neg)
        for tes, mark, tes_neg in zip(tes_buys_masks, tes_masks, tes_buys_neg_masks)
    ])
    # Every user gets the same top-``at_nums[-1]`` most popular items.
    recom = sequence[:p['at_nums'][-1]]
    all_ranks = np.array([recom for _ in np.arange(user_num)])

    # Holds the best figures; compute the metrics and print them.
    best = GlobalBest(at_nums=p['at_nums'], intervals=p['intervals'])
    fun_predict_pop_random(
        p, best, all_upqs, all_ranks,
        tes_buys_masks, tes_masks,
        test_i_cou, test_i_intervals_cumsum, test_i_cold_active)
    best.fun_print_best(epoch=0)  # only the current best result is printed

    # ----------------------------------------------------------------------
    print("\tRandom ...")
    # Random's AUC is taken to be 0.5 (as stated in the literature, per the
    # original note), so no per-pair flags are computed.
    all_upqs = None
    # Shuffle the whole item sequence once, then draw ``at_nums[-1]`` items
    # at random for each user.
    seq_random = sample(sequence, len(sequence))
    all_ranks = np.array(
        [sample(seq_random, p['at_nums'][-1]) for _ in np.arange(user_num)])

    # Holds the best figures; compute the metrics and print them.
    best = GlobalBest(at_nums=p['at_nums'], intervals=p['intervals'])
    fun_predict_pop_random(
        p, best, all_upqs, all_ranks,
        tes_buys_masks, tes_masks,
        test_i_cou, test_i_intervals_cumsum, test_i_cold_active)
    best.fun_print_best(epoch=0)  # only the current best result is printed