Code example #1
    def __init__(self, p=None):
        """
        构建模型参数,加载数据
            把前80%分为6:2用作train和valid,来选择超参数, 不用去管剩下的20%.
            把前80%作为train,剩下的是test,把valid时学到的参数拿过来跑程序.
            valid和test部分,程序是一样的,区别在于送入的数据而已。
        :param p: 一个标示符,没啥用
        :return:
        """
        global PATH
        # 1. Set up the parameters. Everything tunable lives in p here; the other functions are hard-coded.
        if not p:
            v = 1  # 1 means valid, 0 means test
            assert 0 == v or 1 == v  # no other case
            p = OrderedDict([
                ('dataset', 'user_buys.txt'),
                ('fea_image', 'normalized_features_image/'),
                ('fea_text', 'normalized_features_text/'),
                ('mode', 'valid' if 1 == v else 'test'),
                ('split', 0.8),  # valid: 6/2/2. test: 8/2.
                ('at_nums', [10, 20, 30, 50]),
                ('intervals', [2, 10, 30]),  # bucket by interaction count with a step of 2 into 10 intervals; used for auc/recall@30 (could switch to 10).
                ('epochs', 30 if 'taobao' in PATH else 50),
                ('fea_random_zero', 0.0),  # 0.2 / 0.4
                ('latent_size', [20, 1024, 100]),
                ('alpha', 0.1),
                ('lambda', 0.0),  # should self.lt and self.ux/wh/bi use different lambdas?
                ('lambda_ev', 0.0),  # for the image/text dimensionality-reduction matrices.
                ('lambda_ae', None),  # for the reconstruction error.
                ('mini_batch', None),  # 0: one_by_one, 1: mini_batch
                ('mvgru', 0),  # 0: bpr, 1: vbpr
                ('batch_size_train', 1),  # performance drops badly once the size gets larger
                ('batch_size_test', 768),  # the user*item matrix is too large to score at once; 768 was fastest on a5 in practice.
            ])
            for i in p.items():
                print(i)
            assert 'valid' == p['mode'] or 'test' == p['mode']

        # 2. Load the data
        # Each sequence in train/test has a different length, so the data cannot be turned into a full (n, m) matrix, and shared() would raise an error.
        [(user_num, item_num), aliases_dict,
         (test_i_cou, test_i_intervals_cumsum, test_i_cold_active),
         (tra_buys, tes_buys)] = \
            load_data(os.path.join(PATH, p['dataset']),
                      p['mode'], p['split'], p['intervals'])
        # Add masks to the positive samples
        tra_buys_masks, tra_masks = fun_data_buys_masks(tra_buys, tail=[item_num])  # used at prediction time to compute user representations
        tes_buys_masks, tes_masks = fun_data_buys_masks(tes_buys, tail=[item_num])  # used at prediction time
        # Add masks to the negative samples
        tra_buys_neg_masks = fun_random_neg_tra(item_num, tra_buys_masks)  # used at training time (one-by-one or mini-batch)
        tes_buys_neg_masks = fun_random_neg_tes(item_num, tra_buys_masks, tes_buys_masks)  # used at prediction time

        # 3. Create instance attributes
        self.p = p
        self.user_num, self.item_num = user_num, item_num
        self.aliases_dict = aliases_dict
        self.tic, self.tiic, self.tica = test_i_cou, test_i_intervals_cumsum, test_i_cold_active
        self.tra_buys, self.tes_buys = tra_buys, tes_buys
        self.tra_buys_masks, self.tra_masks, self.tra_buys_neg_masks = tra_buys_masks, tra_masks, tra_buys_neg_masks
        self.tes_buys_masks, self.tes_masks, self.tes_buys_neg_masks = tes_buys_masks, tes_masks, tes_buys_neg_masks
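
The masking helpers used above are not included in this excerpt. Below is a minimal sketch of what fun_data_buys_masks could look like, assuming it pads every purchase sequence to a common length with the tail id (item_num) and returns a parallel 0/1 mask; the real implementation may differ.

def fun_data_buys_masks(all_buys, tail):
    # Sketch (assumption): pad each user's purchase sequence to the length of
    # the longest sequence with the padding id in `tail` (here [item_num]),
    # and build a 0/1 mask marking the real items. The padded data then has a
    # regular (n_user, max_len) shape and can be stored as a shared variable.
    lens = [len(buys) for buys in all_buys]
    max_len = max(lens)
    buys_masks = [buys + tail * (max_len - le) for buys, le in zip(all_buys, lens)]
    masks = [[1] * le + [0] * (max_len - le) for le in lens]
    return buys_masks, masks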
Code example #2
def train_valid_or_test():
    """
    Main routine.
    :return:
    """
    # Build the parameters, data, model, and best-value trackers
    pas = Params()
    p = pas.p
    model, model_name, size_total = pas.build_model_mini_batch(flag=p['mvgru'])
    best = GlobalBest(at_nums=p['at_nums'], intervals=p['intervals'])   # stores the best results
    batch_idxs_tra, starts_ends_tra = pas.compute_start_end(flag='train')
    _, starts_ends_tes = pas.compute_start_end(flag='test')
    _, starts_ends_auc = pas.compute_start_end(flag='auc')

    # Pull a few variables out so the 'pas.' prefix is not needed below.
    user_num, item_num = pas.user_num, pas.item_num
    tra_buys_masks, tes_buys_masks = pas.tra_buys_masks, pas.tes_buys_masks
    tes_masks = pas.tes_masks
    test_i_cou, test_i_intervals_cumsum, test_i_cold_active = pas.tic, pas.tiic, pas.tica
    del pas

    # Main loop
    losses = []
    times0, times1, times2 = [], [], []
    epochs = p['epochs']  # 90 for taobao, 150 for amazon   # the sub-networks are trained separately here, so 3x the epochs are needed
    print('mvgru =', p['mvgru'], 'epochs =', epochs)
    for epoch in np.arange(epochs):
        print("Epoch {val} ==================================".format(val=epoch))
        # Each epoch, resample the negative items and reshuffle the data, so samples are drawn in random order when computing gradients and the results are more reliable.
        if epoch > 0:       # the negatives for epoch 0 were generated before the loop and already used to initialize the class
            tra_buys_neg_masks = fun_random_neg_tra(item_num, tra_buys_masks)
            tes_buys_neg_masks = fun_random_neg_tes(item_num, tra_buys_masks, tes_buys_masks)
            model.update_neg_masks(tra_buys_neg_masks, tes_buys_neg_masks)

        # --------------------------------------------------------------------------------------------------------------
        print("\tTraining ...")
        t0 = time.time()
        loss = 0.
        random.seed(str(123 + epoch))
        random.shuffle(batch_idxs_tra)      # shuffle the batch_idx order every epoch
        for bidx in batch_idxs_tra:
            start_end = starts_ends_tra[bidx]
            random.shuffle(start_end)       # shuffle the indexes inside the batch
            loss += model.train(idxs=start_end, epoch_n=epochs, epoch_i=epoch)
        rnn_l2_sqr = model.l2.eval()            # model.l2 is a TensorVariable, so its value cannot be printed directly
        print('\t\tsum_loss = {val} = {v1} + {v2}'.format(val=loss + rnn_l2_sqr, v1=loss, v2=rnn_l2_sqr))
        losses.append('%0.2f' % (loss + rnn_l2_sqr))
        t1 = time.time()
        times0.append(t1 - t0)

        # --------------------------------------------------------------------------------------------------------------

        print("\tPredicting ...")
        # Compute the representations of all users and items
        model.pgru_update_trained_items(epoch_n=epochs, epoch_i=epoch)  # run this first to refresh the item features.
        all_hus = np.zeros((1, size_total))          # initial shape = (1, 20/40)
        for start_end in starts_ends_tes:
            sub_all_hus = model.predict(start_end)
            all_hus = np.concatenate((all_hus, sub_all_hus))
        all_hus = np.delete(all_hus, 0, axis=0)         # drop the all-zero first row  # shape = (n_user, n_hidden)
        model.update_trained_users(all_hus)
        t2 = time.time()
        times1.append(t2 - t1)

        # Compute the evaluation metrics and print the current best values.
        fun_predict_auc_recall_map_ndcg(
            p, model, best, epoch, starts_ends_auc, starts_ends_tes,
            tes_buys_masks, tes_masks,
            test_i_cou, test_i_intervals_cumsum, test_i_cold_active)
        best.fun_print_best(epoch)   # only the current best results are printed each time
        t3 = time.time()
        times2.append(t3-t2)
        print('\taverage time (train, user, evaluate): %0.2fs,' % np.average(times0),
              '%0.2fs,' % np.average(times1),
              '%0.2fs,' % np.average(times2),
              datetime.datetime.now().strftime("%Y.%m.%d %H:%M:%S"),
              '| model: %s' % model_name,
              '| lam: %s' % str(p['lambda']))

        # --------------------------------------------------------------------------------------------------------------
        # Save the best values at the last epoch (29/49).
        if epoch == epochs - 1:     # p['epochs']
            print("\tBest saving ...")
            path = os.path.join(os.path.split(__file__)[0], '..', 'Results_best_values', PATH.split('/')[-2])
            best.fun_save_best(
                path, model_name, epoch, [p['batch_size_train'], p['batch_size_test']],
                [p['alpha'], p['lambda'], p['lambda_ev'], p['lambda_ae'], p['fea_random_zero']])

        # --------------------------------------------------------------------------------------------------------------
        # Save all the loss values.
        if epoch == epochs - 1:     # p['epochs']
            print("\tLoss saving ...")
            path = os.path.join(os.path.split(__file__)[0], '..', 'Results_alpha_0.1_loss', PATH.split('/')[-2])
            fun_save_all_losses(
                path, model_name, epoch, losses,
                [p['alpha'], p['lambda'], p['lambda_ev'], p['lambda_ae'], p['fea_random_zero']])

    for i in p.items():
        print(i)
    print('\t the current Class name is: {val}'.format(val=model_name))
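
compute_start_end is called above but not shown in this excerpt. The sketch below illustrates the batching it is assumed to perform, cutting the user indexes into consecutive slices of the configured batch size (batch_size_train for 'train', batch_size_test for 'test'/'auc'); the actual signature and return values may differ.

import numpy as np

def compute_start_end(user_num, batch_size):
    # Sketch (assumption): cut the user indexes 0..user_num-1 into consecutive
    # chunks of `batch_size`; the caller shuffles the chunk order (and the
    # indexes inside each chunk) every epoch before feeding them to model.train.
    starts = np.arange(0, user_num, batch_size)
    starts_ends = [np.arange(s, min(s + batch_size, user_num)) for s in starts]
    batch_idxs = list(np.arange(len(starts_ends)))
    return batch_idxs, starts_ends

With batch_size_test = 768, starts_ends_tes would then hold ceil(user_num / 768) index slices, matching the prediction loop over starts_ends_tes above.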
Code example #3
def train_valid_or_test():
    """
    Main routine.
    :return:
    """
    # Build the parameters, data, model, and best-value trackers
    pas = Params()
    p = pas.p
    model, model_name, size_total = pas.build_model_one_by_one(flag=p['mvgru'])
    best = GlobalBest(at_nums=p['at_nums'], intervals=p['intervals'])  # stores the best results
    _, starts_ends_tes = pas.compute_start_end(flag='test')
    _, starts_ends_auc = pas.compute_start_end(flag='auc')

    # Pull a few variables out so the 'pas.' prefix is not needed below.
    user_num, item_num = pas.user_num, pas.item_num
    tra_buys_masks, tra_masks = np.asarray(pas.tra_buys_masks), np.asarray(
        pas.tra_masks)
    tes_buys_masks, tes_masks = np.asarray(pas.tes_buys_masks), np.asarray(
        pas.tes_masks)
    tra_buys_neg_masks = np.asarray(pas.tra_buys_neg_masks)
    test_i_cou, test_i_intervals_cumsum, test_i_cold_active = pas.tic, pas.tiic, pas.tica
    del pas

    # Main loop
    losses = []
    times0, times1, times2 = [], [], []
    for epoch in np.arange(p['epochs']):
        print(
            "Epoch {val} ==================================".format(val=epoch))
        # Each epoch, resample the negative items and reshuffle the data, so samples are drawn in random order when computing gradients and the results are more reliable.
        if epoch > 0:  # the negatives for epoch 0 were generated before the loop and already used to initialize the class
            tra_buys_neg_masks = np.asarray(
                fun_random_neg_tra(item_num, tra_buys_masks))
            tes_buys_neg_masks = np.asarray(
                fun_random_neg_tes(item_num, tra_buys_masks, tes_buys_masks))
            model.update_neg_masks(tra_buys_neg_masks, tes_buys_neg_masks)

        # --------------------------------------------------------------------------------------------------------------
        print("\tTraining ...")
        t0 = time.time()
        loss = 0.
        random.seed(str(123 + epoch))
        user_idxs_tra = np.arange(user_num, dtype=np.int32)
        random.shuffle(user_idxs_tra)  # shuffle the user_id order every epoch
        for uidx in user_idxs_tra:
            tra = tra_buys_masks[uidx]
            neg = tra_buys_neg_masks[uidx]
            for i in np.arange(sum(tra_masks[uidx])):
                loss += model.train(uidx, [tra[i], neg[i]])
        rnn_l2_sqr = model.l2.eval()  # model.l2 is a TensorVariable, so its value cannot be printed directly
        print('\t\tsum_loss = {val} = {v1} + {v2}'.format(val=loss +
                                                          rnn_l2_sqr,
                                                          v1=loss,
                                                          v2=rnn_l2_sqr))
        losses.append('%0.2f' % (loss + rnn_l2_sqr))
        t1 = time.time()
        times0.append(t1 - t0)

        # --------------------------------------------------------------------------------------------------------------
        print("\tPredicting ...")
        # Compute the representations of all users and items
        model.update_trained_items()  # run this first to refresh the item features; for MV-GRU it also computes the fused image/text features here.
        model.update_trained_users()
        t2 = time.time()
        times1.append(t2 - t1)

        # Compute the evaluation metrics and print the current best values.
        fun_predict_auc_recall_map_ndcg(p, model, best, epoch, starts_ends_auc,
                                        starts_ends_tes, tes_buys_masks,
                                        tes_masks, test_i_cou,
                                        test_i_intervals_cumsum,
                                        test_i_cold_active)
        best.fun_print_best(epoch)  # only the current best results are printed each time
        t3 = time.time()
        times2.append(t3 - t2)
        print(
            '\taverage time (train, user, evaluate): %0.2fs,' %
            np.average(times0), '%0.2fs,' % np.average(times1),
            '%0.2fs,' % np.average(times2),
            datetime.datetime.now().strftime("%Y.%m.%d %H:%M:%S"),
            '| model: %s' % model_name, '| lam: %s' % ', '.join([
                str(lam)
                for lam in [p['lambda'], p['lambda_ev'], p['lambda_ae']]
            ]))

        # --------------------------------------------------------------------------------------------------------------
        # Save the best values at the last epoch (29/49).
        if epoch == p['epochs'] - 1:
            print("\tBest saving ...")
            path = os.path.join(
                os.path.split(__file__)[0], '..', 'Results_best_values',
                PATH.split('/')[-2])
            best.fun_save_best(path, model_name, epoch,
                               [p['batch_size_train'], p['batch_size_test']], [
                                   p['alpha'], p['lambda'], p['lambda_ev'],
                                   p['lambda_ae'], p['fea_random_zero']
                               ])

        # --------------------------------------------------------------------------------------------------------------
        # Save all the loss values.
        if epoch == p['epochs'] - 1:
            print("\tLoss saving ...")
            path = os.path.join(
                os.path.split(__file__)[0], '..', 'Results_alpha_0.1_loss',
                PATH.split('/')[-2])
            fun_save_all_losses(path, model_name, epoch, losses, [
                p['alpha'], p['lambda'], p['lambda_ev'], p['lambda_ae'],
                p['fea_random_zero']
            ])

    for i in p.items():
        print(i)
    print('\t the current Class name is: {val}'.format(val=model_name))
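
Every epoch re-draws the negatives through fun_random_neg_tra / fun_random_neg_tes, which are also not part of this excerpt. Below is a sketch of the kind of sampling assumed here, pairing each position of the (padded) training sequence with one random item the user never bought; the real functions may additionally respect the masks or exclude the test items.

import random

def fun_random_neg_tra(item_num, tra_buys_masks):
    # Sketch (assumption): for each position in every user's padded training
    # sequence, sample an item id in [0, item_num) that the user did not buy,
    # so each positive item gets a fresh negative sample per epoch.
    neg_masks = []
    for buys in tra_buys_masks:
        bought = set(buys)
        negs = []
        for _ in buys:
            j = random.randint(0, item_num - 1)
            while j in bought:
                j = random.randint(0, item_num - 1)
            negs.append(j)
        neg_masks.append(negs)
    return neg_masks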
Code example #4
def train_valid_or_test():
    """
    Main routine.
    :return:
    """
    global PATH
    # Build the parameters, data, model, and best-value trackers
    pas = Params()
    p = pas.p
    model, model_name, size_total = pas.build_model_mini_batch(flag=p['mvgru'])
    best_denoise = GlobalBest(at_nums=p['at_nums'], intervals=p['intervals'])  # stores the best results
    best_missing = GlobalBest(at_nums=p['at_nums'], intervals=p['intervals'])
    batch_idxs_tra, starts_ends_tra = pas.compute_start_end(flag='train')
    _, starts_ends_tes = pas.compute_start_end(flag='test')
    _, starts_ends_auc = pas.compute_start_end(flag='auc')

    # Pull a few variables out so the 'pas.' prefix is not needed below.
    user_num, item_num = pas.user_num, pas.item_num
    tra_buys_masks, tes_buys_masks = pas.tra_buys_masks, pas.tes_buys_masks
    tes_masks = pas.tes_masks
    test_i_cou, test_i_intervals_cumsum, test_i_cold_active = pas.tic, pas.tiic, pas.tica
    del pas

    # Main loop
    losses = []
    times0, times1, times2, times3 = [], [], [], []
    for epoch in np.arange(p['epochs']):
        print(
            "Epoch {val} ==================================".format(val=epoch))
        # Each epoch, resample the negative items and reshuffle the data, so samples are drawn in random order when computing gradients and the results are more reliable.
        if epoch > 0:  # the negatives for epoch 0 were generated before the loop and already used to initialize the class
            tra_buys_neg_masks = fun_random_neg_tra(item_num, tra_buys_masks)
            tes_buys_neg_masks = fun_random_neg_tes(item_num, tra_buys_masks,
                                                    tes_buys_masks)
            model.update_neg_masks(tra_buys_neg_masks, tes_buys_neg_masks)

        # --------------------------------------------------------------------------------------------------------------
        print("\tTraining ...")
        t0 = time.time()
        loss = 0.
        random.seed(str(123 + epoch))
        random.shuffle(batch_idxs_tra)  # shuffle the batch_idx order every epoch
        for bidx in batch_idxs_tra:
            start_end = starts_ends_tra[bidx]
            random.shuffle(start_end)  # shuffle the indexes inside the batch
            loss += model.train(start_end)
        rnn_l2_sqr = model.l2.eval()  # model.l2 is a TensorVariable, so its value cannot be printed directly
        print('\t\tsum_loss = {val} = {v1} + {v2}'.format(val=loss +
                                                          rnn_l2_sqr,
                                                          v1=loss,
                                                          v2=rnn_l2_sqr))
        losses.append('%0.2f' % (loss + rnn_l2_sqr))
        t1 = time.time()
        times0.append(t1 - t0)

        # --------------------------------------------------------------------------------------------------------------
        print("\tPredicting ...")
        # Compute the representations of all users and items
        model.update_trained_items()  # run this first to refresh the item features; for MV-GRU it also computes the fused image/text features here.
        all_hus = np.zeros((1, size_total))  # initial shape = (1, 20/40)
        for start_end in starts_ends_tes:
            sub_all_hus = model.predict(start_end)
            all_hus = np.concatenate((all_hus, sub_all_hus))
        all_hus = np.delete(all_hus, 0, axis=0)  # drop the all-zero first row  # shape = (n_user, n_hidden)
        model.update_trained_users(all_hus)
        t2 = time.time()
        times1.append(t2 - t1)

        # Denoise mode: evaluate on the complete test data.
        fun_predict_auc_recall_map_ndcg(p, model, best_denoise, epoch,
                                        starts_ends_auc, starts_ends_tes,
                                        tes_buys_masks, tes_masks, test_i_cou,
                                        test_i_intervals_cumsum,
                                        test_i_cold_active)
        best_denoise.fun_print_best(epoch)  # only the current best results are printed each time
        t3 = time.time()
        times2.append(t3 - t2)
        print(
            '\tdenoise: avg. time (train, user, test): %0.0fs,' %
            np.average(times0), '%0.0fs,' % np.average(times1),
            '%0.0fs |' % np.average(times2),
            datetime.datetime.now().strftime("%Y.%m.%d %H:%M"),
            '| model: %s' % model_name, '| lam: %s' % ', '.join([
                str(lam)
                for lam in [p['lambda'], p['lambda_ev'], p['lambda_ae']]
            ]), '| train_fea_zero: %0.1f' % p['train_fea_zero'])

        if 'MvGru' in model_name:
            # Missing mode: evaluate on corrupted (missing-feature) test data.
            model.update_trained_items2_corrupted_test_data()  # note: the test data in the missing setting is corrupted.
            fun_predict_auc_recall_map_ndcg(p, model, best_missing, epoch,
                                            starts_ends_auc, starts_ends_tes,
                                            tes_buys_masks, tes_masks,
                                            test_i_cou,
                                            test_i_intervals_cumsum,
                                            test_i_cold_active)
            best_missing.fun_print_best(epoch)  # only the current best results are printed each time
            t4 = time.time()
            times3.append(t4 - t3)
            print(
                '\tmissing: avg. time (train, user, test): %0.0fs,' %
                np.average(times0), '%0.0fs,' % np.average(times1),
                '%0.0fs |' % np.average(times3),
                datetime.datetime.now().strftime("%Y.%m.%d %H:%M"),
                '| model: %s' % model_name, '| lam: %s' % ', '.join([
                    str(lam)
                    for lam in [p['lambda'], p['lambda_ev'], p['lambda_ae']]
                ]), '| train_fea_zero: %0.1f' % p['train_fea_zero'])

        # --------------------------------------------------------------------------------------------------------------
        # Save the best values at the last epoch (29/49).
        if epoch == p['epochs'] - 1:
            print(
                "\t-----------------------------------------------------------------"
            )
            print("\tBest saving ...")
            path = os.path.join(
                os.path.split(__file__)[0], '..', 'Results_best_values',
                PATH.split('/')[-2])
            best_denoise.fun_save_best(
                path, model_name + ' - denoise', epoch,
                [p['batch_size_train'], p['batch_size_test']], [
                    p['alpha'], p['lambda'], p['lambda_ev'], p['lambda_ae'],
                    p['train_fea_zero']
                ])
            if 'MvGru' in model_name:
                best_missing.fun_save_best(
                    path, model_name + ' - missing', epoch,
                    [p['batch_size_train'], p['batch_size_test']], [
                        p['alpha'], p['lambda'], p['lambda_ev'],
                        p['lambda_ae'], p['train_fea_zero']
                    ])

        # --------------------------------------------------------------------------------------------------------------
        # Save all the loss values.
        if epoch == p['epochs'] - 1:
            print("\tLoss saving ...")
            path = os.path.join(
                os.path.split(__file__)[0], '..', 'Results_alpha_0.1_loss',
                PATH.split('/')[-2])
            fun_save_all_losses(path, model_name, epoch, losses, [
                p['alpha'], p['lambda'], p['lambda_ev'], p['lambda_ae'],
                p['train_fea_zero']
            ])

    for i in p.items():
        print(i)
    print('\t the current Class name is: {val}'.format(val=model_name))
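
The missing-mode evaluation relies on model.update_trained_items2_corrupted_test_data, whose internals are not shown here. The hypothetical helper below only illustrates the underlying idea, zeroing out a fraction (controlled by fea_random_zero / train_fea_zero) of the item feature rows to simulate items with a missing image or text modality; it is an illustration, not the method actually used above.

import numpy as np

def fun_zero_out_features(features, zero_ratio, seed=123):
    # Hypothetical sketch: randomly zero `zero_ratio` of the item feature rows
    # (image or text), simulating items whose modality is missing at test time.
    rng = np.random.RandomState(seed)
    corrupted = np.array(features, copy=True)
    n_items = corrupted.shape[0]
    zero_idx = rng.choice(n_items, size=int(zero_ratio * n_items), replace=False)
    corrupted[zero_idx] = 0.0
    return corrupted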