Esempio n. 1
0
    def __init__(self, p=None):
        """
        Build the settings and load the data used by the model.

        When ``p`` is falsy (``None`` or empty) a default ``OrderedDict`` of
        settings is created and every (key, value) pair is printed. The valid
        and test modes run through exactly the same code here; they differ
        only in the 'mode' and 'split' values handed to ``load_data``.

        :param p: optional mapping of settings; it must at least provide the
                  'dataset', 'mode' and 'split' keys read below.
        :return: None
        """
        # 1. Settings. Everything meant to be tuned lives in p.
        if not p:
            t = 't'  # 't' selects test mode, 'v' selects valid mode
            assert t in ('t', 'v')  # no other case
            if t == 't':
                mode, split = 'test', [0.8, 1.0]
            else:
                mode, split = 'valid', [0.6, 0.8]
            p = OrderedDict((
                ('dataset', 'user_buys.txt'),
                ('mode', mode),
                ('split', split),
                ('at_nums', [5, 10, 15, 20]),
                ('epochs', 100),
                ('latent_size', 20),
                ('alpha', 0.01),
                # Original author's note: experiments showed no separate
                # lambda is needed for the weights.
                ('lambda', 0.001),
                ('window_x', 5),  # original note lists 2, 3, 4, 5
                ('window_h', 5),
                ('zoneout_cell', 0.5),
                ('zoneout_hidd', 0.05),
                # 0: one_by_one (the one to use), 1: mini_batch
                ('mini_batch', 0),
                # 0: gru, 1: zoneout, 2: hca_x, 3: hca_h, 4: hca
                ('hcagru', 4),
                ('batch_size_train', 4),  # original note: 4 works well
                # Original note: the user * item matrix is too large, so it
                # is computed in several batches.
                ('batch_size_test', 768),
            ))
            for pair in p.items():
                print(pair)

        # 2. Load the data.
        # Original author's note: the train/test entries have unequal
        # lengths, so they cannot be turned into a full (n, m) matrix and
        # `shared` would raise.
        data_file = os.path.join(PATH, p['dataset'])
        sizes, buys = load_data(data_file, p['mode'], p['split'])
        user_num, item_num = sizes
        train_seqs, test_seqs = buys

        # Positive samples with masks; item_num is passed as the tail value.
        train_pos, train_mask = fun_data_buys_masks(train_seqs,
                                                    tail=[item_num])
        test_pos, test_mask = fun_data_buys_masks(test_seqs,
                                                  tail=[item_num])

        # Negative samples with masks: the first is for training (original
        # note: works one-by-one and mini-batch), the second for prediction.
        train_neg = fun_random_neg_masks_tra(item_num, train_pos)
        test_neg = fun_random_neg_masks_tes(item_num, train_pos, test_pos)

        # 3. Publish everything as instance attributes.
        self.p = p
        self.user_num = user_num
        self.item_num = item_num
        self.tra_buys_masks = train_pos
        self.tra_masks = train_mask
        self.tra_buys_neg_masks = train_neg
        self.tes_buys_masks = test_pos
        self.tes_masks = test_mask
        self.tes_buys_neg_masks = test_neg
Esempio n. 2
0
    def __init__(self, p=None):
        """
        Build the settings and load the data used by the model.

        When ``p`` is falsy (``None`` or empty) a default ``OrderedDict`` of
        settings is created and every (key, value) pair is printed. The valid
        and test modes run through exactly the same code here; they differ
        only in the 'mode' and 'split' values handed to ``load_data``.

        :param p: optional mapping of settings; it must at least provide the
                  'dataset', 'mode', 'split' and 'set_len' keys read below.
        :return: None
        """
        # 1. Settings. Everything meant to be tuned lives in p.
        if not p:
            t = 't'  # 't' selects test mode, 'v' selects valid mode
            assert t in ('t', 'v')  # no other case
            if t == 't':
                mode, split = 'test', [0.8, 1.0]
            else:
                mode, split = 'valid', [0.6, 0.8]
            p = OrderedDict((
                ('dataset', 'user_buys.txt'),
                ('mode', mode),
                ('split', split),
                ('at_nums', [5, 10, 15, 20]),
                ('epochs', 100),
                ('latent_size', 20),
                ('alpha', 0.01),
                ('lambda', 0.001),
                # Original note: how many items are modelled per time step.
                ('set_len', 2),
                # Original note: number of ResNet layers.
                ('layer', 5),
                ('mini_batch', 1),  # 0: one_by_one, 1: mini_batch
                ('marank', 0),  # 0: MARank
                # Original note said "try 16 first"; the value here is 4.
                ('batch_size_train', 4),
                # Original note: the user * item matrix is too large, so it
                # is computed in several batches.
                ('batch_size_test', 768),
            ))
            for pair in p.items():
                print(pair)

        # 2. Load the data.
        # Original author's note: the train/test entries have unequal
        # lengths, so they cannot be turned into a full (n, m) matrix and
        # `shared` would raise.
        data_file = os.path.join(PATH, p['dataset'])
        sizes, buys = load_data(data_file, p['mode'], p['split'])
        user_num, item_num = sizes
        train_seqs, test_seqs = buys

        # Positive samples with masks; item_num is passed as the tail value.
        train_pos, train_mask = fun_data_buys_masks(train_seqs,
                                                    tail=[item_num])
        test_pos, test_mask = fun_data_buys_masks(test_seqs,
                                                  tail=[item_num])

        # Negative samples with masks: the first is for training (original
        # note: works one-by-one and mini-batch), the second for prediction.
        train_neg = fun_random_neg_masks_tra(item_num, train_pos)
        test_neg = fun_random_neg_masks_tes(item_num, train_pos, test_pos)

        # Original note: the training sequence is turned into sets so that
        # each user handles several items per time step; kept in mask form
        # because predict needs it.
        train_sets = fun_tra_set(train_pos, tail=[item_num],
                                 set_len=p['set_len'])

        # 3. Publish everything as instance attributes.
        self.p = p
        self.user_num = user_num
        self.item_num = item_num
        self.tra_buys_masks = train_pos
        self.tra_masks = train_mask
        self.tra_buys_neg_masks = train_neg
        self.tes_buys_masks = test_pos
        self.tes_masks = test_mask
        self.tes_buys_neg_masks = test_neg
        self.tra_set_masks = train_sets
Esempio n. 3
0
    def __init__(self, p=None):
        """
        Build the settings, load the check-in data and derive distance data.

        When ``p`` is falsy (``None`` or empty) a default ``OrderedDict`` of
        settings is created and every (key, value) pair is printed.

        :param p: optional mapping of settings; it must at least provide the
                  'dataset', 'mode', 'split', 'dd' and 'UD' keys read below.
        :return: None
        """
        # 1. Settings.
        if not p:
            t = 't'  # 't' -> 'test', 'v' -> 'valid', 's' -> 's'
            assert t in ('t', 'v', 's')
            if t == 't':
                mode = 'test'
            elif t == 'v':
                mode = 'valid'
            else:
                mode = 's'
            if t == 'v':
                split = -2
            else:
                split = -1
            p = OrderedDict((
                ('dataset', 'Foursquare.txt'),  # alternative: 'Gowalla.txt'
                ('mode', mode),
                ('load_epoch', 0),
                ('save_per_epoch', 100),
                ('split', split),
                ('at_nums', [5, 10, 15, 20]),
                ('epochs', 101),
                ('latent_size', 20),
                ('alpha', 0.01),
                ('lambda', 0.001),
                ('loss_weight', [0.5, 0.5]),
                ('dd', 200),
                ('UD', 40),
                ('mini_batch', 0),
                ('gru', 0),
                ('batch_size_train', 1),
                ('batch_size_test', 32),
            ))
            for pair in p.items():
                print(pair)
        # Number of distance intervals; with the defaults above this is
        # 40 * 1000 / 200 = 200.
        dist_num = int(p['UD'] * 1000 / p['dd'])

        # 2. Load the data and build the masked positive/negative samples.
        data_file = os.path.join(PATH, p['dataset'])
        sizes, pois_cordis, buys, dists = load_data(
            data_file, p['mode'], p['split'], p['dd'], dist_num)
        user_num, item_num = sizes
        train_seqs, test_seqs = buys
        train_dist, test_dist = dists

        train_pos, train_dist_pos, train_mask = fun_data_buys_masks(
            train_seqs, train_dist, [item_num], [dist_num])
        test_pos, test_dist_pos, test_mask = fun_data_buys_masks(
            test_seqs, test_dist, [item_num], [dist_num])
        train_neg = fun_random_neg_masks_tra(item_num, train_pos)
        test_neg = fun_random_neg_masks_tes(item_num, train_pos, test_pos)
        train_dist_neg = fun_compute_dist_neg(
            train_pos, train_mask, train_neg, pois_cordis, p['dd'], dist_num)
        last_poi_intervals = fun_compute_distance(
            train_pos, train_mask, pois_cordis, p['dd'], dist_num)

        # 3. Publish everything as instance attributes.
        self.p = p
        self.user_num = user_num
        self.item_num = item_num
        self.dist_num = dist_num
        self.pois_cordis = pois_cordis
        self.tra_buys_masks = train_pos
        self.tra_masks = train_mask
        self.tra_buys_neg_masks = train_neg
        self.tes_buys_masks = test_pos
        self.tes_masks = test_mask
        self.tes_buys_neg_masks = test_neg
        self.tra_dist_masks = train_dist_pos
        self.tes_dist_masks = test_dist_pos
        self.tra_dist_neg_masks = train_dist_neg
        self.ulptai = last_poi_intervals
Esempio n. 4
0
    def __init__(self, p=None):
        """
        Build the settings, load the check-in data and derive distance data.

        When ``p`` is falsy (``None`` or empty) a default ``OrderedDict`` of
        settings is created and every (key, value) pair is printed. The
        different modes run through exactly the same code here; they differ
        only in the 'mode' and 'split' values handed to ``load_data``.

        :param p: optional mapping of settings; it must at least provide the
                  'dataset', 'mode', 'split', 'dd' and 'UD' keys read below.
        :return: None
        """
        # 1. Settings. Everything meant to be tuned lives in p.
        if not p:
            t = 't'  # 't' -> 'test', 'v' -> 'valid', 's' -> 's'
            assert t in ('t', 'v', 's')  # no other case
            if t == 't':
                mode = 'test'
            elif t == 'v':
                mode = 'valid'
            else:
                mode = 's'
            if t == 'v':
                split = -2
            else:
                split = -1  # original note: test predicts the last one
            p = OrderedDict((
                ('dataset', 'Foursquare.txt'),  # alternative: 'Gowalla.txt'
                ('mode', mode),
                ('load_epoch', 0),
                ('save_per_epoch', 100),
                ('split', split),
                ('at_nums', [5, 10, 15, 20]),
                ('epochs', 101),
                ('latent_size', 20),
                ('alpha', 0.01),
                ('lambda', 0.001),
                ('loss_weight', [0.5, 0.5]),
                # Divisor used for dist_num below (original note mentioned
                # 25m; unit not verifiable here, TODO confirm).
                ('dd', 200),
                # Original note: cutoff distance (38km was mentioned) and the
                # dimension of lambda_s; TODO confirm against the value 40.
                ('UD', 40),
                # 0: one_by_one, 1: mini_batch (original note: always use
                # one-by-one).
                ('mini_batch', 0),
                ('gru', 3),  # 0: bpr, 1: gru, 2: spatial-gru, 3: ca-rnn
                ('batch_size_train', 1),
                # Original note: the user * item matrix is too large, so it
                # is computed in several batches.
                ('batch_size_test', 32),
            ))
            for pair in p.items():
                print(pair)
        # Number of distance intervals; with the defaults above this is
        # 40 * 1000 / 200 = 200.
        dist_num = int(p['UD'] * 1000 / p['dd'])

        # 2. Load the data.
        # Original author's note: the train/test entries have unequal
        # lengths, so they cannot be turned into a full (n, m) matrix and
        # `shared` would raise.
        data_file = os.path.join(PATH, p['dataset'])
        sizes, pois_cordis, buys, dists = load_data(
            data_file, p['mode'], p['split'], p['dd'], dist_num)
        user_num, item_num = sizes
        train_seqs, test_seqs = buys
        train_dist, test_dist = dists

        # Positive samples with masks; item_num is passed as the tail value.
        train_pos, train_mask = fun_data_buys_masks(train_seqs,
                                                    tail=[item_num])
        test_pos, test_mask = fun_data_buys_masks(test_seqs,
                                                  tail=[item_num])
        # Same helper for the distance sequences, with dist_num as the tail
        # value; the second return value is discarded.
        train_dist_pos, _ = fun_data_buys_masks(train_dist, tail=[dist_num])
        test_dist_pos, _ = fun_data_buys_masks(test_dist, tail=[dist_num])

        # Negative samples with masks: the first is for training (original
        # note: works one-by-one and mini-batch), the second for prediction.
        train_neg = fun_random_neg_masks_tra(item_num, train_pos)
        test_neg = fun_random_neg_masks_tes(item_num, train_pos, test_pos)

        # Original note: distance interval between each negative sample and
        # the previous positive sample, with masks.
        train_dist_neg = fun_compute_dist_neg(
            train_pos, train_mask, train_neg, pois_cordis, p['dd'], dist_num)

        # Original note: which interval the distance from each user's last
        # poi to every poi falls into.
        last_poi_intervals = fun_compute_distance(
            train_pos, train_mask, pois_cordis, p['dd'], dist_num)

        # 3. Publish everything as instance attributes.
        self.p = p
        self.user_num = user_num
        self.item_num = item_num
        self.dist_num = dist_num
        self.pois_cordis = pois_cordis
        self.tra_buys_masks = train_pos
        self.tra_masks = train_mask
        self.tra_buys_neg_masks = train_neg
        self.tes_buys_masks = test_pos
        self.tes_masks = test_mask
        self.tes_buys_neg_masks = test_neg
        self.tra_dist_masks = train_dist_pos
        self.tes_dist_masks = test_dist_pos
        self.tra_dist_neg_masks = train_dist_neg
        self.ulptai = last_poi_intervals
Esempio n. 5
0
    def __init__(self, p=None):
        """
        Build the settings, load the POI data and prepare negative samples.

        When ``p`` is falsy (``None`` or empty) a default ``OrderedDict`` of
        settings is created and every (key, value) pair is printed. The valid
        and test modes run through exactly the same code here; they differ
        only in the 'mode' and 'split' values handed to ``load_data``.

        Besides loading, this prints the minimum neighbor count, the sum of
        neighbor counts divided by ``item_num``, and the elapsed seconds of
        the neighbor computation.

        :param p: optional mapping of settings; it must at least provide the
                  'dataset', 'mode', 'split' and 'UD' keys read below.
        :return: None
        """
        # 1. Settings. Everything meant to be tuned lives in p.
        if not p:
            t = 't'  # 't' selects test mode, 'v' selects valid mode
            assert t in ('t', 'v')  # no other case
            if t == 't':
                mode, split = 'test', -1  # original note: test predicts the last one
            else:
                mode, split = 'valid', -2
            # Original note: cutoff distance in km.
            if 'Foursquare' in PATH:
                cutoff = 20
            else:
                cutoff = 50
            p = OrderedDict((
                ('dataset', 'sub_users5_items5.txt'),
                ('mode', mode),
                ('split', split),
                ('at_nums', [5, 10, 15, 20]),
                ('epochs', 100),
                ('latent_size', 20),
                ('alpha', 0.01),
                ('lambda', 0.001),
                ('UD', cutoff),
                # 0: one_by_one (original note: always use one-by-one).
                ('mini_batch', 0),
                ('batch_size_train', 1),
                # Original note: the user * item matrix is too large, so it
                # is computed in several batches.
                ('batch_size_test', 128),
            ))
            for pair in p.items():
                print(pair)

        # 2. Load the data.
        # Original author's note: the train/test entries have unequal
        # lengths, so they cannot be turned into a full (n, m) matrix and
        # `shared` would raise.
        data_file = os.path.join(PATH, p['dataset'])
        sizes, pois_cordis, pois, tra_last_poi = load_data(
            data_file, p['mode'], p['split'])
        user_num, item_num = sizes
        train_pois, test_pois = pois

        # Test side: add masks, then draw the negatives used at prediction.
        test_pos, test_mask = fun_data_buys_masks(test_pois, [item_num])
        test_neg = fun_random_neg_masks_tes(item_num, train_pois, test_pos)

        # Train side. Original notes: the number of negatives differs per
        # position, so no masks are added; the negatives come from a dict
        # holding, for each poi, the pois within the cutoff distance.
        started = time.time()
        all_pois_neighbors = fun_acquire_neighbors_for_each_poi(pois_cordis,
                                                                p['UD'])
        neighbor_counts = [len(group) for group in all_pois_neighbors.values()]
        # Author's measurements: at 5/10km min 5/170, mean 1640/3183;
        # at 20/50km min 0/0, mean 2712/5168.
        print(min(neighbor_counts), sum(neighbor_counts) / item_num)
        elapsed = time.time() - started
        print(elapsed)  # author's measurements: 30s/30s and 560s/560s

        # Original notes: negatives are indexed from the dict. From a at time
        # t-1 to i (positive at time t) and j (negative at time t); j lies
        # within a distance range of i and j is never i.
        train_negs = fun_acquire_negs_tra(train_pois, all_pois_neighbors,
                                          item_num)

        # 3. Publish everything as instance attributes.
        self.p = p
        self.user_num = user_num
        self.item_num = item_num
        self.pois_cordis = pois_cordis
        self.tra_pois = train_pois
        self.tra_pois_negs = train_negs
        self.tra_last_poi = tra_last_poi
        self.tes_pois_masks = test_pos
        self.tes_masks = test_mask
        self.tes_pois_neg_masks = test_neg