Beispiel #1
0
    def load_data(self):
        """Merge the provider's train and test samples and publish split sizes.

        ``self.data`` becomes the train samples followed by the test samples.
        One third (integer division) of the test samples is counted as
        validation and the remainder as test. The four counts are pushed
        into the global ``args`` via ``args.update``.
        """
        train_samples = self.dp.train_data
        held_out_samples = self.dp.test_data
        self.data = train_samples + held_out_samples

        train_count = len(train_samples)
        held_out_count = len(held_out_samples)
        # The held-out pool is split 1/3 validation, 2/3 test (by count only).
        vali_count = held_out_count // 3
        test_count = held_out_count - vali_count

        total_count = len(self.data)
        assert total_count > 0
        args.update(nb_data=total_count,
                    nb_train=train_count,
                    nb_vali=vali_count,
                    nb_test=test_count)
Beispiel #2
0
    def __init__(self, ds, adj_length, seq_length):
        """Read dataset ``ds``, build its graphs and their adjacency data.

        Args:
            ds: dataset identifier; stored on the instance and handed to
                ``DatasetReader``.
            adj_length: forwarded unchanged to every ``build_adj`` call.
            seq_length: forwarded unchanged to ``build_graph``.

        Neither ``adj_length`` nor ``seq_length`` is kept on the instance.
        """
        self.ds = ds

        # Node id 0 is taken by the '[MASK]' entry before any graph is built.
        self.vid2node = {'[MASK]': 0}

        self.DR = DatasetReader(ds)
        built = self.build_graph(seq_length)
        self.G_in, self.G_out, self.train_data, self.test_data = built

        # Dedicated, fixed-seed generators make both shuffles reproducible
        # without touching the global NumPy random state.
        np.random.RandomState(777).shuffle(self.train_data)
        np.random.RandomState(333).shuffle(self.test_data)

        args.update(nb_nodes=len(self.vid2node))
        for graph_idx in (0, 1):
            edge_count = self.G_in[graph_idx].nb_edges()
            args.update(**{f'nb_edges_{graph_idx}': edge_count})

        # Built in the order in_0, out_0, in_1, out_1.
        self.adj_in_0 = self.build_adj(self.G_in[0], adj_length)
        self.adj_out_0 = self.build_adj(self.G_out[0], adj_length)
        self.adj_in_1 = self.build_adj(self.G_in[1], adj_length)
        self.adj_out_1 = self.build_adj(self.G_out[1], adj_length)

        self.adjs_tmp = [
            self.adj_in_0,
            self.adj_out_0,
            self.adj_in_1,
            self.adj_out_1,
        ]
        # Only the first component of each build_adj result is kept here.
        self.adjs = [built_adj[0] for built_adj in self.adjs_tmp]
 def build_graph_item_item(self):
     """Count consecutive-item transitions and sample neighbours per item.

     For every adjacent pair ``(a, b)`` in each sequence of
     ``self.user2item_seq`` where both ids are >= 3, the forward count
     ``a -> b`` and the backward count ``b -> a`` are incremented. An edge
     is tallied the moment its forward count reaches
     ``args.gnn_min_edge_cnt``; the tally is published as ``args.nb_edges``.
     ``self.neighbors`` is then set to ``[forward, backward]``, each a
     per-item list of ``self.sample_neighbors`` results.
     """
     from tqdm import tqdm

     forward_counts = [defaultdict(int) for _ in range(args.nb_items)]
     backward_counts = [defaultdict(int) for _ in range(args.nb_items)]

     edge_total = 0
     for _user, sequence in tqdm(enumerate(self.user2item_seq),
                                 desc='build edges'):
         for src, dst in zip(sequence, sequence[1:]):
             # Pairs touching an id below 3 are ignored
             # (presumably reserved/special ids -- TODO confirm).
             if not (src >= 3 and dst >= 3):
                 continue
             forward_counts[src][dst] += 1
             backward_counts[dst][src] += 1
             # Count the edge exactly once: when it first hits the threshold.
             if forward_counts[src][dst] == args.gnn_min_edge_cnt:
                 edge_total += 1
     args.update(nb_edges=edge_total)

     forward_neighbors, backward_neighbors = [], []
     limit = args.gnn_adj_length
     for item in tqdm(range(args.nb_items), desc='sample neighbors'):
         sampled_forward = self.sample_neighbors(forward_counts[item], limit)
         sampled_backward = self.sample_neighbors(backward_counts[item], limit)
         forward_neighbors.append(sampled_forward)
         backward_neighbors.append(sampled_backward)
     self.neighbors = [forward_neighbors, backward_neighbors]
def main():
    """Smoke-run entry point: load CLI options, force ds='v3s2', call test()."""
    print('hello world, dataset.py')
    # Imported here rather than at module level, as in the original layout.
    import main as main_md

    cli_options = main_md.parse_args()
    args.update(**cli_options)
    args.update(ds='v3s2')
    test()
    def __init__(self):
        """Load all dataset files for ``args.ds`` and derive lookup tables.

        Reads the JSON/NumPy files under ``{data_home}/{args.ds}``, publishes
        the metadata and several derived values into the global ``args``,
        builds per-phase item masks, the validation/test lookup dictionaries,
        item/user popularity statistics, and the (optionally resampled)
        training-sample count ``args.nb_train``.

        Raises:
            KeyError: if a sequence contains item id 1 for a user that has no
                entry in ``vali_puiqa``.
            ValueError: if a resample mode of 'rare', 'phase' or 'day' is used
                while ``args.mode_pred_phase`` is not an integer string.
        """
        self.home = f'{data_home}/{args.ds}'
        metadata = self.load_json('metadata')
        # Publish every metadata key into the global args first.
        args.update(**metadata)

        # Hard-coded per-phase degree thresholds; they replace whatever
        # 'mid_deg_per_phase' the metadata file may have contained.
        tmp = [6, 11, 15, 18, 22, 24, 26, 29, 32, 33]
        if args.ds == 'v8':
            # Dataset 'v8' keeps only the first 7 thresholds and rebinds the
            # module-level NUM_PHASE accordingly (affects the whole module).
            tmp = tmp[:7]
            global NUM_PHASE
            NUM_PHASE = 7
        # tmp = [t + 2 for t in tmp]
        metadata['mid_deg_per_phase'] = tmp

        self.mid_deg_per_phase = metadata['mid_deg_per_phase']
        # args receives the thresholds as a string, not as a list.
        args.update(mid_deg_per_phase=str(self.mid_deg_per_phase))

        self.user2item_seq = self.load_json('user2item_seq')
        self.user2ts_seq = self.load_json('user2ts_seq')
        # Each record unpacks below as (phase, user, pos, ts, ans).
        self.vali_puiqa = self.load_json('vali_puiqa')
        self.test_puiqa = self.load_json('test_puiqa')
        # uids_list, vids_list
        self.raw_id_list = self.load_json('ids_list')

        # Item features are optional: on any load failure the error is
        # printed and an all-zero array of shape [nb_items, 2, 128] is used.
        try:
            self.item_feat = np.load(f'{self.home}/item_feat.npy')
        except Exception as e:
            print(e)
            self.item_feat = np.zeros([args.nb_items, 2, 128])

        # Indexed as [phase][item] further down in this method.
        self.item_deg_per_phase = self.load_json('item_deg')
        self.item_deg_per_phase = np.array(self.item_deg_per_phase, dtype=int)
        self.item_deg_self_per_phase = self.load_json('item_deg_self')
        self.item_deg_self_per_phase = np.array(self.item_deg_self_per_phase,
                                                dtype=int)

        # True where an item's degree in a phase is at or below that phase's
        # threshold; [:, None] broadcasts one threshold per phase row.
        self.item_is_half = self.item_deg_per_phase <= np.array(
            self.mid_deg_per_phase)[:, None]

        # 1 everywhere, then 0 wherever item_deg_self_per_phase is 0.
        # NOTE(review): the boolean index assumes item_deg_self_per_phase has
        # shape [NUM_PHASE, nb_items] -- confirm against the data files.
        self.score_mask_per_phase = np.ones([NUM_PHASE, args.nb_items],
                                            dtype=int)
        self.score_mask_per_phase[self.item_deg_self_per_phase == 0] = 0

        # user -> (phase, pos, ans); a later record for the same user
        # overwrites an earlier one.
        self.vali_user2ppa = {}
        for phase, user, pos, ts, ans in self.vali_puiqa:
            self.vali_user2ppa[user] = (phase, pos, ans)

        self.test_user2ppa = {}
        for phase, user, pos, ts, ans in self.test_puiqa:
            self.test_user2ppa[user] = (phase, pos, ans)

        self.train_users = list(range(args.nb_users))
        self.vali_users = sorted(self.vali_user2ppa.keys())
        self.test_users = sorted(self.test_user2ppa.keys())

        self.item_pop = [0] * args.nb_items
        self.user_pop = [0] * args.nb_users

        # for_phase stays -1 unless mode_pred_phase is a single character,
        # in which case it is parsed as the phase number.
        self.for_phase = -1
        if len(args.mode_pred_phase) == 1:
            self.for_phase = int(args.mode_pred_phase)

        args.update(for_phase=self.for_phase)

        # Count training samples and accumulate item/user popularity.
        nb_train = 0
        for user, item_list in enumerate(self.user2item_seq):
            for i, item in enumerate(item_list):
                if item == 1:
                    # Item id 1 triggers a lookup of the user's validation
                    # record (presumably it marks the validation slot --
                    # TODO confirm). Raises KeyError if the user has none.
                    vali_phase, vali_pos, vali_ans = self.vali_user2ppa[user]
                    # When enabled, a validation answer whose phase is not
                    # among the predicted phases is substituted for the
                    # placeholder and therefore counted as training data.
                    if args.use_unused_vali and args.mode_pred_phase != 'all' and str(
                            vali_phase) not in args.mode_pred_phase:
                        item = vali_ans

                # Only ids >= 3 are counted as real interactions.
                if item >= 3:
                    self.item_pop[item] += 1
                    self.user_pop[user] += 1
                    nb_train += 1
                    # Resampling: matching samples add alpha_resample extra
                    # units to the training count.
                    if args.mode_resample != 'none':
                        if args.mode_resample == 'user':
                            # Users that appear in validation or test.
                            if user in self.vali_user2ppa or user in self.test_user2ppa:
                                nb_train += args.alpha_resample
                        elif args.mode_resample == 'rare':
                            # assert len(args.mode_pred_phase) == 1
                            # assert phase == int(args.mode_pred_phase)
                            # NOTE(review): int() raises ValueError when
                            # mode_pred_phase is not an integer string
                            # (e.g. 'all'); same in the two modes below.
                            phase = int(args.mode_pred_phase)
                            item_deg = self.item_deg_per_phase[phase][item]
                            mid_deg = self.mid_deg_per_phase[phase]
                            # Items at or below the phase's degree threshold.
                            if item_deg <= mid_deg:
                                nb_train += args.alpha_resample
                        elif args.mode_resample == 'phase':
                            phase = int(args.mode_pred_phase)
                            q_ts = self.user2ts_seq[user][i]
                            # unit is presumably the timestamp width of one
                            # phase step -- TODO confirm.
                            unit = 54.5
                            # Window: [phase * unit, phase * unit + 4 * unit + 1)
                            min_ts = phase * unit
                            max_ts = min_ts + (unit * 4) + 1
                            if min_ts <= q_ts < max_ts:
                                nb_train += args.alpha_resample
                        elif args.mode_resample == 'day':
                            phase = int(args.mode_pred_phase)
                            q_ts = self.user2ts_seq[user][i]
                            unit = 54.5
                            # Window of width unit + 1 starting at
                            # 10.0 + (phase + 3) * unit.
                            min_ts = 10.0 + (phase + 3) * unit
                            max_ts = min_ts + unit + 1
                            if min_ts <= q_ts < max_ts:
                                nb_train += args.alpha_resample

        # Ids 0..2 are never counted in the loop above; give each a count of 1.
        for item in range(3):
            self.item_pop[item] += 1

        self.item_pop = np.array(self.item_pop, dtype=int)
        # Clamp unseen items to 1 so the reciprocals below are finite.
        self.item_pop[self.item_pop == 0] = 1
        assert np.min(self.item_pop) >= 1
        self.item_pop_log = np.log(self.item_pop + np.e)

        self.item_pop_inv = 1.0 / self.item_pop
        self.item_pop_log_inv = 1.0 / self.item_pop_log

        self.user_pop = np.array(self.user_pop, dtype=int)
        args.update(nb_train=nb_train)

        # Both branches below are no-ops in this version.
        if args.gnn:
            pass
        if args.get('run_data', False) and args.run_test:
            pass
Beispiel #6
0
def fuse(ans_list, weight_list, name):
    """Blend several per-user score dictionaries into one top-k list per user.

    Args:
        ans_list: sequence of mappings ``user -> {item: score}``. The users
            of the first mapping define the set of users processed.
        weight_list: one weight per entry of ``ans_list``; entries whose
            weight is below 1e-6 are skipped.
        name: ``'vali'`` selects ``data.dp.vali_user2ppa``; any other value
            selects ``data.dp.test_user2ppa``.

    Returns:
        dict mapping each user to its ``args.nb_topk`` best items, ordered by
        descending fused score with the item id as tie-breaker.

    The per-answer max score and score range are computed only while the
    module-level ``max_score_per_ans`` is ``None`` and are reused by every
    later call. Afterwards ``args.is_first_fuse`` and ``args.show_detail``
    are set to ``False``.
    """
    global max_score_per_ans, score_diff_per_ans

    users = sorted(ans_list[0].keys())
    user2tops = {}

    if max_score_per_ans is None:
        max_score_per_ans = []
        score_diff_per_ans = []
        for ans_idx, user_scores in enumerate(ans_list):
            highest, lowest = None, None
            for user in tqdm(users, desc='count min max', ncols=90,
                             ascii=True):
                for score in user_scores[user].values():
                    if highest is None:
                        highest = lowest = score
                    else:
                        highest = max(highest, score)
                        lowest = min(lowest, score)

            print(f'ans {ans_idx}, min ~ max: {lowest} ~ {highest}')

            score_range = highest - lowest
            max_score_per_ans.append(highest)
            score_diff_per_ans.append(score_range)

    user2ppa = (data.dp.vali_user2ppa
                if name == 'vali' else data.dp.test_user2ppa)

    # The progress bar is left on screen only for the first fuse call.
    keep_bar = args.get('is_first_fuse', True)
    for user in tqdm(users,
                     desc='fuse',
                     ncols=90,
                     ascii=True,
                     leave=keep_bar):
        phase, _pos, _truth = user2ppa[user]
        fused = defaultdict(float)
        for ans_idx, (user_scores, weight) in enumerate(
                zip(ans_list, weight_list)):
            if weight < 1e-6:
                continue

            for item, score in user_scores[user].items():
                pop_inv = data.dp.item_pop_inv[item]
                pop_log_inv = data.dp.item_pop_log_inv[item]
                # Scale by this answer's cached score range.
                scaled = score / score_diff_per_ans[ans_idx]
                if args.mode_pop == 'log':
                    scaled *= (1.0 + pop_log_inv)

                if (args.mode_rare == 'linear'
                        and data.dp.item_is_half[phase, item]
                        and alpha_rare[ans_idx] > 0):
                    scaled *= (alpha_rare[ans_idx] + pop_inv)

                fused[item] += weight * scaled

        ranked = sorted(fused.items(),
                        key=lambda pair: (-pair[1], pair[0]))
        user2tops[user] = [item for item, _ in ranked[:args.nb_topk]]

    args.update(is_first_fuse=False)
    args.update(show_detail=False)
    return user2tops
Beispiel #7
0
def main():
    """Load the parsed command-line options into ``args``, pick ds='v3', run."""
    parsed_options = mmmmain.parse_args()
    args.update(parsed_options)
    args.update(ds='v3')
    run()
Beispiel #8
0
def main(**main_args):
    """Configure ``args``, load the data, train the model and run the final test.

    Args:
        **main_args: settings applied to ``args`` first; parsed command-line
            values and the model's own defaults are then added through
            ``args.setdefault``.

    The run is logged to ``{utils.log_dir}/{run_name}.log``. A
    ``KeyboardInterrupt`` during training is re-raised only when
    ``T.has_train`` is falsy; otherwise the final test still runs.
    """
    begin_time = time.time()

    # init args
    args.update(**main_args)
    args.setdefault(**parse_args())

    args.update(run_on_yard=True)

    seed = args.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # get Model, set model default args
    Model = vars(models)[args.model]
    args.setdefault(**Model.args.vars())

    if args.run_test:
        args.update(epochs=2, nb_vali_step=2, max_data_line=100)

    print(args)

    # get data (the generators are re-seeded right before loading)
    random.seed(seed)
    np.random.seed(args.seed)
    data = dataset.Data()

    def train_passes_per_vali_cycle():
        # Reads args at call time, so it reflects an updated nb_vali_step.
        return args.nb_train / (args.batch_size * args.nb_vali_step)

    min_epochs = train_passes_per_vali_cycle()
    if min_epochs < 1.0:
        # Shrink the validation interval to the number of batches per epoch.
        batches_per_epoch = int(np.ceil(args.nb_train / args.batch_size))
        args.update(nb_vali_step=batches_per_epoch)
        print(args)
        min_epochs = train_passes_per_vali_cycle()
    args.update(min_epochs=int(np.ceil(min_epochs)))

    # run_name: time-Model-ds[-msg][-test]
    time_str = utils.get_time_str()
    model_name = Model.__name__
    run_name = f'{time_str}-{model_name}-{args.ds}'
    if args.msg:
        run_name += f'-{args.msg}'
    if args.run_test:
        run_name += '-test'

    args.update(run_name=run_name)
    T = Train.Train(Model, data)

    log_fn = f'{utils.log_dir}/{run_name}.log'
    begin_time_str = utils.get_time_str()
    pid = os.getpid()
    print(begin_time_str, log_fn, '----- start!, pid:', pid)
    args.update(pid=os.getpid())
    args.update(log=utils.Logger(fn=log_fn, verbose=args.verbose))
    args.log.log(f'argv: {" ".join(sys.argv)}')
    args.log.log(f'log_fn: {log_fn}')
    args.log.log(f'args: {args.prt_json()}')
    args.log.log(f'Model: {model_name}')
    args.log.log(f'begin time: {begin_time_str}')

    try:
        T.train()
    except KeyboardInterrupt as interrupt:
        # An interrupt before any training happened is passed on unchanged.
        if not T.has_train:
            raise interrupt
    test_str = T.final_test()

    args.log.log(f'\ntest: {test_str}\n', red=True)

    args.log.log(log_fn)
    elapsed = time.time() - begin_time
    end_time_str = utils.get_time_str()
    args.log.log(f'end time: {end_time_str}, dt: {elapsed / 3600:.2f}h')
    print(end_time_str, log_fn, f'##### over, time: {elapsed / 3600:.2f}h')
Beispiel #9
0
def main(**main_args):
    """Configure ``args``, then train, restore, evaluate and/or dump features.

    Args:
        **main_args: overrides applied after the command-line values and
            again after the model's own ``args``, so they take precedence
            over both.

    Returns:
        ``(test_str, time_str)`` on the normal path, where ``test_str`` is
        ``'none'`` when ``args.skip_vali`` is set. Returns ``None`` when
        ``args.dump_all`` or ``args.dump`` is set (after dumping).
    """
    begin_time = time.time()

    # init args
    args.clear()
    args.update(**parse_args())
    args.update(**main_args)

    if args.ds == 'test':
        args.update(run_test=True)

    seed = args.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # get Model, set model default args; main_args are re-applied on top
    Model = vars(models)[args.model]
    args.update(**Model.args)
    args.update(**main_args)

    if args.run_test:
        args.update(epochs=2, nb_vali_step=2, batch_size=4)

    # get data
    data = dataset.Data()

    def train_passes_per_vali_cycle():
        # Reads args at call time, so it reflects an updated nb_vali_step.
        return args.nb_train / (args.batch_size * args.nb_vali_step)

    min_epochs = train_passes_per_vali_cycle()
    if min_epochs < 1.0:
        # Shrink the validation interval to the number of batches per epoch.
        batches_per_epoch = int(np.ceil(args.nb_train / args.batch_size))
        args.update(nb_vali_step=batches_per_epoch)
        min_epochs = train_passes_per_vali_cycle()
    args.update(min_epochs=int(np.ceil(min_epochs)))

    # run_name: time-Model-ds[-msg][-test][-restored]
    model_name = Model.__name__
    time_str = utils.get_time_str()
    run_name = f'{time_str}-{model_name}-{args.ds}'
    if args.msg:
        run_name += f'-{args.msg}'
    if args.run_test:
        run_name += '-test'
    if args.restore_model:
        run_name += '-restored'

    args.update(run_name=run_name)

    log_fn = f'{utils.log_dir}/{run_name}.log'
    begin_time_str = utils.get_time_str()
    args.update(pid=os.getpid())
    args.update(log=utils.Logger(fn=log_fn, verbose=args.verbose))
    args.log.log(f'argv: {" ".join(sys.argv)}')
    args.log.log(f'log_fn: {log_fn}')
    args.log.log(f'main_args: {utils.Object(**main_args).json()}')
    args.log.log(f'args: {args.json()}')
    args.log.log(f'Model: {model_name}')
    args.log.log(f'begin time: {begin_time_str}')

    T = Train.Train(Model, data)
    if args.restore_model:
        T.model.restore_from_other(args.restore_model)
    if not args.restore_model or args.restore_train:
        try:
            T.train()
        except KeyboardInterrupt:
            # Interrupting training is tolerated; evaluation continues.
            pass
        T.model.restore(0)

    if not args.skip_vali:
        test_str = T.final_test()
        args.log.log(f'vali: {test_str}', red=True)
    else:
        test_str = 'none'

    # Dump modes finish here and return nothing.
    if args.dump_all:
        for split in ('vali', 'test'):
            T.dump_features_all_item(split)
        return

    if args.dump:
        for split in ('vali', 'test'):
            T.dump_features(split)
        return

    args.log.log(run_name, red=True)
    if args.restore_model:
        args.log.log(f'restored from {args.restore_model}', red=True)

    elapsed = time.time() - begin_time
    end_time_str = utils.get_time_str()
    args.log.log(f'end time: {end_time_str}, dt: {elapsed / 3600:.2f}h')
    print(end_time_str, log_fn, f'##### over, time: {elapsed / 3600:.2f}h')

    return test_str, time_str