def load_data(self):
    """Concatenate the reader's train/test sequences into self.data and
    publish the split sizes (data/train/vali/test) into the global args.

    Validation takes a third of the non-train portion; test gets the rest.
    """
    train_part = self.dp.train_data
    held_out = self.dp.test_data
    self.data = train_part + held_out
    total = len(self.data)
    assert total > 0
    vali_count = len(held_out) // 3
    args.update(
        nb_data=total,
        nb_train=len(train_part),
        nb_vali=vali_count,
        nb_test=len(held_out) - vali_count,
    )
def __init__(self, ds, adj_length, seq_length):
    """Build the session graphs for dataset `ds`, shuffle the train/test
    splits deterministically, and precompute in/out adjacency structures.

    Node id 0 is reserved for the '[MASK]' token.
    """
    self.ds = ds
    self.vid2node = {'[MASK]': 0}
    self.DR = DatasetReader(ds)
    (self.G_in, self.G_out,
     self.train_data, self.test_data) = self.build_graph(seq_length)
    # Fixed seeds keep the shuffles reproducible across runs.
    np.random.RandomState(777).shuffle(self.train_data)
    np.random.RandomState(333).shuffle(self.test_data)
    args.update(nb_nodes=len(self.vid2node))
    args.update(nb_edges_0=self.G_in[0].nb_edges())
    args.update(nb_edges_1=self.G_in[1].nb_edges())
    # Adjacency tables for both graph levels, incoming and outgoing edges.
    self.adj_in_0 = self.build_adj(self.G_in[0], adj_length)
    self.adj_out_0 = self.build_adj(self.G_out[0], adj_length)
    self.adj_in_1 = self.build_adj(self.G_in[1], adj_length)
    self.adj_out_1 = self.build_adj(self.G_out[1], adj_length)
    self.adjs_tmp = [
        self.adj_in_0, self.adj_out_0, self.adj_in_1, self.adj_out_1
    ]
    # build_adj appears to return a tuple; keep only its first component here.
    self.adjs = [adj[0] for adj in self.adjs_tmp]
def build_graph_item_item(self):
    """Count directed co-occurrence edges between consecutive items in every
    user sequence, then sample a fixed-size neighbor list per item.

    An (a, b) pair is counted only once toward nb_edges — the first time its
    forward count reaches args.gnn_min_edge_cnt.
    """
    from tqdm import tqdm
    fwd = [defaultdict(int) for _ in range(args.nb_items)]
    bwd = [defaultdict(int) for _ in range(args.nb_items)]
    nb_edges = 0
    for _, seq in tqdm(enumerate(self.user2item_seq), desc='build edges'):
        for prev, cur in zip(seq, seq[1:]):
            # ids 0-2 appear to be reserved/special tokens — skipped here
            if prev >= 3 and cur >= 3:
                fwd[prev][cur] += 1
                bwd[cur][prev] += 1
                if fwd[prev][cur] == args.gnn_min_edge_cnt:
                    nb_edges += 1
    args.update(nb_edges=nb_edges)
    neighbors = [[], []]
    maxn = args.gnn_adj_length
    for item in tqdm(range(args.nb_items), desc='sample neighbors'):
        neighbors[0].append(self.sample_neighbors(fwd[item], maxn))
        neighbors[1].append(self.sample_neighbors(bwd[item], maxn))
    self.neighbors = neighbors
def main():
    """Standalone entry point for dataset.py: parse shared CLI args,
    force the 'v3s2' dataset, and run the module's test()."""
    print('hello world, dataset.py')
    import main as main_md
    cli_args = main_md.parse_args()
    args.update(**cli_args)
    args.update(ds='v3s2')
    test()
def __init__(self):
    """Load all preprocessed dataset artifacts and derive training statistics.

    Reads metadata, user sequences, vali/test query tuples, item features and
    per-phase degree tables from `{data_home}/{args.ds}`, then counts item/user
    popularity and the (possibly resampled) number of training examples,
    publishing the results into the global `args`.
    """
    self.home = f'{data_home}/{args.ds}'
    metadata = self.load_json('metadata')
    args.update(**metadata)
    # Per-phase median-degree thresholds (hand-tuned constants).
    tmp = [6, 11, 15, 18, 22, 24, 26, 29, 32, 33]
    if args.ds == 'v8':
        # v8 only has 7 phases; truncate the thresholds accordingly.
        tmp = tmp[:7]
        global NUM_PHASE
        NUM_PHASE = 7
        # tmp = [t + 2 for t in tmp]
    metadata['mid_deg_per_phase'] = tmp
    self.mid_deg_per_phase = metadata['mid_deg_per_phase']
    args.update(mid_deg_per_phase=str(self.mid_deg_per_phase))
    self.user2item_seq = self.load_json('user2item_seq')
    self.user2ts_seq = self.load_json('user2ts_seq')
    # puiqa tuples: (phase, user, pos, ts, ans) — see the unpacking loops below.
    self.vali_puiqa = self.load_json('vali_puiqa')
    self.test_puiqa = self.load_json('test_puiqa')
    # uids_list, vids_list
    self.raw_id_list = self.load_json('ids_list')
    try:
        self.item_feat = np.load(f'{self.home}/item_feat.npy')
    except Exception as e:
        # Best-effort: fall back to zero features when the file is absent.
        print(e)
        self.item_feat = np.zeros([args.nb_items, 2, 128])
    self.item_deg_per_phase = self.load_json('item_deg')
    self.item_deg_per_phase = np.array(self.item_deg_per_phase, dtype=int)
    self.item_deg_self_per_phase = self.load_json('item_deg_self')
    self.item_deg_self_per_phase = np.array(self.item_deg_self_per_phase,
                                            dtype=int)
    # (phase, item) mask: True where the item's degree is at or below the
    # phase's median threshold ("rare half" of the items).
    self.item_is_half = self.item_deg_per_phase <= np.array(
        self.mid_deg_per_phase)[:, None]
    # Items never seen in a phase (self-degree 0) are masked out of scoring.
    self.score_mask_per_phase = np.ones([NUM_PHASE, args.nb_items], dtype=int)
    self.score_mask_per_phase[self.item_deg_self_per_phase == 0] = 0
    self.vali_user2ppa = {}
    for phase, user, pos, ts, ans in self.vali_puiqa:
        self.vali_user2ppa[user] = (phase, pos, ans)
    self.test_user2ppa = {}
    for phase, user, pos, ts, ans in self.test_puiqa:
        self.test_user2ppa[user] = (phase, pos, ans)
    self.train_users = list(range(args.nb_users))
    self.vali_users = sorted(self.vali_user2ppa.keys())
    self.test_users = sorted(self.test_user2ppa.keys())
    self.item_pop = [0] * args.nb_items
    self.user_pop = [0] * args.nb_users
    # for_phase: single target phase when mode_pred_phase names exactly one,
    # else -1 (all phases).
    self.for_phase = -1
    if len(args.mode_pred_phase) == 1:
        self.for_phase = int(args.mode_pred_phase)
    args.update(for_phase=self.for_phase)
    nb_train = 0
    for user, item_list in enumerate(self.user2item_seq):
        for i, item in enumerate(item_list):
            if item == 1:
                # item id 1 marks this user's held-out validation slot;
                # optionally substitute the true answer back in when that
                # phase is not being predicted.
                vali_phase, vali_pos, vali_ans = self.vali_user2ppa[user]
                if args.use_unused_vali and args.mode_pred_phase != 'all' and str(
                        vali_phase) not in args.mode_pred_phase:
                    item = vali_ans
            if item >= 3:  # ids 0-2 appear to be reserved/special tokens
                self.item_pop[item] += 1
                self.user_pop[user] += 1
                nb_train += 1
                # Optional oversampling: add alpha_resample extra counts for
                # examples matching the chosen resampling criterion.
                if args.mode_resample != 'none':
                    if args.mode_resample == 'user':
                        if user in self.vali_user2ppa or user in self.test_user2ppa:
                            nb_train += args.alpha_resample
                    elif args.mode_resample == 'rare':
                        # assert len(args.mode_pred_phase) == 1
                        # assert phase == int(args.mode_pred_phase)
                        phase = int(args.mode_pred_phase)
                        item_deg = self.item_deg_per_phase[phase][item]
                        mid_deg = self.mid_deg_per_phase[phase]
                        if item_deg <= mid_deg:
                            nb_train += args.alpha_resample
                    elif args.mode_resample == 'phase':
                        # NOTE(review): unit=54.5 looks like a per-phase time
                        # span in the dataset's timestamp units — confirm.
                        phase = int(args.mode_pred_phase)
                        q_ts = self.user2ts_seq[user][i]
                        unit = 54.5
                        min_ts = phase * unit
                        max_ts = min_ts + (unit * 4) + 1
                        if min_ts <= q_ts < max_ts:
                            nb_train += args.alpha_resample
                    elif args.mode_resample == 'day':
                        phase = int(args.mode_pred_phase)
                        q_ts = self.user2ts_seq[user][i]
                        unit = 54.5
                        min_ts = 10.0 + (phase + 3) * unit
                        max_ts = min_ts + unit + 1
                        if min_ts <= q_ts < max_ts:
                            nb_train += args.alpha_resample
    # Give the reserved ids a nonzero count so inverse-popularity is finite.
    for item in range(3):
        self.item_pop[item] += 1
    self.item_pop = np.array(self.item_pop, dtype=int)
    self.item_pop[self.item_pop == 0] = 1
    assert np.min(self.item_pop) >= 1
    self.item_pop_log = np.log(self.item_pop + np.e)
    self.item_pop_inv = 1.0 / self.item_pop
    self.item_pop_log_inv = 1.0 / self.item_pop_log
    self.user_pop = np.array(self.user_pop, dtype=int)
    args.update(nb_train=nb_train)
    if args.gnn:
        pass
    if args.get('run_data', False) and args.run_test:
        pass
def fuse(ans_list, weight_list, name):
    """Blend several per-user logits dicts into one weighted top-k ranking.

    `ans_list[i]` maps user -> {item: score}; `weight_list[i]` is that
    answer's ensemble weight. Scores are normalized by each answer's global
    score range (computed once and cached in module-level state), optionally
    boosted by popularity/rarity factors, and the top args.nb_topk items per
    user are returned as {user: [items]}.
    """
    users = sorted(ans_list[0].keys())
    user2tops = {}
    global max_score_per_ans, score_diff_per_ans
    # Compute each answer's min/max score exactly once; later calls reuse it.
    if max_score_per_ans is None:
        max_score_per_ans = []
        score_diff_per_ans = []
        for i in range(len(ans_list)):
            max_score = None
            min_score = None
            for user in tqdm(users, desc='count min max', ncols=90,
                             ascii=True):
                for item, score in ans_list[i][user].items():
                    if max_score is None:
                        max_score = min_score = score
                    else:
                        if score > max_score:
                            max_score = score
                        if score < min_score:
                            min_score = score
            print(f'ans {i}, min ~ max: {min_score} ~ {max_score}')
            max_score_per_ans.append(max_score)
            score_diff_per_ans.append(max_score - min_score)
    user2ppa = data.dp.vali_user2ppa if name == 'vali' else data.dp.test_user2ppa
    is_first_fuse = args.get('is_first_fuse', True)
    for user in tqdm(users, desc='fuse', ncols=90, ascii=True,
                     leave=is_first_fuse):
        phase, pos, ans = user2ppa[user]
        final_ans = defaultdict(float)
        for ans_i, (one_ans, weight) in enumerate(zip(ans_list, weight_list)):
            if weight < 1e-6:  # skip effectively-zero ensemble members
                continue
            for item, score in one_ans[user].items():
                normed = score / score_diff_per_ans[ans_i]
                if args.mode_pop == 'log':
                    normed *= (1.0 + data.dp.item_pop_log_inv[item])
                if args.mode_rare == 'linear':
                    # extra boost for items in the rare half of this phase
                    if data.dp.item_is_half[phase, item] and alpha_rare[ans_i] > 0:
                        normed *= (alpha_rare[ans_i] + data.dp.item_pop_inv[item])
                final_ans[item] += weight * normed
        # rank by score descending, breaking ties by smaller item id
        ranked = sorted(final_ans.keys(),
                        key=lambda it: (-final_ans[it], it))
        user2tops[user] = ranked[:args.nb_topk]
    args.update(is_first_fuse=False)
    args.update(show_detail=False)
    return user2tops
def main():
    """Standalone entry point: parse the shared CLI args, force the 'v3'
    dataset, and invoke run()."""
    # BUG FIX: the original called `args.update(mmmmain.parse_args())` —
    # `mmmmain` is an undefined name (typo) and the parsed dict was not
    # keyword-unpacked. Mirror the working sibling entry point, which does
    # `import main as main_md; args.update(**main_md.parse_args())`.
    import main as main_md
    args.update(**main_md.parse_args())
    args.update(ds='v3')
    run()
def main(**main_args): begin_time = time.time() # init args args.update(**main_args) command_line_args = parse_args() args.setdefault(**command_line_args) args.update(run_on_yard=True) seed = args.seed os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) np.random.seed(seed) # get Model, set model default args Model = vars(models)[args.model] args.setdefault(**Model.args.vars()) if args.run_test: args.update(epochs=2, nb_vali_step=2, max_data_line=100) print(args) # get data random.seed(seed) np.random.seed(args.seed) data = dataset.Data() min_epochs = args.nb_train / (args.batch_size * args.nb_vali_step) if min_epochs < 1.0: args.update(nb_vali_step=int(np.ceil(args.nb_train / args.batch_size))) print(args) min_epochs = args.nb_train / (args.batch_size * args.nb_vali_step) args.update(min_epochs=int(np.ceil(min_epochs))) # args.setdefault()) # run_name: time-x-Modes-ds time_str = utils.get_time_str() model_name = Model.__name__ run_name = f'{time_str}-{model_name}-{args.ds}' if args.msg: run_name = f'{run_name}-{args.msg}' if args.run_test: run_name = f'{run_name}-test' args.update(run_name=run_name) T = Train.Train(Model, data) log_fn = f'{utils.log_dir}/{run_name}.log' begin_time_str = utils.get_time_str() print(begin_time_str, log_fn, '----- start!, pid:', os.getpid()) args.update(pid=os.getpid()) log = utils.Logger(fn=log_fn, verbose=args.verbose) args.update(log=log) args.log.log(f'argv: {" ".join(sys.argv)}') args.log.log(f'log_fn: {log_fn}') args.log.log(f'args: {args.prt_json()}') args.log.log(f'Model: {model_name}') args.log.log(f'begin time: {begin_time_str}') try: T.train() except KeyboardInterrupt as e: if not T.has_train: raise e test_str = T.final_test() args.log.log(f'\ntest: {test_str}\n', red=True) args.log.log(log_fn) dt = time.time() - begin_time end_time_str = utils.get_time_str() args.log.log(f'end time: {end_time_str}, dt: {dt / 3600:.2f}h') print(end_time_str, log_fn, f'##### over, time: {dt / 3600:.2f}h')
def main(**main_args):
    """Train entry point (restore/dump-capable variant).

    Resets args from scratch (CLI first, then `main_args` overrides), builds
    the model and data, optionally restores a previous model, trains, then
    either validates or dumps features. Returns (test_str, time_str) on the
    normal path; returns None early on the dump paths.
    """
    begin_time = time.time()
    # init args
    args.clear()
    command_line_args = parse_args()
    args.update(**command_line_args)
    args.update(**main_args)
    if args.ds == 'test':
        args.update(run_test=True)
    seed = args.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    # get Model, set model default args
    Model = vars(models)[args.model]
    args.update(**Model.args)
    # re-apply caller overrides so they beat the model's defaults
    args.update(**main_args)
    if args.run_test:
        # smoke-test mode: tiny run
        args.update(epochs=2, nb_vali_step=2, batch_size=4)
    # get data
    data = dataset.Data()
    # ensure at least one full epoch fits in batch_size * nb_vali_step steps
    min_epochs = args.nb_train / (args.batch_size * args.nb_vali_step)
    if min_epochs < 1.0:
        args.update(
            nb_vali_step=int(np.ceil(args.nb_train / args.batch_size)))
        min_epochs = args.nb_train / (args.batch_size * args.nb_vali_step)
    args.update(min_epochs=int(np.ceil(min_epochs)))
    # run_name: time-x-Model-ds
    model_name = Model.__name__
    time_str = utils.get_time_str()
    run_name = f'{time_str}-{model_name}-{args.ds}'
    if args.msg:
        run_name = f'{run_name}-{args.msg}'
    if args.run_test:
        run_name = f'{run_name}-test'
    if args.restore_model:
        run_name = f'{run_name}-restored'
    args.update(run_name=run_name)
    log_fn = f'{utils.log_dir}/{run_name}.log'
    begin_time_str = utils.get_time_str()
    args.update(pid=os.getpid())
    log = utils.Logger(fn=log_fn, verbose=args.verbose)
    args.update(log=log)
    args.log.log(f'argv: {" ".join(sys.argv)}')
    args.log.log(f'log_fn: {log_fn}')
    args.log.log(f'main_args: {utils.Object(**main_args).json()}')
    args.log.log(f'args: {args.json()}')
    args.log.log(f'Model: {model_name}')
    args.log.log(f'begin time: {begin_time_str}')
    T = Train.Train(Model, data)
    if args.restore_model:
        T.model.restore_from_other(args.restore_model)
    # Train unless we restored a model and were not asked to keep training.
    if not args.restore_model or args.restore_train:
        try:
            T.train()
        except KeyboardInterrupt as e:
            # Ctrl-C just stops training early; evaluation still runs.
            pass
        # reload the best (slot 0) checkpoint before evaluating
        T.model.restore(0)
    if args.skip_vali:
        test_str = 'none'
    else:
        test_str = T.final_test()
    args.log.log(f'vali: {test_str}', red=True)
    if args.dump_all:
        T.dump_features_all_item('vali')
        T.dump_features_all_item('test')
        return
    if args.dump:
        T.dump_features('vali')
        T.dump_features('test')
        return
    args.log.log(run_name, red=True)
    if args.restore_model:
        args.log.log(f'restored from {args.restore_model}', red=True)
    dt = time.time() - begin_time
    end_time_str = utils.get_time_str()
    args.log.log(f'end time: {end_time_str}, dt: {dt / 3600:.2f}h')
    print(end_time_str, log_fn, f'##### over, time: {dt / 3600:.2f}h')
    return test_str, time_str