import os
import random as rd

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn import datasets
from sklearn.utils.extmath import randomized_svd

# utils, data, and tfidf are repo-local helpers (timer, prep_standardize,
# load_eval_data, tf-idf weighting) assumed importable in these modules.
import data
import utils


def load_data(data_path):
    # n_users and n_items are module-level constants (total user/item counts)
    # assumed to be defined elsewhere in this file.
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'cold')
    u_file = os.path.join(
        data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.U.txt')
    v_file = os.path.join(
        data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.V.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_cold_file = os.path.join(split_folder, 'test.csv')
    test_cold_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

    dat = {}

    # load pretrained preference factors (WRMF, rank 200)
    timer.tic()
    u_pref = np.loadtxt(u_file).reshape(n_users, 200)
    v_pref = np.loadtxt(v_file).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref
    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process: standardize the factors
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data: tf-idf weighting, then truncated SVD down to 300 dims
    timer.tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    item_content = tfidf(item_content)
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    _, item_content = utils.prep_standardize(item_content)
    if sp.issparse(item_content):
        dat['item_content'] = item_content.tolil(copy=False)
    else:
        dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split
    timer.tic()
    train = pd.read_csv(
        train_file, delimiter=",", header=None,
        dtype=np.int32).values.ravel().view(
            dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_cold'] = data.load_eval_data(test_cold_file,
                                           test_cold_iid_file,
                                           name='eval_cold',
                                           cold=True,
                                           train_data=train,
                                           citeu=True)
    return dat
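# Usage sketch (hypothetical, not part of the repo): how the returned dict is
# typically consumed. The DATA_PATH value is an assumption.
if __name__ == '__main__':
    DATA_PATH = '../data/CiteU'  # hypothetical dataset location
    dat = load_data(DATA_PATH)
    # standardized pretrained factors and the SVD-reduced item content
    print(dat['u_pref_scaled'].shape, dat['v_pref_scaled'].shape)
    print(dat['item_content'].shape)
    print(len(dat['user_indices']), 'training users')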
def __init__(self, path, batch_size, split, max_L):
    self.batch_size = batch_size
    self.max_L = max_L
    self.split = split
    split_folder = os.path.join(path, 'cold')
    item_content_file = os.path.join(path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_cold_file = os.path.join(split_folder, 'test.csv')
    test_cold_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

    # load split
    train = pd.read_csv(train_file, delimiter=",", header=None,
                        dtype=np.int32).values

    # Move up to `split` sampled interactions per test item into the training set.
    if self.split:
        test_data = pd.read_csv('../data/CiteU/cold/test.csv',
                                sep=',', header=None, dtype=np.int32).values
        before = test_data[0][1]
        item_set = []
        for line in test_data:
            if line[1] != before:
                samples = rd.sample(item_set, min(len(item_set), split))
                train = np.vstack((train, samples))
                item_set = []
            item_set.append(line)
            before = line[1]
        # flush the final item's group (the loop only flushes on a boundary)
        if item_set:
            samples = rd.sample(item_set, min(len(item_set), split))
            train = np.vstack((train, samples))

    train = train.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    train_user_ids = np.unique(train['uid'])
    train_item_ids = np.unique(train['iid'])
    self.train_item_ids = train_item_ids
    train_user_ids_map = {
        user_id: i for i, user_id in enumerate(train_user_ids)
    }
    train_item_ids_map = {
        item_id: i for i, item_id in enumerate(train_item_ids)
    }
    _train_i_for_inf = [train_user_ids_map[_t[0]] for _t in train]
    _train_j_for_inf = [train_item_ids_map[_t[1]] for _t in train]
    self.R_train = sp.coo_matrix(
        (np.ones(len(_train_i_for_inf)),
         (_train_i_for_inf, _train_j_for_inf)),
        shape=[len(train_user_ids), len(train_item_ids)]).tolil(copy=False)

    # For the transformer: for each item, sample up to max_L interacting
    # users into R_id, with R_mask marking the valid (non-padded) slots.
    R_id = np.zeros((len(train_item_ids), self.max_L), dtype="int32")
    R_mask = np.zeros((len(train_item_ids), self.max_L), dtype="int32")
    # alternative padding with a sentinel user id:
    # R_id = np.full((len(train_item_ids), self.max_L), 5551, dtype="int32")
    # R_mask = np.full((len(train_item_ids), self.max_L), 5551, dtype="int32")
    R_line = []
    pre_i = 0
    for i in range(len(_train_i_for_inf)):
        if _train_j_for_inf[i] == pre_i:
            R_line.append(_train_i_for_inf[i])
        else:
            R_sample = rd.sample(R_line, min(len(R_line), self.max_L))
            for index in range(len(R_sample)):
                R_id[_train_j_for_inf[i - 1]][index] = R_sample[index]
                R_mask[_train_j_for_inf[i - 1]][index] = 1
            # start the next item's group with the current user (the original
            # reset to [] here, silently dropping each item's first interaction)
            R_line = [_train_i_for_inf[i]]
            pre_i = _train_j_for_inf[i]
    # flush the final item's group, which the loop above never reaches
    if R_line:
        R_sample = rd.sample(R_line, min(len(R_line), self.max_L))
        for index in range(len(R_sample)):
            R_id[_train_j_for_inf[-1]][index] = R_sample[index]
            R_mask[_train_j_for_inf[-1]][index] = 1

    self.train_items = self.record_list(_train_j_for_inf, _train_i_for_inf)
    self.train_u = _train_i_for_inf
    self.train_i = _train_j_for_inf

    with open(test_cold_iid_file) as f:
        test_item_ids = [int(line) for line in f]
    self.test_item_ids = test_item_ids
    test_data = pd.read_csv(test_cold_file,
                            delimiter=",",
                            header=None,
                            dtype=np.int32).values.ravel()
    test_data = test_data.view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    test_item_ids_map = {iid: i for i, iid in enumerate(test_item_ids)}
    _test_ij_for_inf = [(t[0], t[1]) for t in test_data
                        if t[1] in test_item_ids_map]
    test_user_ids = np.unique(test_data['uid'])  # unused; kept from original
    # reuse the training user id map so user rows line up across R_train/R_test
    test_user_ids_map = train_user_ids_map
    _test_i_for_inf = [test_user_ids_map[_t[0]] for _t in _test_ij_for_inf]
    _test_j_for_inf = [test_item_ids_map[_t[1]] for _t in _test_ij_for_inf]
    self.R_test = sp.coo_matrix(
        (np.ones(len(_test_i_for_inf)),
         (_test_i_for_inf, _test_j_for_inf)),
        shape=[len(train_user_ids), len(test_item_ids)]).tolil(copy=False)
    self.test_items = self.record_list(_test_j_for_inf, _test_i_for_inf)
    self.test_u = _test_i_for_inf
    self.test_users = self.record_list(_test_i_for_inf, _test_j_for_inf)

    # item content: tf-idf weighting, then truncated SVD down to 300 dims
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    item_content = tfidf(item_content)
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    _, item_content = utils.prep_standardize(item_content)
    if sp.issparse(item_content):
        item_feature = item_content.tolil(copy=False)
    else:
        item_feature = item_content
    self.item_content = item_feature

    # Alternatively, use pretrained item factors instead of content features:
    # pretrain_v = pd.read_csv(
    #     "../data/CiteU/trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.V.txt",
    #     sep=' ', header=None).values
    # self.S_tr = pretrain_v[train_item_ids, :]
    # self.S_te = pretrain_v[test_item_ids, :]
    self.S_tr = item_feature[train_item_ids, :]
    self.S_te = item_feature[test_item_ids, :]
    if sp.issparse(self.S_tr):
        self.S_tr = self.S_tr.todense()
        self.S_te = self.S_te.todense()
    self.X_tr = self.R_train.todense().T  # item-by-user
    self.R_id = R_id
    self.R_mask = R_mask
    self.U_tr = self.R_train.todense()    # user-by-item
    self.X_te = self.R_test.todense().T
    self.U_te = self.R_test.todense()
    # note: with X_* transposed to item-by-user, these tallies count
    # train+test items and users respectively (kept as in the original)
    self.n_users = self.X_tr.shape[0] + self.X_te.shape[0]
    self.n_items = self.U_tr.shape[0]
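# Minimal consumption sketch (an assumption, not the repo's model code): each
# row of R_id holds up to max_L user indices for one item, and R_mask marks
# the real slots. A typical use is to gather user embeddings and mask out the
# padding before pooling or attention; user_emb here is a stand-in name.
import numpy as np

def pooled_user_context(user_emb, R_id, R_mask):
    # user_emb: (n_users, d); R_id, R_mask: (n_items, max_L)
    gathered = user_emb[R_id]                        # (n_items, max_L, d)
    mask = R_mask[..., None].astype(user_emb.dtype)  # zero out padded slots
    denom = np.maximum(mask.sum(axis=1), 1.0)        # avoid divide-by-zero
    return (gathered * mask).sum(axis=1) / denom     # mean over real users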
def load_data(data_path):
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')
    u_file = os.path.join(data_path, 'trained/warm/U.csv.bin')
    v_file = os.path.join(data_path, 'trained/warm/V.csv.bin')
    user_content_file = os.path.join(data_path, 'user_features_0based.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test_warm.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_warm_item_ids.csv')
    test_cold_user_file = os.path.join(split_folder, 'test_cold_user.csv')
    test_cold_user_iid_file = os.path.join(split_folder,
                                           'test_cold_user_item_ids.csv')
    test_cold_item_file = os.path.join(split_folder, 'test_cold_item.csv')
    test_cold_item_iid_file = os.path.join(split_folder,
                                           'test_cold_item_item_ids.csv')

    dat = {}

    # load pretrained preference factors (raw float32 binaries, rank 200)
    timer.tic()
    u_pref = np.fromfile(u_file, dtype=np.float32).reshape(n_users, 200)
    v_pref = np.fromfile(v_file, dtype=np.float32).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref
    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process: standardize the factors
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content, _ = datasets.load_svmlight_file(user_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' %
              (str(user_content.shape))).tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split (header=None: modern pandas rejects the old header=-1 form)
    timer.tic()
    train = pd.read_csv(
        train_file, delimiter=",", header=None,
        dtype=np.int32).values.ravel().view(
            dtype=[('uid', np.int32), ('iid', np.int32),
                   ('inter', np.int32), ('date', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file,
                                           test_warm_iid_file,
                                           name='eval_warm',
                                           cold=False,
                                           train_data=train)
    dat['eval_cold_user'] = data.load_eval_data(test_cold_user_file,
                                                test_cold_user_iid_file,
                                                name='eval_cold_user',
                                                cold=True,
                                                train_data=train)
    dat['eval_cold_item'] = data.load_eval_data(test_cold_item_file,
                                                test_cold_item_iid_file,
                                                name='eval_cold_item',
                                                cold=True,
                                                train_data=train)
    return dat
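# Sanity-check sketch (assumes the .bin files are raw float32 dumps, as the
# np.fromfile call above implies; check_factor_file is a hypothetical helper).
# Verifying the byte count against n_rows x rank before reshaping catches a
# wrong n_users/n_items early, instead of failing inside reshape.
import os
import numpy as np

def check_factor_file(path, n_rows, rank=200):
    n_bytes = os.path.getsize(path)
    expected = n_rows * rank * np.dtype(np.float32).itemsize
    assert n_bytes == expected, \
        'unexpected size for %s: %d bytes, expected %d' % (path, n_bytes, expected)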
def __init__(self, path, batch_size):
    self.batch_size = batch_size
    split_folder = os.path.join(path, 'cold')
    item_content_file = os.path.join(path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_cold_file = os.path.join(split_folder, 'test.csv')
    test_cold_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

    # load split
    train = pd.read_csv(train_file,
                        delimiter=",",
                        header=None,
                        dtype=np.int32).values.ravel().view(
                            dtype=[('uid', np.int32), ('iid', np.int32),
                                   ('inter', np.int32)])
    train_user_ids = np.unique(train['uid'])
    train_item_ids = np.unique(train['iid'])
    self.train_item_ids = train_item_ids
    train_user_ids_map = {
        user_id: i for i, user_id in enumerate(train_user_ids)
    }
    train_item_ids_map = {
        item_id: i for i, item_id in enumerate(train_item_ids)
    }
    _train_i_for_inf = [train_user_ids_map[_t[0]] for _t in train]
    _train_j_for_inf = [train_item_ids_map[_t[1]] for _t in train]
    self.R_train = sp.coo_matrix(
        (np.ones(len(_train_i_for_inf)),
         (_train_i_for_inf, _train_j_for_inf)),
        shape=[len(train_user_ids), len(train_item_ids)]).tolil(copy=False)
    self.train_items = self.record_list(_train_j_for_inf, _train_i_for_inf)
    self.train_u = _train_i_for_inf
    self.train_i = _train_j_for_inf

    with open(test_cold_iid_file) as f:
        test_item_ids = [int(line) for line in f]
    self.test_item_ids = test_item_ids
    test_data = pd.read_csv(test_cold_file,
                            delimiter=",",
                            header=None,
                            dtype=np.int32).values.ravel()
    test_data = test_data.view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    test_item_ids_map = {iid: i for i, iid in enumerate(test_item_ids)}
    _test_ij_for_inf = [(t[0], t[1]) for t in test_data
                        if t[1] in test_item_ids_map]
    test_user_ids = np.unique(test_data['uid'])  # unused; kept from original
    # reuse the training user id map so user rows line up across R_train/R_test
    test_user_ids_map = train_user_ids_map
    _test_i_for_inf = [test_user_ids_map[_t[0]] for _t in _test_ij_for_inf]
    _test_j_for_inf = [test_item_ids_map[_t[1]] for _t in _test_ij_for_inf]
    self.R_test = sp.coo_matrix(
        (np.ones(len(_test_i_for_inf)),
         (_test_i_for_inf, _test_j_for_inf)),
        shape=[len(train_user_ids), len(test_item_ids)]).tolil(copy=False)
    self.test_items = self.record_list(_test_j_for_inf, _test_i_for_inf)
    self.test_u = _test_i_for_inf
    self.test_users = self.record_list(_test_i_for_inf, _test_j_for_inf)

    # item content: tf-idf weighting, then truncated SVD down to 300 dims
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    item_content = tfidf(item_content)
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    _, item_content = utils.prep_standardize(item_content)
    if sp.issparse(item_content):
        item_feature = item_content.tolil(copy=False)
    else:
        item_feature = item_content
    self.item_content = item_feature

    self.S_tr = item_feature[train_item_ids, :]
    self.S_te = item_feature[test_item_ids, :]
    if sp.issparse(self.S_tr):
        self.S_tr = self.S_tr.todense()
        self.S_te = self.S_te.todense()
    self.X_tr = self.R_train.todense().T  # item-by-user
    self.U_tr = self.R_train.todense()    # user-by-item
    self.X_te = self.R_test.todense().T
    self.U_te = self.R_test.todense()
    # note: with X_* transposed to item-by-user, these tallies count
    # train+test items and users respectively (kept as in the original)
    self.n_users = self.X_tr.shape[0] + self.X_te.shape[0]
    self.n_items = self.U_tr.shape[0]
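# Hypothetical instantiation sketch: `ColdStartLoader` is an assumed name for
# the class whose __init__ appears above (its declaration is not shown), and
# the path and batch size are placeholders. The asserts spell out the shape
# invariants the loader establishes.
loader = ColdStartLoader('../data/CiteU', batch_size=1024)
assert loader.S_tr.shape[0] == loader.X_tr.shape[0]  # one content row per train item
assert loader.S_te.shape[0] == loader.X_te.shape[0]  # one content row per test item
assert loader.X_tr.shape[1] == loader.U_tr.shape[0]  # user axes line up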