import os
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from scipy.sparse import coo_matrix
from sklearn import preprocessing
from torch.utils import data

# SMALL (debug subsampling) and IMPLICT (implicit-feedback binarization) are
# module-level flags assumed to be configured elsewhere in the repository.


def getDataLoader(data_path, batch_size=2048):
    # load the full ratings file: one (user, item, rating, timestamp) row per line
    data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
    data_df = pd.read_table(data_path, names=data_fields)
    if SMALL:
        # debug mode: work on a 10% sample of the data
        data_df = data_df.sample(n=int(len(data_df) * 0.1), replace=False)
    if IMPLICT:
        # binarize ratings for implicit feedback: rating >= 4 counts as positive
        data_df.rating = (data_df.rating >= 4).astype(np.float32)

    # random 80/20 train/test split
    df_train = data_df.sample(n=int(len(data_df) * 0.8), replace=False)
    df_test = data_df.drop(df_train.index, axis=0)
    if IMPLICT:
        df_train = drop_df(df_train)
        df_test = drop_df(df_test)

    # ids index embedding tables directly, so the table size is max id + 1
    n_users = max(data_df['user_id'].values) + 1
    n_items = max(data_df['item_id'].values) + 1
    print("Initialize end. The user number is: %d, item number is: %d" % (n_users, n_items))

    train_loader = data.DataLoader(
        Interactions(df_train), batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(
        Interactions(df_test), batch_size=batch_size, shuffle=False)
    loaders = {'train': train_loader, 'valid': test_loader}

    return (n_users, n_items), loaders
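# The loaders above wrap each split in an `Interactions` Dataset defined
# elsewhere in the repository, and the implicit branch relies on a `drop_df`
# helper. Minimal sketches of both, assuming `Interactions` yields
# ((user, item), rating) per row with an optional `index_from_one` flag that
# shifts 1-based ids to 0-based embedding indices, and that `drop_df` keeps
# positive interactions only; neither is the repository's actual code.
def drop_df(df):
    # keep only positive (rating > 0) rows for implicit feedback
    return df[df.rating > 0].reset_index(drop=True)


class Interactions(data.Dataset):
    def __init__(self, df, index_from_one=False):
        self.users = df['user_id'].values.astype(np.int64)
        self.items = df['item_id'].values.astype(np.int64)
        self.ratings = df['rating'].values.astype(np.float32)
        if index_from_one:
            # shift 1-based ids to 0-based indices
            self.users = self.users - 1
            self.items = self.items - 1

    def __getitem__(self, index):
        return (self.users[index], self.items[index]), self.ratings[index]

    def __len__(self):
        return len(self.ratings)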
def getDataLoader(data_path, batch_size=2048):
    # load the full ratings file
    data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
    data_df = pd.read_table(data_path, names=data_fields)
    if SMALL:
        # debug mode: work on a 10% sample of the data
        data_df = data_df.sample(n=int(len(data_df) * 0.1), replace=False)
    if IMPLICT:
        # binarize ratings for implicit feedback: rating >= 5 counts as positive
        data_df.rating = (data_df.rating >= 5).astype(np.float32)

    # remap raw user/item ids to consecutive 1-based indices
    for col_name in ['user_id', 'item_id']:
        data_df[col_name] = data_df[col_name].map(
            dict(zip(data_df[col_name].unique(), range(1, data_df[col_name].nunique() + 1))))

    # random 90/10 train/test split
    df_train = data_df.sample(n=int(len(data_df) * 0.9), replace=False)
    df_test = data_df.drop(df_train.index, axis=0)
    if IMPLICT:
        df_train = drop_df(df_train)
        df_test = drop_df(df_test)

    # ids are consecutive and 1-based, so the max id equals the count
    n_users = max(data_df['user_id'].values)
    n_items = max(data_df['item_id'].values)
    print("Initialize end. The user number is: %d, item number is: %d" % (n_users, n_items))

    train_loader = torch.utils.data.DataLoader(
        Interactions(df_train, index_from_one=True), batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        Interactions(df_test, index_from_one=True), batch_size=batch_size, shuffle=False)
    loaders = {'train': train_loader, 'valid': test_loader}

    # build the user-item matrix and the commuting matrices of the meta-paths
    # UIU (user-item-user) and IUI (item-user-item)
    vals = np.ones(data_df.shape[0])
    row = data_df.user_id - 1
    col = data_df.item_id - 1
    UI = coo_matrix((vals, (row, col)), shape=(n_users, n_items))
    UIU = UI.dot(UI.transpose())
    IUI = UI.transpose().dot(UI)

    hinSim = {}
    hinSimI = {}
    calSim('UIU', UIU, hinSim, hinSimI)
    calSim('IUI', IUI, hinSim, hinSimI)
    calSim('UI', UI, hinSim, hinSimI)
    metaPath = {'II': ['IUI'], 'UU': ['UIU'], 'UI': 'UI'}  # could also be IUI

    return (n_users, n_items, hinSim, hinSimI, metaPath), loaders
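# `calSim` is defined elsewhere in the repository; from its call sites it is
# assumed to reduce a similarity matrix such as UIU to a per-row top-K
# neighbor structure, filling `hinSimI[path]` with neighbor indices and
# `hinSim[path]` with the matching row-normalized similarities. A hypothetical
# sketch under those assumptions (K and the normalization are guesses, not
# the repository's actual implementation):
def calSim(path_name, sim_matrix, hinSim, hinSimI, topk=10):
    sim = sim_matrix.tocsr().astype(np.float32)
    values, indices = [], []
    for r in range(sim.shape[0]):
        row = sim.getrow(r).toarray().ravel()
        top = np.argsort(-row)[:topk]        # indices of the K largest entries
        norm = row[top].sum()
        values.append(row[top] / norm if norm > 0 else row[top])
        indices.append(top)
    hinSim[path_name] = torch.tensor(np.stack(values))
    hinSimI[path_name] = torch.tensor(np.stack(indices))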
def getDataLoader(data_path, trainset_rate=0.8, batch_size=4096):
    # load the Epinions ratings and trust files, preprocessing the raw
    # dataset first if the preprocessed files are missing
    if os.path.exists(data_path + 'data_df.txt') and os.path.exists(data_path + 'social_df.txt'):
        print("Epinions dataset has been preprocessed.")
    else:
        print("Epinions dataset has not been preprocessed.")
        epinionsPreprocessing(data_path)
    data_df = pd.read_table(data_path + 'data_df.txt', sep=' ')
    data_df['rating'] /= max(data_df['rating'])  # scale ratings into (0, 1]
    social_df = pd.read_table(data_path + 'social_df.txt', sep=' ')

    # encode raw user/item ids as consecutive 0-based indices
    le = preprocessing.LabelEncoder()
    le.fit(data_df['user_id'])
    data_df['user_id'] = le.transform(data_df['user_id'])
    social_df['user_id'] = le.transform(social_df['user_id'])
    social_df['user_id2'] = le.transform(social_df['user_id2'])
    le.fit(data_df['item_id'])
    data_df['item_id'] = le.transform(data_df['item_id'])

    # build each user's set of trusted users from the social relations
    social_user = defaultdict(set)
    for (user, user2, record) in social_df.itertuples(index=False):
        social_user[user].add(user2)

    df_train = data_df.sample(n=int(len(data_df) * trainset_rate), replace=False)
    df_test = data_df.drop(df_train.index, axis=0)

    # max id + 1 rather than len, because ids index embedding tables directly
    n_users = max(data_df['user_id'].values) + 1
    n_items = max(data_df['item_id'].values) + 1
    print("Initialize end. The user number is: %d, item number is: %d" % (n_users, n_items))

    train_loader = data.DataLoader(Interactions(df_train), batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(Interactions(df_test), batch_size=batch_size, shuffle=False)
    loaders = {'train': train_loader, 'valid': test_loader}

    return (n_users, n_items, social_user), loaders
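# A hypothetical usage sketch for the Epinions loader above; the data path and
# the loop body are illustrative assumptions, not part of the repository.
def demoEpinionsLoader():
    (n_users, n_items, social_user), loaders = getDataLoader('data/epinions/')
    for (users, items), ratings in loaders['train']:
        pass  # feed each batch to a social-regularized MF model here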
def loadData(self, data_path, batch_size=2048):
    # load the full ratings file and binarize: rating >= 5 counts as positive
    data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
    data_df = pd.read_table(data_path, names=data_fields)
    data_df.rating = (data_df.rating >= 5).astype(np.float32)

    # encode raw user/item ids as consecutive 0-based indices
    le = preprocessing.LabelEncoder()
    le.fit(data_df['user_id'])
    data_df['user_id'] = le.transform(data_df['user_id'])
    le.fit(data_df['item_id'])
    data_df['item_id'] = le.transform(data_df['item_id'])

    self.n_users = max(data_df['user_id'].values) + 1
    self.n_items = max(data_df['item_id'].values) + 1
    print("Initialize end. The user number is: %d, item number is: %d" % (self.n_users, self.n_items))

    # random 80/20 train/valid split
    df = {}
    df['train'] = data_df.sample(n=int(len(data_df) * 0.8), replace=False)
    df['valid'] = data_df.drop(df['train'].index, axis=0)

    # user -> {item: rating} lookup over the training interactions
    self.user_item = defaultdict(dict)
    for (user, item, record, timestamp) in df['train'].itertuples(index=False):
        self.user_item[user][item] = record

    self.loader = {}
    for phase in ['train', 'valid']:
        self.loader[phase] = data.DataLoader(
            Interactions(df[phase]), batch_size=batch_size, shuffle=(phase == 'train'))
    self.loader['valid_simple'] = data.DataLoader(
        Interactions(df['valid']), batch_size=batch_size, shuffle=False)
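# The `user_item` dict built in `loadData` is typically consulted at top-N
# evaluation time to skip items the user already interacted with in training.
# A hypothetical sibling method illustrating that use (the name and signature
# are assumptions, not part of the repository):
def getUnseenItems(self, user):
    seen = self.user_item.get(user, {})
    return [item for item in range(self.n_items) if item not in seen]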