def __init__(self,
             dataset: BasicDataset,
             student: PairWiseModel,
             teacher: PairWiseModel,
             dns_k: int,
             method: int = 3,
             beta=world.beta):
    """
    method 1 for convex combination
    method 2 for random indicator
    method 3 for simplified method 2
    """
    self.beta = beta
    self.W = torch.Tensor([world.p0])
    self.dataset = dataset
    self.student = student
    self.teacher = teacher
    # self.methods = {
    #     'combine'   : self.convex_combine,  # not yet
    #     'indicator' : self.random_indicator,
    #     'simple'    : self.max_min,
    #     'weight'    : self.weight_pair,
    # }
    # NOTE: the `method` argument is currently unused; sampling is
    # hard-wired to the convex-combination strategy.
    self.method = 'combine'
    self.Sample = self.convex_combine
    cprint(f"Using {self.method}")
    self.dns_k = dns_k
    self.soft = Softmax(dim=1)
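
# ----------------------------------------------------------------------------
# hedged sketch (not in the original source): what the `convex_combine`
# strategy might look like, judging from the docstring above -- blend teacher
# and student scores over dns_k negative candidates with weight W, then sample
# one negative per user from the softmax of the blend. The `negitems`
# candidate tensor and the reuse of userAndMatrix are assumptions, not the
# repo's confirmed implementation.
def _sketch_convex_combine(sampler, batch_users, negitems):
    student_scores = userAndMatrix(batch_users, negitems, sampler.student)
    teacher_scores = userAndMatrix(batch_users, negitems, sampler.teacher)
    blended = sampler.W * teacher_scores + (1 - sampler.W) * student_scores
    probs = sampler.soft(blended)                     # (batch_size, dns_k)
    picked = torch.multinomial(probs, num_samples=1)  # one index per row
    return torch.gather(negitems, 1, picked).squeeze(1)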
def __init_weight(self):
    self.num_users = self.dataset.n_users
    self.num_items = self.dataset.m_items
    self.latent_dim = self.config['latent_dim_rec']
    self.n_layers = self.config['lightGCN_n_layers']
    self.keep_prob = self.config['keep_prob']
    self.A_split = self.config['A_split']
    self.embedding_user = torch.nn.Embedding(num_embeddings=self.num_users,
                                             embedding_dim=self.latent_dim)
    self.embedding_item = torch.nn.Embedding(num_embeddings=self.num_items,
                                             embedding_dim=self.latent_dim)
    if self.config['pretrain'] == 0:
        # nn.init.xavier_uniform_(self.embedding_user.weight, gain=1)
        # nn.init.xavier_uniform_(self.embedding_item.weight, gain=1)
        # print('use xavier initializer')
        # random normal init seems to be a better choice since LightGCN
        # doesn't use any non-linear activation function
        nn.init.normal_(self.embedding_user.weight, std=0.1)
        nn.init.normal_(self.embedding_item.weight, std=0.1)
        world.cprint('use NORMAL distribution initializer')
    else:
        self.embedding_user.weight.data.copy_(
            torch.from_numpy(self.config['user_emb']))
        self.embedding_item.weight.data.copy_(
            torch.from_numpy(self.config['item_emb']))
        print('use pretrained data')
    self.f = nn.Sigmoid()
    self.Graph = self.dataset.getSparseGraph()
    print(f"lgn is ready to go (dropout: {self.config['dropout']})")
def __init_weight(self):
    self.map_table = self.dataset.map_table
    self.num_users = self.dataset.n_users
    self.num_all_items = self.dataset.n_all_item
    self.latent_dim = self.config['latent_dim_rec']
    self.n_layers = self.config['GCN_n_layers']
    self.keep_prob = self.config['keep_prob']
    self.A_split = self.config['A_split']
    self.aspect = self.config['aspect']
    self.embedding_user = nn.ParameterList(
        nn.Parameter(torch.randn(self.num_users, self.latent_dim))
        for i in range(len(self.aspect)))
    self.embedding_item = nn.ParameterList()
    for i in range(len(self.aspect)):
        self.embedding_item.append(
            nn.Parameter(
                torch.randn(self.num_all_items[i], self.latent_dim)))
    if self.config['pretrain'] == 0:
        # nn.init.xavier_uniform_(self.embedding_dict[''].weight, gain=0.1)
        # print('use xavier initializer')
        for i in range(len(self.aspect)):
            nn.init.normal_(self.embedding_user[i], std=0.1)
            nn.init.normal_(self.embedding_item[i], std=0.1)
        world.cprint('use NORMAL distribution initializer')
    else:
        # not implemented
        print('use pre-trained data')
    self.f = nn.Sigmoid()
    self.Graph = self.dataset.getSparseGraph()  # adjacency matrix
    print(f"multi-lgn is ready to go (dropout: {self.config['dropout']})")
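
# ----------------------------------------------------------------------------
# hedged sketch (illustration only): how the per-aspect embedding tables line
# up. The aspect names and sizes below are hypothetical, not from the repo.
def _sketch_aspect_embeddings():
    import torch
    from torch import nn
    aspect = ['item', 'category', 'brand']   # hypothetical aspect list
    n_users, latent_dim = 100, 64
    n_all_items = [500, 40, 10]              # one item-space size per aspect
    embedding_user = nn.ParameterList(
        nn.Parameter(torch.randn(n_users, latent_dim)) for _ in aspect)
    embedding_item = nn.ParameterList(
        nn.Parameter(torch.randn(n, latent_dim)) for n in n_all_items)
    # one (n_users, latent_dim) user table and one
    # (n_all_items[i], latent_dim) item table per aspect
    assert len(embedding_user) == len(embedding_item) == len(aspect)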
def __init__(self, config=world.config, path="../data/TaoBao"): super(Amazon, self).__init__() # train of test cprint(f"loading [{path}]") self.mode_dict = {'train': 0, 'test': 1} self.mode = self.mode_dict['train'] self.aspect = config['aspect'] self.split = config['A_split'] self.folds = config['A_n_fold'] self.path = path self.map_table = pd.read_csv(join(path + '/map-table.csv'), sep=',', header=0) self.n_user = 13201 self.n_all_item = [14094, 2771, 15] train_data = pd.read_csv(join(path + '/train.txt'), sep=' ', header=None) test_data = pd.read_csv(join(path + '/test.txt'), sep=' ', header=None) self.trainData = train_data self.testData = test_data self.trainUser = np.array(train_data[:][0]) # train user list self.trainUniqueUsers = np.unique(self.trainUser) self.trainAllItem = [] # train item of multi aspect info for i in range(len(self.aspect)): self.trainAllItem.append(np.array(train_data[:][i + 1])) self.testUser = np.array(test_data[:][0]) # test user list self.testUniqueUsers = np.unique(self.testUser) self.testItem = np.array(test_data[:][1]) # test item list self.Graph = None print(f"{self.trainDataSize} interactions for training") print(f"{self.testDataSize} interactions for testing") print( f"{world.dataset} Sparsity: {(self.trainDataSize + self.testDataSize) / self.n_users / self.m_items}" ) # bipartite graph self.InteractNet = [] self.users_D = [] self.all_items_D = [] for i in range(len(self.aspect)): self.InteractNet.append( csr_matrix((np.ones(len(self.trainUser)), (self.trainUser, self.trainAllItem[i])))) self.users_D.append( np.array(self.InteractNet[i].sum(axis=1)).squeeze()) self.users_D[i][self.users_D[i] == 0.] = 1 self.all_items_D.append( np.array(self.InteractNet[i].sum(axis=0)).squeeze()) self.all_items_D[i][self.all_items_D[i] == 0.] = 1 # pre-calculate self._allPos = self.getUserPosItems(list(range(self.n_users))) self.__testDict = self.__build_test()
def __init__(self, path="../data/lastfm"): # train or test cprint("loading [last fm]") self.mode_dict = {'train': 0, "test": 1} self.mode = self.mode_dict['train'] # self.n_users = 1892 # self.m_items = 4489 trainData = pd.read_table(join(path, 'data1.txt'), header=None) # print(trainData.head()) testData = pd.read_table(join(path, 'test1.txt'), header=None) # print(testData.head()) trustNet = pd.read_table(join(path, 'trustnetwork.txt'), header=None).to_numpy() # print(trustNet[:5]) trustNet -= 1 trainData -= 1 testData -= 1 self.trustNet = trustNet self.trainData = trainData self.testData = testData self.trainUser = np.array(trainData[:][0]) self.trainUniqueUsers = np.unique(self.trainUser) self.trainItem = np.array(trainData[:][1]) # self.trainDataSize = len(self.trainUser) self.testUser = np.array(testData[:][0]) self.testUniqueUsers = np.unique(self.testUser) self.testItem = np.array(testData[:][1]) self.Graph = None print( f"LastFm Sparsity : {(len(self.trainUser) + len(self.testUser))/self.n_users/self.m_items}" ) # (users,users) self.socialNet = csr_matrix( (np.ones(len(trustNet)), (trustNet[:, 0], trustNet[:, 1])), shape=(self.n_users, self.n_users)) # (users,items), bipartite graph self.UserItemNet = csr_matrix( (np.ones(len(self.trainUser)), (self.trainUser, self.trainItem)), shape=(self.n_users, self.m_items)) # pre-calculate self._allPos = self.getUserPosItems(list(range(self.n_users))) self.allNeg = [] allItems = set(range(self.m_items)) for i in range(self.n_users): pos = set(self._allPos[i]) neg = allItems - pos self.allNeg.append(np.array(list(neg))) self.__testDict = self.__build_test()
def __init_weight(self):
    self.num_users = self.dataset.n_users
    self.num_items = self.dataset.m_items
    self.latent_dim = self.config['latent_dim_rec']
    self.mini_latent_dim = self.config['mini_latent_dim_rec']
    self.n_layers = self.config['lightGCN_n_layers']
    self.keep_prob = self.config['keep_prob']
    self.A_split = self.config['A_split']

    # Full model params
    self.embedding_user = torch.nn.Embedding(
        num_embeddings=self.num_users, embedding_dim=self.latent_dim)
    self.embedding_item = torch.nn.Embedding(
        num_embeddings=self.num_items, embedding_dim=self.latent_dim)
    self.bias_user = nn.Embedding(self.num_users, 1)
    self.bias_item = nn.Embedding(self.num_items, 1)
    self.bias_user.weight.data.fill_(0.)
    self.bias_item.weight.data.fill_(0.)

    # Mini model params
    self.mm_embedding_user = torch.nn.Embedding(
        num_embeddings=self.num_users, embedding_dim=self.mini_latent_dim)
    self.mm_embedding_item = torch.nn.Embedding(
        num_embeddings=self.num_items, embedding_dim=self.mini_latent_dim)
    # project mini embeddings to the full embedding space
    self.proj = nn.Upsample(size=self.latent_dim)

    nn.init.normal_(self.embedding_user.weight, std=0.1)
    nn.init.normal_(self.embedding_item.weight, std=0.1)
    nn.init.normal_(self.mm_embedding_user.weight, std=0.1)
    nn.init.normal_(self.mm_embedding_item.weight, std=0.1)
    world.cprint('use NORMAL distribution initializer')

    # Load state_dict for just the full model:
    self.embedding_user.weight.data.copy_(
        self.checkpt['embedding_user.weight'])
    self.embedding_item.weight.data.copy_(
        self.checkpt['embedding_item.weight'])
    self.bias_user.weight.data.copy_(self.checkpt['bias_user.weight'])
    self.bias_item.weight.data.copy_(self.checkpt['bias_item.weight'])

    self.f = nn.Sigmoid()
    self.Graph = self.dataset.getSparseGraph()
    print(f"lgn is ready to go (dropout: {self.config['dropout']})")
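
# ----------------------------------------------------------------------------
# hedged sketch (not in the original source): nn.Upsample(size=latent_dim)
# interpolates along the last spatial dim and expects an (N, C, W) input, so
# projecting a (batch, mini_dim) embedding up to (batch, latent_dim) needs an
# unsqueeze/squeeze around the call. A minimal demonstration with made-up sizes:
def _sketch_mini_projection():
    import torch
    from torch import nn
    latent_dim, mini_latent_dim = 64, 16
    proj = nn.Upsample(size=latent_dim)       # nearest-neighbour by default
    mini_emb = torch.randn(32, mini_latent_dim)        # (batch, mini_dim)
    full_emb = proj(mini_emb.unsqueeze(1)).squeeze(1)  # (batch, latent_dim)
    assert full_emb.shape == (32, latent_dim)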
def __init__(self, path="../data/skills-predictor"): self.path = path # train or test print(self.m_items) cprint("loading [last fm]") self.mode_dict = {'train': 0, "test": 1} self.mode = self.mode_dict['train'] trainData = pd.read_csv(join(path, 'all_data.csv')) # print(trainData.head()) testData = pd.read_csv(join(path, 'test.csv')) users, skills, rates = trainData["user"], trainData[ "skill"], trainData["rate"] self.UserItemNet = csr_matrix( (rates, (users, skills)), dtype="float32", shape=(self.n_users, self.m_items), ) # self.trustNet = trustNet self.trainData = trainData self.testData = testData self.trainUser = np.array(trainData['user']) self.trainUniqueUsers = np.unique(self.trainUser) self.trainItem = np.array(trainData['skill']) # self.trainDataSize = len(self.trainUser) self.testUser = np.array(testData['user']) self.testUniqueUsers = np.unique(self.testUser) self.testItem = np.array(testData['skill']) self.Graph = None print( f"LastFm Sparsity : {(len(self.trainUser) + len(self.testUser))/self.n_users/self.m_items}" ) # pre-calculate self._allPos = self.getUserPosItems(list(range(self.n_users))) self.allNeg = [] allItems = set(range(self.m_items)) for i in range(self.n_users): pos = set(self._allPos[i]) neg = allItems - pos self.allNeg.append(np.array(list(neg))) self.__testDict = self.__build_test()
def __init_weight(self):
    self.num_users = self.dataset.n_users
    self.num_items = self.dataset.m_items
    self.latent_dim = self.config['latent_dim_rec']
    self.n_layers = self.config['lightGCN_n_layers']
    self.keep_prob = self.config['keep_prob']
    self.A_split = self.config['A_split']
    self.embedding_user = torch.nn.Embedding(num_embeddings=self.num_users,
                                             embedding_dim=self.latent_dim)
    self.embedding_item = torch.nn.Embedding(num_embeddings=self.num_items,
                                             embedding_dim=self.latent_dim)
    if self.config['pretrain'] == 0:
        nn.init.normal_(self.embedding_user.weight, std=0.1)
        nn.init.normal_(self.embedding_item.weight, std=0.1)
        world.cprint('use NORMAL distribution initializer')
    else:
        self.embedding_user.weight.data.copy_(
            torch.from_numpy(self.config['user_emb']))
        self.embedding_item.weight.data.copy_(
            torch.from_numpy(self.config['item_emb']))
        print('use pretrained data')
    self.f = nn.Sigmoid()
    self.Graph = self.dataset.getSparseGraph()
    print(f"lgn is ready to go (dropout: {self.config['dropout']})")
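
# ----------------------------------------------------------------------------
# hedged sketch (illustration only): the config keys this __init_weight reads,
# gathered in one place. The values below are hypothetical defaults, not the
# repo's.
# config = {
#     'latent_dim_rec':    64,     # embedding size
#     'lightGCN_n_layers': 3,      # propagation layers
#     'keep_prob':         0.6,    # node-dropout keep probability
#     'A_split':           False,  # split the adjacency matrix into folds
#     'pretrain':          0,      # 0: random init, else load user_emb/item_emb
#     'dropout':           0,      # enable graph dropout
# }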
# ----------------------------------------------------------------------------
# init model
world.DISTILL = False
if len(world.comment) == 0:
    comment = f"{world.method}"
    if world.EMBEDDING:
        comment = comment + "-embed"
    world.comment = comment
import register
from register import dataset

if world.EMBEDDING:  # embedding distillation
    print("distill")
    tea_config = utils.getTeacherConfig(world.config)
    world.cprint('teacher')
    teacher_model = register.MODELS[world.model_name](tea_config,
                                                      dataset,
                                                      fix=True)
    teacher_model.eval()
    teacher_file = utils.getFileName(world.model_name,
                                     world.dataset,
                                     world.config['teacher_dim'],
                                     layers=world.config['teacher_layer'])
    teacher_weight_file = os.path.join(world.FILE_PATH, teacher_file)
    print('-------------------------')
    world.cprint("loaded teacher weights from")
    print(teacher_weight_file)
    print('-------------------------')
    utils.load(teacher_model, teacher_weight_file)
    teacher_model = teacher_model.to(world.DEVICE)
def __init__(self, config=world.config, path="../data/gowalla"):
    # train or test
    cprint(f'loading [{path}]')
    self.split = config['A_split']
    self.folds = config['A_n_fold']
    self.mode_dict = {'train': 0, "test": 1}
    self.mode = self.mode_dict['train']
    self.n_user = 0
    self.m_item = 0
    train_file = path + '/train.txt'
    test_file = path + '/test.txt'
    self.path = path
    trainUniqueUsers, trainItem, trainUser = [], [], []
    testUniqueUsers, testItem, testUser = [], [], []
    self.traindataSize = 0
    self.testDataSize = 0

    with open(train_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                l = l.strip('\n').split(' ')
                items = [int(i) for i in l[1:]]
                uid = int(l[0])
                trainUniqueUsers.append(uid)
                trainUser.extend([uid] * len(items))
                trainItem.extend(items)
                self.m_item = max(self.m_item, max(items))
                self.n_user = max(self.n_user, uid)
                self.traindataSize += len(items)
    self.trainUniqueUsers = np.array(trainUniqueUsers)
    self.trainUser = np.array(trainUser)
    self.trainItem = np.array(trainItem)

    with open(test_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                l = l.strip('\n').split(' ')
                items = [int(i) for i in l[1:]]
                uid = int(l[0])
                testUniqueUsers.append(uid)
                testUser.extend([uid] * len(items))
                testItem.extend(items)
                self.m_item = max(self.m_item, max(items))
                self.n_user = max(self.n_user, uid)
                self.testDataSize += len(items)
    self.m_item += 1
    self.n_user += 1
    self.testUniqueUsers = np.array(testUniqueUsers)
    self.testUser = np.array(testUser)
    self.testItem = np.array(testItem)

    self.Graph = None
    print(f"{self.trainDataSize} interactions for training")
    print(f"{self.testDataSize} interactions for testing")
    print(
        f"{world.dataset} Sparsity : {(self.trainDataSize + self.testDataSize) / self.n_users / self.m_items}"
    )

    # (users, items), bipartite graph
    self.UserItemNet = csr_matrix(
        (np.ones(len(self.trainUser)), (self.trainUser, self.trainItem)),
        shape=(self.n_user, self.m_item))
    self.users_D = np.array(self.UserItemNet.sum(axis=1)).squeeze()
    self.users_D[self.users_D == 0.] = 1
    self.items_D = np.array(self.UserItemNet.sum(axis=0)).squeeze()
    self.items_D[self.items_D == 0.] = 1.
    # pre-calculate
    self._allPos = self.getUserPosItems(list(range(self.n_user)))
    self.__testDict = self.__build_test()
    print(f"{world.dataset} is ready to go")
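
# ----------------------------------------------------------------------------
# hedged sketch (illustration only): the train.txt / test.txt layout this
# parser expects -- one user per line, the user id followed by every item id
# that user interacted with. The ids below are made up.
#
#   0 104 208 5 77
#   1 33 19
#   2 208 5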
bpr = utils.BPRLoss(Recmodel, world.config)

best_result = {
    'recall': np.array([0.0]),
    'precision': np.array([0.0]),
    'ndcg': np.array([0.0]),
    'auc': np.array([0.0])
}

weight_file = utils.getFileName()
print(f"load and save to {weight_file}")
print(Recmodel)
if world.LOAD:
    try:
        Recmodel.load_state_dict(
            torch.load(weight_file, map_location=torch.device('cpu')))
        world.cprint(f"loaded model weights from {weight_file}")
    except FileNotFoundError:
        print(f"{weight_file} does not exist, starting from scratch")

# init tensorboard
if world.tensorboard:
    w: SummaryWriter = SummaryWriter(
        join(world.BOARD_PATH,
             time.strftime("%m-%d-%Hh%Mm%Ss-") + "-" + world.comment))
else:
    w = None
    world.cprint("tensorboard is not enabled")

try:
    for epoch in range(world.TRAIN_epochs):
        print('======================')
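
# ----------------------------------------------------------------------------
# hedged sketch (not in the original source): what one training step inside
# the epoch loop might look like, using the BPRLoss.stageOne defined in utils.
# `sample_batch` is a hypothetical stand-in for the repo's negative sampler.
#
#     for epoch in range(world.TRAIN_epochs):
#         users, pos, neg = sample_batch(dataset)      # hypothetical sampler
#         aver_loss = bpr.stageOne(users, pos, neg)    # one BPR update
#         if w is not None:
#             w.add_scalar('Loss/BPR', aver_loss, epoch)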
def __init__(self, config=world.config, path="../data/gowalla"):
    # train or test
    cprint(f'loading [{path}]')
    print(config)
    self.split = config['A_split']
    self.folds = config['A_n_fold']
    self.mode_dict = {'train': 0, "test": 1}
    self.mode = self.mode_dict['train']
    self.n_user = 0
    self.m_item = 0
    train_file = path + '/train.txt'
    test_file = path + '/test.txt'
    self.path = path
    trainUniqueUsers, trainItem, trainUser = [], [], []
    testUniqueUsers, testItem, testUser = [], [], []
    self.traindataSize = 0
    self.testDataSize = 0

    with open(train_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                l = l.strip('\n').split(' ')
                items = [int(i) if i != '' else -1 for i in l[1:]]
                uid = int(l[0])
                trainUniqueUsers.append(uid)
                trainUser.extend([uid] * len(items))
                trainItem.extend(items)
                self.m_item = max(self.m_item, max(items))
                self.n_user = max(self.n_user, uid)
                self.traindataSize += len(items)
    self.trainUniqueUsers = np.array(trainUniqueUsers)
    self.trainUser = np.array(trainUser)
    self.trainItem = np.array(trainItem)

    with open(test_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                l = l.strip('\n').split(' ')
                items = [int(i) if i != '' else -1 for i in l[1:]]
                uid = int(l[0])
                testUniqueUsers.append(uid)
                testUser.extend([uid] * len(items))
                testItem.extend(items)
                self.m_item = max(self.m_item, max(items))
                self.n_user = max(self.n_user, uid)
                self.testDataSize += len(items)
    self.m_item += 1
    self.n_user += 1
    self.testUniqueUsers = np.array(testUniqueUsers)
    self.testUser = np.array(testUser)
    self.testItem = np.array(testItem)

    self.UserItemNet = csr_matrix(
        (np.ones(len(self.trainUser)), (self.trainUser, self.trainItem)),
        shape=(self.n_user, self.m_item))
    self._allPos = self.getUserPosItems(list(range(self.n_user)))

    # bipartite graph: re-index items after users so both live in one id space
    self.trainItem += self.n_user
    # self.testItem = self.n_user
    print(self.trainItem, self.trainItem.shape)  # debug print
    first_sub = np.stack([self.trainUser, self.trainItem])
    second_sub = np.stack([self.trainItem, self.trainUser])
    # interleave (user, item) and (item, user) pairs into a (2E, 2) edge list
    self.train_edge = np.concatenate(
        [first_sub.reshape(-1, 1), second_sub.reshape(-1, 1)], axis=-1)
    self.train_edge = sorted(self.train_edge, key=lambda x: x[0])
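
# ----------------------------------------------------------------------------
# hedged sketch (illustration only): why the reshape/concatenate above yields
# an undirected edge list. For E interactions it produces a (2E, 2) array
# holding each (user, item) pair followed by its (item, user) reverse. Tiny
# numpy demo with made-up ids:
def _sketch_edge_list():
    import numpy as np
    users = np.array([0, 1])
    items = np.array([5, 6])                # already re-indexed after users
    first_sub = np.stack([users, items])    # (2, E)
    second_sub = np.stack([items, users])   # (2, E)
    edges = np.concatenate(
        [first_sub.reshape(-1, 1), second_sub.reshape(-1, 1)], axis=-1)
    # rows: (0, 5), (1, 6), (5, 0), (6, 1)
    assert edges.shape == (2 * len(users), 2)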
def __init__(self, config=world.config, path="../data/movielens"):
    super(Movie, self).__init__()
    # train or test
    cprint(f'loading [{path}]')
    self.aspect = config['aspect']
    self.split = config['A_split']
    self.folds = config['A_n_fold']
    self.mode_dict = {'train': 0, 'test': 1}
    self.mode = self.mode_dict['train']
    self.n_user = 0
    self.n_all_item = []
    test_file = path + '/test.txt'
    train_file = path + '/train.txt'
    item_info_file = path + '/map-table.csv'
    self.path = path
    train_unique_users, train_a_item, train_user = [], [], []
    for i in range(len(self.aspect)):
        train_a_item.append([])
    test_unique_users, test_item, test_user = [], [], []
    self.traindataSize = 0
    self.testdataSize = 0

    self.map_table = pd.read_csv(item_info_file, sep=',', header=0)
    for i in range(len(self.aspect)):
        aspect_i = set(list(self.map_table[self.aspect[i]].values))
        self.n_all_item.append(len(aspect_i))
    self.n_all_item[1] += 1  # aspect 1 apparently needs one extra id slot

    with open(train_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                l = l.strip('\n').split(' ')
                items = [int(i) for i in l[1:]]
                uid = int(l[0])
                train_unique_users.append(uid)
                train_user.extend([uid] * len(items))
                train_a_item[0].extend(items)
                self.n_user = max(self.n_user, uid)
                self.traindataSize += len(items)
    self.trainUniqueUsers = np.array(train_unique_users)
    self.trainUser = np.array(train_user)
    # look up the other aspects of every training item in the map table
    for i in range(1, len(self.aspect)):
        train_aspect_i = list(
            self.map_table.iloc[train_a_item[0]][self.aspect[i]].values)
        train_a_item[i].extend(train_aspect_i)
    self.trainAllItem = []
    for i in range(len(self.aspect)):
        self.trainAllItem.append(np.array(train_a_item[i]))
    # self.trainAllItem = np.array(train_a_item)

    with open(test_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                l = l.strip('\n').split(' ')
                items = [int(i) for i in l[1:]]
                uid = int(l[0])
                test_unique_users.append(uid)
                test_user.extend([uid] * len(items))
                test_item.extend(items)
                self.n_user = max(self.n_user, uid)
                self.testdataSize += len(items)
    self.testUniqueUsers = np.array(test_unique_users)
    self.testUser = np.array(test_user)
    self.testItem = np.array(test_item)
    self.n_user += 1

    self.Graph = None
    print(f"{self.traindataSize} interactions for training")
    print(f"{self.testdataSize} interactions for testing")
    print(
        f"{world.dataset} Sparsity: "
        f"{(self.traindataSize + self.testdataSize) / self.n_users / self.n_all_item[0]}"
    )

    # bipartite graph, one interaction matrix per aspect
    self.InteractNet = []
    self.users_D = []
    self.all_items_D = []
    for i in range(len(self.aspect)):
        self.InteractNet.append(
            csr_matrix((np.ones(len(self.trainUser)),
                        (self.trainUser, self.trainAllItem[i]))))
        self.users_D.append(
            np.array(self.InteractNet[i].sum(axis=1)).squeeze())
        self.users_D[i][self.users_D[i] == 0.] = 1
        self.all_items_D.append(
            np.array(self.InteractNet[i].sum(axis=0)).squeeze())
        self.all_items_D[i][self.all_items_D[i] == 0.] = 1

    # pre-calculate
    self._allPos = self.getUserPosItems(list(range(self.n_user)))
    self.__testDict = self.__build_test()
    print(f"{world.dataset} is ready to go")
from time import time

from utils import shapes, combinations, timer
import world
from world import cprint
from model import PairWiseModel, LightGCN
from dataloader import BasicDataset
import torch
from torch.nn import Softmax, Sigmoid
import torch.nn.functional as F

try:
    from cppimport import imp_from_filepath
    from os.path import join, dirname
    path = join(dirname(__file__), "sources/sampling.cpp")
    sampling = imp_from_filepath(path)
    sampling.seed(world.SEED)
    sample_ext = True
except:
    world.cprint("Cpp ext not loaded")
    sample_ext = False

ALLPOS = None


# ----------------------------------------------------------------------------
# distill


def userAndMatrix(batch_users, batch_items, model):
    """cal scores between user vector and item matrix

    Args:
        batch_users (tensor): vector (batch_size)
        batch_items (tensor): matrix (batch_size, dim_item)
        model (PairWiseModel): the model used to score (user, item) pairs

    Returns:
        tensor: scores with shape (batch_size, dim_item)
    """
    # sketch completion of the truncated body: tile each user id across its
    # row of candidate items, score all pairs in one call, then restore the
    # (batch_size, dim_item) layout
    dim_item = batch_items.shape[-1]
    vector_user = batch_users.repeat((dim_item, 1)).t().reshape((-1, ))
    vector_item = batch_items.reshape((-1, ))
    return model(vector_user, vector_item).reshape((-1, dim_item))
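
# ----------------------------------------------------------------------------
# hedged sketch (illustration only): expected shapes when calling
# userAndMatrix. The sizes and the `student` model are made up.
#
#     batch_users = torch.arange(32)                 # (32,)
#     batch_items = torch.randint(0, 1000, (32, 8))  # (32, 8) candidates
#     scores = userAndMatrix(batch_users, batch_items, student)
#     scores.shape  # -> torch.Size([32, 8])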
import world
from dataloader import BasicDataset
from time import time
from model import LightGCN
from model import PairWiseModel
from torch import optim
from sklearn.metrics import roc_auc_score
import random
import os

try:
    from cppimport import imp_from_filepath
    from os.path import join, dirname
    path = join(dirname(__file__), "sources/sampling.cpp")
    sampling = imp_from_filepath(path)
    sampling.seed(world.seed)
    sample_ext = True
except:
    world.cprint("Cpp extension not loaded")
    sample_ext = False


class BPRLoss:
    def __init__(self, recmodel: PairWiseModel, config: dict):
        self.model = recmodel
        self.weight_decay = config['decay']
        self.lr = config['lr']
        self.opt = optim.Adam(recmodel.parameters(), lr=self.lr)

    def stageOne(self, users, pos, neg):
        t1 = time()  # timing hook (unused in this excerpt)
        loss, reg_loss = self.model.bpr_loss(users, pos, neg)
        # sketch completion of the truncated body: apply weight decay and
        # take one Adam step
        reg_loss = reg_loss * self.weight_decay
        loss = loss + reg_loss
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.cpu().item()
print(f"[SEED:{world.SEED}]") # ---------------------------------------------------------------------------- # init model import register from register import dataset # ---------------------------------------------------------------------------- # loading teacher teacher_file = utils.getFileName(world.model_name, world.dataset, world.config['teacher_dim'], layers=world.config['teacher_layer']) teacher_file = "teacher-" + teacher_file teacher_weight_file = os.path.join(world.FILE_PATH, teacher_file) print('-------------------------') world.cprint("loaded teacher weights from") print(teacher_weight_file) print('-------------------------') teacher_config = utils.getTeacherConfig(world.config) world.cprint('teacher') teacher_model = register.MODELS[world.model_name](teacher_config, dataset, fix=True) teacher_model.eval() utils.load(teacher_model, teacher_weight_file) # ---------------------------------------------------------------------------- # ---------------------------------------------------------------------------- # loading student world.cprint('student') if world.EMBEDDING:
def __init__(self, config=world.config, path="../data/gowalla"):
    # train or test
    cprint(f'loading [{path}]')
    self.split = config['A_split']
    self.folds = config['A_n_fold']
    self.mode_dict = {'train': 0, "test": 1}
    self.mode = self.mode_dict['train']
    self.__n_users = 0
    self.__m_items = 0
    train_file = path + '/train.txt'
    valid_file = path + '/valid.txt'
    test_file = path + '/test.txt'
    self.path = path
    trainUniqueUsers, trainItem, trainUser = [], [], []
    validUniqueUsers, validItem, validUser = [], [], []
    testUniqueUsers, testItem, testUser = [], [], []
    self.__trainsize = 0
    self.validDataSize = 0
    self.testDataSize = 0

    with open(train_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                l = l.strip('\n').split(' ')
                items = [int(i) for i in l[1:]]
                uid = int(l[0])
                trainUniqueUsers.append(uid)
                trainUser.extend([uid] * len(items))
                trainItem.extend(items)
                self.__m_items = max(self.__m_items, max(items))
                self.__n_users = max(self.__n_users, uid)
                self.__trainsize += len(items)
    self.trainUniqueUsers = np.array(trainUniqueUsers)
    self.trainUser = np.array(trainUser)
    self.trainItem = np.array(trainItem)

    with open(valid_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                l = l.strip('\n').split(' ')
                items = [int(i) for i in l[1:]]
                uid = int(l[0])
                validUniqueUsers.append(uid)
                validUser.extend([uid] * len(items))
                validItem.extend(items)
                self.__m_items = max(self.__m_items, max(items))
                self.__n_users = max(self.__n_users, uid)
                self.validDataSize += len(items)
    self.validUniqueUsers = np.array(validUniqueUsers)
    self.validUser = np.array(validUser)
    self.validItem = np.array(validItem)

    with open(test_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                l = l.strip('\n').split(' ')
                try:
                    items = [int(i) for i in l[1:]]
                except:
                    print("user data error", l)
                    continue  # skip malformed lines instead of reusing stale items
                uid = int(l[0])
                testUniqueUsers.append(uid)
                testUser.extend([uid] * len(items))
                testItem.extend(items)
                self.__m_items = max(self.__m_items, max(items))
                self.__n_users = max(self.__n_users, uid)
                self.testDataSize += len(items)
    self.__m_items += 1
    self.__n_users += 1
    self.testUniqueUsers = np.array(testUniqueUsers)
    self.testUser = np.array(testUser)
    self.testItem = np.array(testItem)

    # if world.ALLDATA:
    #     self._trainUser = self.trainUser
    #     self._trainItem = self.trainItem
    #     self.trainUser = np.concatenate([self.trainUser, self.testUser])
    #     self.trainItem = np.concatenate([self.trainItem, self.testItem])
    #     self.__trainsize += self.testDataSize
    # elif world.TESTDATA:
    #     self.__trainsize = self.testDataSize
    #     self.trainUser = self.testUser
    #     self.trainItem = self.testItem

    self.Graph = None
    print(f"({self.n_users} X {self.m_items})")
    print(f"{self.trainDataSize} interactions for training")
    print(f"{self.validDataSize} interactions for validation")
    print(f"{self.testDataSize} interactions for testing")
    print(
        f"{world.dataset} Sparsity : {(self.trainDataSize + self.validDataSize + self.testDataSize) / self.n_users / self.m_items}"
    )

    # (users, items), bipartite graph
    self.UserItemNet = csr_matrix(
        (np.ones(len(self.trainUser)), (self.trainUser, self.trainItem)),
        shape=(self.__n_users, self.__m_items),
        dtype='int')
    # pre-calculate
    self.__allPos = self.getUserPosItems(list(range(self.__n_users)))
    self.__testDict = self.build_dict(self.testUser, self.testItem)
    self.__validDict = self.build_dict(self.validUser, self.validItem)
    if world.ALLDATA:
        # NOTE: this branch reads self._trainUser/_trainItem, which are only
        # assigned in the commented-out ALLDATA block above
        self.UserItemNet = csr_matrix(
            (np.ones(len(self._trainUser)),
             (self._trainUser, self._trainItem)),
            shape=(self.__n_users, self.__m_items),
            dtype='int')
    print(f"{world.dataset} is ready to go")
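
# ----------------------------------------------------------------------------
# hedged sketch (not in the original source): a plausible shape for
# build_dict, which groups the flat (user, item) arrays into {user: [items]}.
# This is an assumption about a method whose body is not shown above.
def _sketch_build_dict(users, items):
    from collections import defaultdict
    data = defaultdict(list)
    for u, i in zip(users, items):
        data[u].append(i)
    return dict(data)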
def __init__(self, config=world.config, path="../data/gowalla_one"):
    cprint(f'loading [{path}]')
    self.path = path
    self.split = False
    self.__n_users = 0
    self.__m_items = 0
    train_file = path + '/train.txt'
    valid_file = path + '/valid.txt'
    test_file = path + '/test.txt'
    trainUser, trainItem = [], []
    validUser, validItem = [], []
    testUser, testItem = [], []

    with open(train_file) as f:
        for line in f.readlines():
            user, item, _ = line.strip().split()
            trainUser.append(int(user))
            trainItem.append(int(item))
    with open(valid_file) as f:
        for line in f.readlines():
            user, item, _ = line.strip().split()
            validUser.append(int(user))
            validItem.append(int(item))
    with open(test_file) as f:
        for line in f.readlines():
            user, item, _ = line.strip().split()
            testUser.append(int(user))
            testItem.append(int(item))

    self.__n_users = len(testUser)
    self.__m_items = max(max(trainItem), max(testItem))
    self.__trainsize = len(trainUser)
    min_index = np.min(trainUser)
    self.trainUser = np.array(trainUser) - min_index
    self.trainItem = np.array(trainItem) - min_index
    self.validUser = np.array(validUser) - min_index
    self.validItem = np.array(validItem) - min_index
    self.testUser = np.array(testUser) - min_index
    self.testItem = np.array(testItem) - min_index
    self.__m_items += 1 - min_index
    assert len(testUser) == (max(trainUser) + 1 - min_index)

    if world.ALLDATA:
        self._trainUser = self.trainUser
        self._trainItem = self.trainItem
        self.trainUser = np.concatenate([self.trainUser, self.testUser])
        self.trainItem = np.concatenate([self.trainItem, self.testItem])
        self.__trainsize += len(testUser)
    elif world.TESTDATA:
        self.__trainsize = len(testUser)
        self.trainUser = self.testUser
        self.trainItem = self.testItem

    self.Graph = None
    print(f"({self.n_users} X {self.m_items})")
    print(f"{self.trainDataSize} interactions for training")
    print(f"{len(testUser)} interactions for testing")
    print(f"{len(validUser)} interactions for validating")
    print(
        f"{world.dataset} Sparsity : {(self.trainDataSize + len(validUser) + len(testUser)) / self.n_users / self.m_items}"
    )

    self.UserItemNet = csr_matrix(
        (np.ones(len(self.trainUser)), (self.trainUser, self.trainItem)),
        shape=(self.n_users, self.m_items))
    self.users_D = np.array(self.UserItemNet.sum(axis=1)).squeeze()
    self.users_D[self.users_D == 0.] = 1
    self.items_D = np.array(self.UserItemNet.sum(axis=0)).squeeze()
    self.items_D[self.items_D == 0.] = 1.
    # pre-calculate
    self.__allPos = self.getUserPosItems(list(range(self.__n_users)))
    self.__testDict = self.build_dict(self.testUser, self.testItem)
    self.__validDict = self.build_dict(self.validUser, self.validItem)
    if world.ALLDATA:
        self.UserItemNet = csr_matrix(
            (np.ones(len(self._trainUser)),
             (self._trainUser, self._trainItem)),
            shape=(self.n_users, self.m_items))
    print(f"{world.dataset} is ready to go")
        # copy in place: rebinding the dict key returned by state_dict()
        # would not update the module's parameters
        Recmodel.state_dict()[key].copy_(val)

# FREEZE HERE
# Recmodel.embedding_user.weight.requires_grad = False
# Recmodel.embedding_item.weight.requires_grad = False

Neg_k = 1

# init tensorboard
if world.tensorboard:
    w: SummaryWriter = SummaryWriter(
        join(world.BOARD_PATH,
             time.strftime("%m-%d-%Hh%Mm%Ss-") + "-" + world.comment))
else:
    w = None
    world.cprint("tensorboard is not enabled")

# Store the model size in a variable:
# mb_params = 1e-6 * sum([param.nelement() * param.element_size()
#                         for param in Recmodel.parameters()])

try:
    for epoch in range(world.TRAIN_epochs):
        start = time.time()
        if epoch % 60 == 0:
            cprint("[TEST]")
            # torch.cuda.reset_max_memory_allocated()  # reset max memory stats for next iter
            test_t0 = time.time()
            Procedure.Test(dataset, Recmodel, epoch, w,
                           world.config['multicore'])
            test_t1 = time.time()
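
# ----------------------------------------------------------------------------
# hedged sketch (not in the original source): a safer pattern for the partial
# checkpoint load above -- filter the checkpoint to keys the model actually
# has, then load non-strictly. `checkpt` is the loaded checkpoint dict assumed
# to exist in the surrounding script.
#
#     own_keys = set(Recmodel.state_dict().keys())
#     filtered = {k: v for k, v in checkpt.items() if k in own_keys}
#     Recmodel.load_state_dict(filtered, strict=False)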