class Recommender(object):
    """
    Contains the attributes and methods needed to train a sequential
    recommendation model. Models are trained on many tuples of
    (users, sequences, targets, negatives), where the negatives come from
    negative sampling: for any known tuple of (user, sequence, targets),
    one or more items are randomly sampled to act as negatives.

    Parameters
    ----------
    n_iter: int,
        Number of iterations to run.
    batch_size: int,
        Minibatch size.
    l2: float,
        L2 loss penalty, also known as the 'lambda' of L2 regularization.
    neg_samples: int,
        Number of negative samples to generate for each target.
        If targets=3 and neg_samples=3, then it will sample 9 negatives.
    learning_rate: float,
        Initial learning rate.
    use_cuda: boolean,
        Run the model on a GPU or CPU.
    model_args: args,
        Model-related arguments, like latent dimensions.
    """

    def __init__(self,
                 n_iter=None,
                 batch_size=None,
                 l2=None,
                 neg_samples=None,
                 learning_rate=None,
                 use_cuda=False,
                 checkpoint=None,
                 model_args=None):

        # model related
        self._num_items = None
        self._num_users = None
        self._net = None
        self.model_args = model_args

        # learning related
        self._batch_size = batch_size
        self._n_iter = n_iter
        self._learning_rate = learning_rate
        self._l2 = l2
        self._neg_samples = neg_samples
        self._device = torch.device("cuda" if use_cuda else "cpu")
        self.checkpoint = checkpoint

        # rank evaluation related
        self.test_sequence = None
        self._candidate = dict()

    @property
    def _initialized(self):
        return self._net is not None

    def _initialize(self, interactions):
        self._num_items = interactions.num_items
        self._num_users = interactions.num_users

        self.test_sequence = interactions.test_sequences

        self._net = Caser(self._num_users, self._num_items, self.model_args)

        self._optimizer = optim.Adam(self._net.parameters(),
                                     weight_decay=self._l2,
                                     lr=self._learning_rate)

    def fit(self, train, test, verbose=False):
        """
        The general training loop to fit the model.

        Parameters
        ----------
        train: :class:`interactions.Interactions`
            training instances, also contains test sequences
        test: :class:`interactions.Interactions`
            only contains targets for test sequences
        verbose: bool, optional
            whether to print the logs
        """
        # convert sequences, targets and users to numpy arrays
        sequences_np = train.sequences.sequences
        targets_np = train.sequences.targets
        users_np = train.sequences.user_ids.reshape(-1, 1)

        L, T = train.sequences.L, train.sequences.T

        n_train = sequences_np.shape[0]
        print('total training instances: %d' % n_train)

        if not self._initialized:
            self._initialize(train)

        start_epoch = 1
        if self.checkpoint:
            print("loading checkpoint from %s" % self.checkpoint)
            checkpoint = torch.load(self.checkpoint)
            start_epoch = checkpoint['epoch_num']
            self._net.load_state_dict(checkpoint['state_dict'])
            self._optimizer.load_state_dict(checkpoint['optimizer'])
            print("loaded checkpoint %s (epoch %d)" % (self.checkpoint, start_epoch))

        # compute the number of parameters
        print("Number of params: %d" % compute_model_size(self._net))

        for epoch_num in range(start_epoch, self._n_iter + 1):

            t1 = time()

            # set the model to training mode and move it to the corresponding device
            self._net.train()
            self._net = self._net.to(self._device)

            users_np, sequences_np, targets_np = shuffle(users_np,
                                                         sequences_np,
                                                         targets_np)

            negatives_np = self._generate_negative_samples(users_np, train,
                                                           n=self._neg_samples)

            # convert numpy arrays to PyTorch tensors and move them to the corresponding device
            users, sequences, targets, negatives = (torch.from_numpy(users_np).long(),
                                                    torch.from_numpy(sequences_np).long(),
                                                    torch.from_numpy(targets_np).long(),
                                                    torch.from_numpy(negatives_np).long())
            users, sequences, targets, negatives = (users.to(self._device),
                                                    sequences.to(self._device),
                                                    targets.to(self._device),
                                                    negatives.to(self._device))

            epoch_loss = 0.0

            for (minibatch_num,
                 (batch_users, batch_sequences,
                  batch_targets, batch_negatives)) in enumerate(
                    minibatch(users, sequences, targets, negatives,
                              batch_size=self._batch_size)):

                # concatenate all variables to get the predictions in one run
                items_to_predict = torch.cat((batch_targets, batch_negatives), 1)
                items_prediction = self._net(batch_sequences, batch_users,
                                             items_to_predict)

                (targets_prediction,
                 negatives_prediction) = torch.split(items_prediction,
                                                     [batch_targets.size(1),
                                                      batch_negatives.size(1)], dim=1)

                self._optimizer.zero_grad()

                # compute the binary cross-entropy loss
                loss = sigmoid_log_loss(targets_prediction, negatives_prediction)
                epoch_loss += loss.item()

                loss.backward()
                self._optimizer.step()

            epoch_loss /= minibatch_num + 1

            t2 = time()
            if verbose and epoch_num % 10 == 0:
                precision, recall, ndcg, mean_aps = evaluate_ranking(self, test, train,
                                                                     k=[3, 5, 10])

                str_precs = "precisions=%.4f,%.4f,%.4f" % tuple([np.mean(a) for a in precision])
                str_recalls = "recalls=%.4f,%.4f,%.4f" % tuple([np.mean(a) for a in recall])
                str_ndcgs = "ndcgs=%.4f,%.4f,%.4f" % tuple([np.mean(a) for a in ndcg])

                output_str = "Epoch %d [%.1f s]\tloss=%.4f, " \
                             "map=%.4f, %s, %s, %s[%.1f s]" % (epoch_num, t2 - t1,
                                                               epoch_loss, mean_aps,
                                                               str_precs, str_recalls,
                                                               str_ndcgs, time() - t2)
                print(output_str)
            else:
                output_str = "Epoch %d [%.1f s]\tloss=%.4f [%.1f s]" % (epoch_num, t2 - t1,
                                                                        epoch_loss,
                                                                        time() - t2)
                print(output_str)

            _save_checkpoint({
                'epoch_num': epoch_num,
                'state_dict': self._net.state_dict(),
                'optimizer': self._optimizer.state_dict(),
            }, 'checkpoints/gowalla-caser-dim=%d.pth.tar' % self.model_args.d)

    def _generate_negative_samples(self, users, interactions, n):
        """
        Sample negative items from a candidate set for each user.

        The candidate set of each user is defined by:
        {All Items} \ {Items Rated by User}

        Parameters
        ----------
        users: array of np.int64
            sequence users
        interactions: :class:`interactions.Interactions`
            training instances, used to generate the candidates
        n: int
            total number of negatives to sample for each sequence
        """
        users_ = users.squeeze()
        negative_samples = np.zeros((users_.shape[0], n), np.int64)
        if not self._candidate:
            all_items = np.arange(interactions.num_items - 1) + 1  # item 0 is reserved for padding
            train = interactions.tocsr()
            for user, row in enumerate(train):
                self._candidate[user] = list(set(all_items) - set(row.indices))

        for i, u in enumerate(users_):
            for j in range(n):
                x = self._candidate[u]
                negative_samples[i, j] = x[np.random.randint(len(x))]

        return negative_samples

    def predict(self, user_id, item_ids=None, model=None):
        """
        Make predictions for evaluation: given a user id, it will first
        retrieve the test sequence associated with that user and compute
        the recommendation scores for the items.

        Parameters
        ----------
        user_id: int
            the user id for which prediction scores are needed
        item_ids: array, optional
            Array containing the item ids for which prediction scores are
            desired. If not supplied, predictions for all items will be
            computed.
        """
        if self.test_sequence is None:
            raise ValueError('Missing test sequences, cannot make predictions')

        if model is None:
            model = self._net

        # set the model to evaluation mode
        model.eval()

        with torch.no_grad():
            sequences_np = self.test_sequence.sequences[user_id, :]
            sequences_np = np.atleast_2d(sequences_np)

            if item_ids is None:
                item_ids = np.arange(self._num_items).reshape(-1, 1)

            sequences = torch.from_numpy(sequences_np.astype(np.int64).reshape(1, -1))
            item_ids = torch.from_numpy(item_ids.astype(np.int64))
            user_id = torch.from_numpy(np.array([[user_id]]).astype(np.int64))

            user, sequences, items = (user_id.to(self._device),
                                      sequences.to(self._device),
                                      item_ids.to(self._device))

            out = model(sequences, user, items, for_pred=True)

        return cpu(out.data).numpy().flatten()
class DistilledRecommender(Recommender):
    """
    Contains the attributes and methods needed to train a sequential
    recommendation model with ranking distillation [1]. Models are trained
    on many tuples of (users, sequences, targets, negatives), where the
    negatives come from negative sampling: for any known tuple of
    (user, sequence, targets), one or more items are randomly sampled to
    act as negatives.

    [1] Ranking Distillation: Learning Compact Ranking Models With High
        Performance for Recommender System, Jiaxi Tang and Ke Wang, KDD '18

    Parameters
    ----------
    n_iter: int,
        Number of iterations to run.
    batch_size: int,
        Minibatch size.
    l2: float,
        L2 loss penalty, also known as the 'lambda' of L2 regularization.
    neg_samples: int,
        Number of negative samples to generate for each target.
    learning_rate: float,
        Initial learning rate.
    use_cuda: boolean,
        Run the model on a GPU or CPU.
    teacher_model_path: string,
        Path to the teacher's model checkpoint.
    teacher_topk_path: string,
        Path to the teacher's cached top-K ranking for each training instance.
    lamda: float
        Hyperparameter for tuning the sharpness of the position importance weight.
    mu: float
        Hyperparameter for tuning the sharpness of the ranking discrepancy weight.
    num_dynamic_samples: int
        Number of samples used for estimating the student's rank.
    dynamic_start_epoch: int
        Epoch after which the hybrid of the two weights is used.
    K: int
        Length of the teacher's exemplary ranking.
    teach_alpha: float
        Weight for balancing the ranking loss and the distillation loss.
    student_model_args: args,
        Student-model-related arguments, like latent dimensions.
    teacher_model_args: args,
        Teacher-model-related arguments, like latent dimensions.
    """

    def __init__(self,
                 n_iter=None,
                 batch_size=None,
                 l2=None,
                 neg_samples=None,
                 learning_rate=None,
                 use_cuda=False,
                 teacher_model_path=None,
                 teacher_topk_path=None,
                 lamda=None,
                 mu=None,
                 num_dynamic_samples=None,
                 dynamic_start_epoch=None,
                 K=None,
                 teach_alpha=None,
                 student_model_args=None,
                 teacher_model_args=None):

        # data related
        self.L = None
        self.T = None

        # model related
        self._num_items = None
        self._num_users = None
        self._teacher_net = None  # teacher model
        self._student_net = None  # student model
        self._student_model_args = student_model_args
        self._teacher_model_args = teacher_model_args

        # learning related
        self._batch_size = batch_size
        self._n_iter = n_iter
        self._learning_rate = learning_rate
        self._l2 = l2
        self._neg_samples = neg_samples
        self._device = torch.device("cuda" if use_cuda else "cpu")

        # ranking distillation related
        self._teach_alpha = teach_alpha
        self._lambda = lamda
        self._mu = mu
        self._num_dynamic_samples = num_dynamic_samples
        self._dynamic_start_epoch = dynamic_start_epoch
        self._K = K
        self._teacher_model_path = teacher_model_path
        self._teacher_topk_path = teacher_topk_path
        self._weight_renormalize = False

        # rank evaluation related
        self.test_sequence = None
        self._candidate = dict()

    @property
    def _teacher_initialized(self):
        return self._teacher_net is not None

    def _initialize_teacher(self, interactions):
        # initialize the teacher model
        self._num_items = interactions.num_items
        self._num_users = interactions.num_users

        self._teacher_net = Caser(self._num_users,
                                  self._num_items,
                                  self._teacher_model_args)

        # load the teacher model
        if os.path.isfile(self._teacher_model_path):
            print("loading teacher model from %s" % self._teacher_model_path)
            checkpoint = torch.load(self._teacher_model_path)
            self._teacher_net.load_state_dict(checkpoint['state_dict'])
            print("loaded model %s (epoch %d)" % (self._teacher_model_path,
                                                  checkpoint['epoch_num']))
        else:
            print("no model found at %s" % self._teacher_model_path)

        # set the teacher model to evaluation mode
        self._teacher_net.eval()

    @property
    def _student_initialized(self):
        return self._student_net is not None

    def _initialize_student(self, interactions):
        self._num_items = interactions.num_items
        self._num_users = interactions.num_users

        self.test_sequence = interactions.test_sequences

        self._student_net = Caser(self._num_users,
                                  self._num_items,
                                  self._student_model_args)

        self._optimizer = optim.Adam(self._student_net.parameters(),
                                     weight_decay=self._l2,
                                     lr=self._learning_rate)

    def fit(self, train, test, verbose=False):
        """
        The general training loop to fit the model.

        Parameters
        ----------
        train: :class:`interactions.Interactions`
            training instances, also contains test sequences
        test: :class:`interactions.Interactions`
            only contains targets for test sequences
        verbose: bool, optional
            whether to print the logs
        """
        # convert sequences, targets and users to numpy arrays
        sequences_np = train.sequences.sequences
        targets_np = train.sequences.targets
        users_np = train.sequences.user_ids.reshape(-1, 1)

        self.L, self.T = train.sequences.L, train.sequences.T

        n_train = sequences_np.shape[0]
        print('total training instances: %d' % n_train)

        if not self._teacher_initialized:
            self._initialize_teacher(train)
        if not self._student_initialized:
            self._initialize_student(train)

        # here we pre-compute the teacher's top-K ranking for each training instance
        # for faster training; it would have to be computed on the fly if it were
        # too large to keep in memory
        if os.path.isfile(self._teacher_topk_path):
            print('found teacher topk file, loading..')
            teacher_ranking = np.load(self._teacher_topk_path)
        else:
            print('teacher topk file not found, generating..')
            teacher_ranking = self._get_teacher_topk(sequences_np, users_np,
                                                     targets_np, k=self._K)

        # initialize the static weight (position importance weight)
        weight_static = np.array(range(1, self._K + 1), dtype=np.float32)
        weight_static = np.exp(-weight_static / self._lambda)
        weight_static = weight_static / np.sum(weight_static)
        weight_static = torch.from_numpy(weight_static).to(self._device)
        weight_static = weight_static.unsqueeze(0)

        # initialize the dynamic weight (ranking discrepancy weight)
        weight_dynamic = None

        # count the number of parameters
        print("Number of params in teacher model: %d" % compute_model_size(self._teacher_net))
        print("Number of params in student model: %d" % compute_model_size(self._student_net))

        indices = np.arange(n_train)
        start_epoch = 1

        for epoch_num in range(start_epoch, self._n_iter + 1):

            t1 = time()

            # set the teacher model to evaluation mode and move it to the corresponding device
            self._teacher_net.eval()
            self._teacher_net = self._teacher_net.to(self._device)

            # set the student model to training mode and move it to the corresponding device
            self._student_net.train()
            self._student_net = self._student_net.to(self._device)

            (users_np, sequences_np, targets_np), shuffle_indices = shuffle(users_np,
                                                                            sequences_np,
                                                                            targets_np,
                                                                            indices=True)
            # keep the indices for retrieving the teacher's top-K ranking from the cache
            indices = indices[shuffle_indices]

            negatives_np = self._generate_negative_samples(users_np, train,
                                                           n=self._neg_samples)
            dynamic_samples_np = self._generate_negative_samples(users_np, train,
                                                                 n=self._num_dynamic_samples)

            # convert numpy arrays to PyTorch tensors and move them to the corresponding device
            users, sequences, targets, negatives = (torch.from_numpy(users_np).long(),
                                                    torch.from_numpy(sequences_np).long(),
                                                    torch.from_numpy(targets_np).long(),
                                                    torch.from_numpy(negatives_np).long())
            users, sequences, targets, negatives = (users.to(self._device),
                                                    sequences.to(self._device),
                                                    targets.to(self._device),
                                                    negatives.to(self._device))
            dynamic_samples = torch.from_numpy(dynamic_samples_np).long().to(self._device)

            epoch_loss = 0.0
            epoch_regular_loss = 0.0

            for (minibatch_num,
                 (batch_indices, batch_users, batch_sequences,
                  batch_targets, batch_negatives, batch_dynamics)) in enumerate(
                    minibatch(indices, users, sequences, targets, negatives,
                              dynamic_samples, batch_size=self._batch_size)):

                # retrieve the teacher's top-K ranking for the given indices
                batch_candidates = torch.from_numpy(
                    teacher_ranking[batch_indices, :]).long().to(self._device)

                # concatenate all variables to get the predictions in one run
                items_to_predict = torch.cat((batch_targets, batch_negatives,
                                              batch_candidates, batch_dynamics), 1)
                items_prediction = self._student_net(batch_sequences, batch_users,
                                                     items_to_predict)
                (targets_prediction, negatives_prediction,
                 candidates_prediction, dynamics_prediction) = torch.split(
                    items_prediction, [batch_targets.size(1),
                                       batch_negatives.size(1),
                                       batch_candidates.size(1),
                                       batch_dynamics.size(1)], dim=1)

                self._optimizer.zero_grad()

                if epoch_num > self._dynamic_start_epoch:
                    # compute the dynamic weight
                    dynamic_weights = list()
                    for col in range(self._K):
                        col_prediction = candidates_prediction[:, col].unsqueeze(1)

                        num_smaller_than = torch.sum(col_prediction < dynamics_prediction,
                                                     dim=1).float()
                        relative_rank = num_smaller_than / self._num_dynamic_samples
                        predicted_rank = torch.floor(self._num_items * relative_rank)

                        dynamic_weight = torch.tanh(self._mu * (predicted_rank - col))
                        dynamic_weight = torch.clamp(dynamic_weight, min=0.0)
                        dynamic_weights.append(dynamic_weight)
                    weight_dynamic = torch.stack(dynamic_weights, 1)

                    # combine the two weights
                    weight = weight_dynamic * weight_static
                    if self._weight_renormalize:
                        weight = F.normalize(weight, p=1, dim=1)
                else:
                    weight = weight_static

                # detach the weight to stop the gradient from flowing into it
                weight = weight.detach()

                loss, regular_loss = weighted_sigmoid_log_loss(targets_prediction,
                                                               negatives_prediction,
                                                               candidates_prediction,
                                                               weight,
                                                               self._teach_alpha)
                epoch_loss += loss.item()
                epoch_regular_loss += regular_loss.item()

                loss.backward()
                self._optimizer.step()

            epoch_loss /= minibatch_num + 1
            epoch_regular_loss /= minibatch_num + 1

            t2 = time()
            if verbose and epoch_num % 10 == 0:
                precision, recall, ndcg, mean_aps = evaluate_ranking(self, test, train,
                                                                     k=[3, 5, 10])

                str_precs = "precisions=%.4f,%.4f,%.4f" % tuple([np.mean(a) for a in precision])
                str_recalls = "recalls=%.4f,%.4f,%.4f" % tuple([np.mean(a) for a in recall])
                str_ndcgs = "ndcgs=%.4f,%.4f,%.4f" % tuple([np.mean(a) for a in ndcg])

                output_str = "Epoch %d [%.1f s]\tloss=%.4f, regular_loss=%.4f, " \
                             "map=%.4f, %s, %s, %s[%.1f s]" % (epoch_num, t2 - t1,
                                                               epoch_loss,
                                                               epoch_regular_loss,
                                                               mean_aps, str_precs,
                                                               str_recalls, str_ndcgs,
                                                               time() - t2)
                print(output_str)
            else:
                output_str = "Epoch %d [%.1f s]\tloss=%.4f, regular_loss=%.4f[%.1f s]" % (
                    epoch_num, t2 - t1, epoch_loss, epoch_regular_loss, time() - t2)
                print(output_str)

    def _get_teacher_topk(self, sequences, users, targets, k):
        """
        Pre-compute and cache the teacher's top-K ranking for each training
        instance. Doing this makes training with distillation much faster.

        Parameters
        ----------
        sequences: array of np.int64
            sequences of items
        users: array of np.int64
            users associated with each sequence
        targets: array of np.int64
            target items the user interacted with, given the sequence
        k: int
            length of the teacher's exemplary ranking
        """
        with_targets = False

        n_train = sequences.shape[0]
        indices = np.arange(n_train)

        users, sequences = torch.from_numpy(users).long(), torch.from_numpy(sequences).long()

        # teacher top-K results
        teacher_topk = np.zeros((n_train, k), dtype=np.int64)

        for (batch_indices, batch_users, batch_sequences,
             batch_targets) in minibatch(indices, users, sequences, targets,
                                         batch_size=16):
            cur_batch_size = batch_users.shape[0]

            all_items = torch.arange(start=0,
                                     end=self._num_items).repeat(cur_batch_size, 1).long()

            teacher_prediction = self._teacher_net(batch_sequences, batch_users,
                                                   all_items).detach()

            # take the top 2k items by column
            _, tops = teacher_prediction.topk(k * 2, dim=1)
            tops = tops.cpu().numpy()

            new_tops = np.concatenate((batch_targets, tops), axis=1)
            topks = np.zeros((cur_batch_size, k), dtype=np.int64)

            for i, row in enumerate(new_tops):
                _, idx = np.unique(row, return_index=True)
                # whether the teacher's top-K ranking should include the target items
                if with_targets:
                    topk = row[np.sort(idx)][:k]
                else:
                    topk = row[np.sort(idx)][self.T:k + self.T]
                topks[i, :] = topk

            teacher_topk[batch_indices, :] = topks

        np.save('gowalla-teacher-dim=%d-top=%d.npy' % (self._teacher_model_args.d, k),
                teacher_topk)
        return teacher_topk

    def predict(self, user_id, item_ids=None, model=None):
        return super(DistilledRecommender, self).predict(user_id, item_ids,
                                                         model=self._student_net)
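# ---------------------------------------------------------------------------
# Illustrative distillation sketch (not part of the original training pipeline).
# The checkpoint and top-K cache paths are placeholders; the teacher checkpoint
# is assumed to come from a previously trained, larger Recommender, and the
# hyperparameter values are only meant to show the call pattern.
# ---------------------------------------------------------------------------
def _example_train_distilled(train, test, student_args, teacher_args):
    """Sketch: fit a compact student model with ranking distillation."""
    model = DistilledRecommender(n_iter=50,
                                 batch_size=512,
                                 l2=1e-6,
                                 neg_samples=3,
                                 learning_rate=1e-3,
                                 use_cuda=torch.cuda.is_available(),
                                 teacher_model_path='checkpoints/teacher.pth.tar',  # placeholder
                                 teacher_topk_path='teacher-topk.npy',  # placeholder
                                 lamda=1.0,
                                 mu=0.1,
                                 num_dynamic_samples=100,
                                 dynamic_start_epoch=2,
                                 K=10,
                                 teach_alpha=1.0,
                                 student_model_args=student_args,
                                 teacher_model_args=teacher_args)
    model.fit(train, test, verbose=True)
    return model.predict(user_id=0)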