Example #1
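
Both listings below assume the usual third-party imports plus a handful of project-local helpers (Caser, the Interactions class, minibatch, shuffle, the loss functions and the evaluation utilities). The third-party imports are evident from the code; the project-local module paths shown commented out are assumptions, since the original listing starts at the class definition.

import os
from time import time

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# project-local helpers used by the classes below (module paths are assumptions):
# from caser import Caser
# from interactions import Interactions
# from utils import minibatch, shuffle, cpu, compute_model_size, _save_checkpoint
# from losses import sigmoid_log_loss, weighted_sigmoid_log_loss
# from evaluation import evaluate_ranking
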
class Recommender(object):
    """
    Contains the attributes and methods needed to train a sequential
    recommendation model. Models are trained on many tuples of
    (users, sequences, targets, negatives), where the negatives come from
    negative sampling: for any known tuple of (user, sequence, targets),
    one or more items are randomly sampled to act as negatives.


    Parameters
    ----------

    n_iter: int,
        Number of iterations to run.
    batch_size: int,
        Minibatch size.
    l2: float,
        L2 loss penalty, also known as the 'lambda' of l2 regularization.
    neg_samples: int,
        Number of negative samples to generate for each target.
        If targets=3 and neg_samples=3, then it will sample 9 negatives.
    learning_rate: float,
        Initial learning rate.
    use_cuda: boolean,
        Whether to run the model on a GPU.
    checkpoint: string, optional
        Path to a checkpoint file to resume training from.
    model_args: args,
        Model-related arguments, like latent dimensions.
    """
    def __init__(self,
                 n_iter=None,
                 batch_size=None,
                 l2=None,
                 neg_samples=None,
                 learning_rate=None,
                 use_cuda=False,
                 checkpoint=None,
                 model_args=None):

        # model related
        self._num_items = None
        self._num_users = None
        self._net = None
        self.model_args = model_args

        # learning related
        self._batch_size = batch_size
        self._n_iter = n_iter
        self._learning_rate = learning_rate
        self._l2 = l2
        self._neg_samples = neg_samples
        self._device = torch.device("cuda" if use_cuda else "cpu")
        self.checkpoint = checkpoint

        # rank evaluation related
        self.test_sequence = None
        self._candidate = dict()

    @property
    def _initialized(self):
        return self._net is not None

    def _initialize(self, interactions):
        self._num_items = interactions.num_items
        self._num_users = interactions.num_users

        self.test_sequence = interactions.test_sequences

        self._net = Caser(self._num_users, self._num_items, self.model_args)

        self._optimizer = optim.Adam(self._net.parameters(),
                                     weight_decay=self._l2,
                                     lr=self._learning_rate)

    def fit(self, train, test, verbose=False):
        """
        The general training loop to fit the model

        Parameters
        ----------

        train: :class:`interactions.Interactions`
            training instances, also contains test sequences
        test: :class:`interactions.Interactions`
            only contains targets for test sequences
        verbose: bool, optional
            print the logs
        """

        # convert sequences, targets and users to numpy arrays
        sequences_np = train.sequences.sequences
        targets_np = train.sequences.targets
        users_np = train.sequences.user_ids.reshape(-1, 1)

        L, T = train.sequences.L, train.sequences.T

        n_train = sequences_np.shape[0]

        print('total training instances: %d' % n_train)

        if not self._initialized:
            self._initialize(train)

        start_epoch = 1
        if self.checkpoint:
            print("loading checkpoint from %s" % self.checkpoint)
            checkpoint = torch.load(self.checkpoint)
            start_epoch = checkpoint['epoch_num']
            self._net.load_state_dict(checkpoint['state_dict'])
            self._optimizer.load_state_dict(checkpoint['optimizer'])
            print("loaded checkpoint %s (epoch %d)" %
                  (self.checkpoint, start_epoch))

        # compute number of parameters
        print("Number of params: %d" % compute_model_size(self._net))

        for epoch_num in range(start_epoch, self._n_iter + 1):

            t1 = time()

            # set model to training mode and move it to the corresponding device
            self._net.train()
            self._net = self._net.to(self._device)

            users_np, sequences_np, targets_np = shuffle(
                users_np, sequences_np, targets_np)

            negatives_np = self._generate_negative_samples(users_np,
                                                           train,
                                                           n=self._neg_samples)

            # convert numpy arrays to PyTorch tensors and move them to the corresponding device
            users, sequences, targets, negatives = (
                torch.from_numpy(users_np).long(),
                torch.from_numpy(sequences_np).long(),
                torch.from_numpy(targets_np).long(),
                torch.from_numpy(negatives_np).long())

            users, sequences, targets, negatives = (users.to(self._device),
                                                    sequences.to(self._device),
                                                    targets.to(self._device),
                                                    negatives.to(self._device))

            epoch_loss = 0.0

            for (minibatch_num, (batch_users, batch_sequences, batch_targets,
                                 batch_negatives)) in enumerate(
                                     minibatch(users,
                                               sequences,
                                               targets,
                                               negatives,
                                               batch_size=self._batch_size)):
                # concatenate all variables to get predictions in one run
                items_to_predict = torch.cat((batch_targets, batch_negatives),
                                             1)
                items_prediction = self._net(batch_sequences, batch_users,
                                             items_to_predict)

                (targets_prediction, negatives_prediction) = torch.split(
                    items_prediction,
                    [batch_targets.size(1),
                     batch_negatives.size(1)],
                    dim=1)

                self._optimizer.zero_grad()
                # compute the binary cross-entropy loss
                loss = sigmoid_log_loss(targets_prediction,
                                        negatives_prediction)

                epoch_loss += loss.item()

                loss.backward()
                self._optimizer.step()

            epoch_loss /= minibatch_num + 1

            t2 = time()
            if verbose and epoch_num % 10 == 0:
                precision, recall, ndcg, mean_aps = evaluate_ranking(
                    self, test, train, k=[3, 5, 10])
                str_precs = "precisions=%.4f,%.4f,%.4f" % tuple(
                    [np.mean(a) for a in precision])
                str_recalls = "recalls=%.4f,%.4f,%.4f" % tuple(
                    [np.mean(a) for a in recall])
                str_ndcgs = "ndcgs=%.4f,%.4f,%.4f" % tuple(
                    [np.mean(a) for a in ndcg])

                output_str = "Epoch %d [%.1f s]\tloss=%.4f, " \
                             "map=%.4f, %s, %s, %s[%.1f s]" % (epoch_num, t2 - t1,
                                                               epoch_loss,
                                                               mean_aps, str_precs, str_recalls, str_ndcgs,
                                                               time() - t2)
                print(output_str)
            else:
                output_str = "Epoch %d [%.1f s]\tloss=%.4f [%.1f s]" % (
                    epoch_num, t2 - t1, epoch_loss, time() - t2)
                print(output_str)

        _save_checkpoint(
            {
                'epoch_num': epoch_num,
                'state_dict': self._net.state_dict(),
                'optimizer': self._optimizer.state_dict(),
            }, 'checkpoints/gowalla-caser-dim=%d.pth.tar' % self.model_args.d)

    def _generate_negative_samples(self, users, interactions, n):
        """
        Sample negative from a candidate set of each user. The
        candidate set of each user is defined by:
        {All Items} \ {Items Rated by User}

        Parameters
        ----------

        users: array of np.int64
            users associated with each sequence
        interactions: :class:`interactions.Interactions`
            training instances, used to generate candidates
        n: int
            total number of negatives to sample for each sequence
        """

        users_ = users.squeeze()
        negative_samples = np.zeros((users_.shape[0], n), np.int64)
        if not self._candidate:
            all_items = np.arange(1, interactions.num_items)  # item 0 is reserved for padding
            train = interactions.tocsr()
            for user, row in enumerate(train):
                self._candidate[user] = list(set(all_items) - set(row.indices))

        for i, u in enumerate(users_):
            for j in range(n):
                x = self._candidate[u]
                negative_samples[i, j] = x[np.random.randint(len(x))]

        return negative_samples

    def predict(self, user_id, item_ids=None, model=None):
        """
        Make predictions for evaluation: given a user id, it will
        first retrieve the test sequence associated with that user
        and compute the recommendation scores for items.

        Parameters
        ----------

        user_id: int
            user id for which prediction scores are needed.
        item_ids: array, optional
            Array containing the item ids for which prediction scores
            are desired. If not supplied, predictions for all items
            will be computed.
        model: optional
            Model used to compute the prediction scores. If not supplied,
            the trained model (self._net) is used.
        """

        if self.test_sequence is None:
            raise ValueError('Missing test sequences, cannot make predictions')
        if model is None:
            model = self._net

        # set model to evaluation mode
        model.eval()
        with torch.no_grad():
            sequences_np = self.test_sequence.sequences[user_id, :]
            sequences_np = np.atleast_2d(sequences_np)

            if item_ids is None:
                item_ids = np.arange(self._num_items).reshape(-1, 1)

            sequences = torch.from_numpy(
                sequences_np.astype(np.int64).reshape(1, -1))
            item_ids = torch.from_numpy(item_ids.astype(np.int64))
            user_id = torch.from_numpy(np.array([[user_id]]).astype(np.int64))

            user, sequences, items = (user_id.to(self._device),
                                      sequences.to(self._device),
                                      item_ids.to(self._device))

            out = model(sequences, user, items, for_pred=True)

            return cpu(out.data).numpy().flatten()
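
The training loop above calls sigmoid_log_loss, which is not part of this listing. A minimal sketch of what such a helper could look like, assuming the usual binary cross-entropy over the observed targets and the sampled negatives (the epsilon constant and the mean reduction are assumptions, not necessarily the repository's implementation):

import torch

def sigmoid_log_loss(positive_predictions, negative_predictions):
    # -log(sigmoid(score)) for observed items, -log(1 - sigmoid(score)) for sampled negatives
    positive_loss = -torch.mean(
        torch.log(torch.sigmoid(positive_predictions) + 1e-8))
    negative_loss = -torch.mean(
        torch.log(1.0 - torch.sigmoid(negative_predictions) + 1e-8))
    return positive_loss + negative_loss
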
Example #2
class DistilledRecommender(Recommender):
    """
    Contains the attributes and methods needed to train a sequential
    recommendation model with ranking distillation [1]. Models are trained
    on many tuples of (users, sequences, targets, negatives), where the
    negatives come from negative sampling: for any known tuple of
    (user, sequence, targets), one or more items are randomly sampled to
    act as negatives.

    [1] Ranking Distillation: Learning Compact Ranking Models With High
        Performance for Recommender System, Jiaxi Tang and Ke Wang, KDD '18

    Parameters
    ----------

    n_iter: int,
        Number of iterations to run.
    batch_size: int,
        Minibatch size.
    l2: float,
        L2 loss penalty, also known as the 'lambda' of l2 regularization.
    neg_samples: int,
        Number of negative samples to generate for each target.
    learning_rate: float,
        Initial learning rate.
    use_cuda: boolean,
        Whether to run the model on a GPU.
    teacher_model_path: string,
        Path to teacher's model checkpoint.
    teacher_topk_path: string,
        Path to teacher's top-K ranking cache for each training instance.
    lamda: float
        Hyperparameter for tuning the sharpness of position importance weight.
    mu: float
        Hyperparameter for tuning the sharpness of ranking discrepancy weight.
    num_dynamic_samples: int
        Number of samples used for estimating student's rank.
    dynamic_start_epoch: int
        Epoch at which to start using the hybrid of the two weights.
    K: int
        Length of teacher's exemplary ranking.
    teach_alpha: float,
        Weight for balancing ranking loss and distillation loss.
    student_model_args: args,
        Student model related arguments, like latent dimensions.
    teacher_model_args: args,
        Teacher model related arguments, like latent dimensions.
    """
    def __init__(self,
                 n_iter=None,
                 batch_size=None,
                 l2=None,
                 neg_samples=None,
                 learning_rate=None,
                 use_cuda=False,
                 teacher_model_path=None,
                 teacher_topk_path=None,
                 lamda=None,
                 mu=None,
                 num_dynamic_samples=None,
                 dynamic_start_epoch=None,
                 K=None,
                 teach_alpha=None,
                 student_model_args=None,
                 teacher_model_args=None):

        # data related
        self.L = None
        self.T = None

        # model related
        self._num_items = None
        self._num_users = None
        self._teacher_net = None  # teacher model
        self._student_net = None  # student model
        self._student_model_args = student_model_args
        self._teacher_model_args = teacher_model_args

        # learning related
        self._batch_size = batch_size
        self._n_iter = n_iter
        self._learning_rate = learning_rate
        self._l2 = l2
        self._neg_samples = neg_samples
        self._device = torch.device("cuda" if use_cuda else "cpu")

        # ranking distillation related
        self._teach_alpha = teach_alpha
        self._lambda = lamda
        self._mu = mu
        self._num_dynamic_samples = num_dynamic_samples
        self._dynamic_start_epoch = dynamic_start_epoch
        self._K = K
        self._teacher_model_path = teacher_model_path
        self._teacher_topk_path = teacher_topk_path
        self._weight_renormalize = False

        # rank evaluation related
        self.test_sequence = None
        self._candidate = dict()

    @property
    def _teacher_initialized(self):
        return self._teacher_net is not None

    def _initialize_teacher(self, interactions):
        # initialize teacher model
        self._num_items = interactions.num_items
        self._num_users = interactions.num_users

        self._teacher_net = Caser(self._num_users, self._num_items,
                                  self._teacher_model_args)
        # load teacher model
        if os.path.isfile(self._teacher_model_path):
            output_str = ("loading teacher model from %s" %
                          self._teacher_model_path)
            print(output_str)

            checkpoint = torch.load(self._teacher_model_path)
            self._teacher_net.load_state_dict(checkpoint['state_dict'])
            output_str = "loaded model %s (epoch %d)" % (
                self._teacher_model_path, checkpoint['epoch_num'])
            print(output_str)
        else:
            output_str = "no model found at %s" % self._teacher_model_path
            print(output_str)

        # set teacher model to evaluation mode
        self._teacher_net.eval()

    @property
    def _student_initialized(self):
        return self._student_net is not None

    def _initialize_student(self, interactions):
        self._num_items = interactions.num_items
        self._num_users = interactions.num_users

        self.test_sequence = interactions.test_sequences

        self._student_net = Caser(self._num_users, self._num_items,
                                  self._student_model_args)

        self._optimizer = optim.Adam(self._student_net.parameters(),
                                     weight_decay=self._l2,
                                     lr=self._learning_rate)

    def fit(self, train, test, verbose=False):
        """
        The general training loop to fit the model

        Parameters
        ----------

        train: :class:`interactions.Interactions`
            training instances, also contains test sequences
        test: :class:`interactions.Interactions`
            only contains targets for test sequences
        verbose: bool, optional
            print the logs
        """

        # convert sequences, targets and users to numpy arrays
        sequences_np = train.sequences.sequences
        targets_np = train.sequences.targets
        users_np = train.sequences.user_ids.reshape(-1, 1)

        self.L, self.T = train.sequences.L, train.sequences.T

        n_train = sequences_np.shape[0]

        output_str = 'total training instances: %d' % n_train
        print(output_str)

        if not self._teacher_initialized:
            self._initialize_teacher(train)
        if not self._student_initialized:
            self._initialize_student(train)

        # here we pre-compute the teacher's top-K ranking for each training instance for faster training,
        # though the top-K ranking would have to be computed on the fly if it were too large to keep in memory
        if os.path.isfile(self._teacher_topk_path):
            print('found teacher topk file, loading..')
            teacher_ranking = np.load(self._teacher_topk_path)
        else:
            print('teacher topk file not found, generating.. ')
            teacher_ranking = self._get_teacher_topk(sequences_np,
                                                     users_np,
                                                     targets_np,
                                                     k=self._K)

        # initialize static weight (position importance weight)
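        # weight for the r-th position (r = 1..K): w_r = exp(-r / lambda) / sum_r' exp(-r' / lambda),
        # so earlier positions in the teacher's ranking receive larger weight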
        weight_static = np.array(range(1, self._K + 1), dtype=np.float32)
        weight_static = np.exp(-weight_static / self._lambda)
        weight_static = weight_static / np.sum(weight_static)

        weight_static = torch.from_numpy(weight_static).to(self._device)
        weight_static = weight_static.unsqueeze(0)

        # initialize dynamic weight (ranking discrepancy weight)
        weight_dynamic = None

        # count number of parameters
        print("Number of params in teacher model: %d" %
              compute_model_size(self._teacher_net))
        print("Number of params in student model: %d" %
              compute_model_size(self._student_net))

        indices = np.arange(n_train)
        start_epoch = 1

        for epoch_num in range(start_epoch, self._n_iter + 1):

            t1 = time()
            # set teacher model to evaluation mode and move it to the corresponding device
            self._teacher_net.eval()
            self._teacher_net = self._teacher_net.to(self._device)
            # set student model to training mode and move it to the corresponding device
            self._student_net.train()
            self._student_net = self._student_net.to(self._device)

            (users_np, sequences_np,
             targets_np), shuffle_indices = shuffle(users_np,
                                                    sequences_np,
                                                    targets_np,
                                                    indices=True)

            # keep indices for retrieving the teacher's top-K ranking from the cache
            indices = indices[shuffle_indices]

            negatives_np = self._generate_negative_samples(users_np,
                                                           train,
                                                           n=self._neg_samples)

            dynamic_samples_np = self._generate_negative_samples(
                users_np, train, n=self._num_dynamic_samples)

            # convert numpy arrays to PyTorch tensors and move them to the corresponding device
            users, sequences, targets, negatives = (
                torch.from_numpy(users_np).long(),
                torch.from_numpy(sequences_np).long(),
                torch.from_numpy(targets_np).long(),
                torch.from_numpy(negatives_np).long())

            users, sequences, targets, negatives = (users.to(self._device),
                                                    sequences.to(self._device),
                                                    targets.to(self._device),
                                                    negatives.to(self._device))

            dynamic_samples = torch.from_numpy(dynamic_samples_np).long().to(
                self._device)

            epoch_loss = 0.0
            epoch_regular_loss = 0.0

            for (minibatch_num,
                 (batch_indices, batch_users, batch_sequences, batch_targets,
                  batch_negatives, batch_dynamics)) in enumerate(
                      minibatch(indices,
                                users,
                                sequences,
                                targets,
                                negatives,
                                dynamic_samples,
                                batch_size=self._batch_size)):

                # retrieve the teacher's top-K ranking given the batch indices
                batch_candidates = torch.from_numpy(
                    teacher_ranking[batch_indices, :]).long().to(self._device)
                # concatenate all variables to get predictions in one run
                items_to_predict = torch.cat(
                    (batch_targets, batch_negatives, batch_candidates,
                     batch_dynamics), 1)

                items_prediction = self._student_net(batch_sequences,
                                                     batch_users,
                                                     items_to_predict)

                (targets_prediction, negatives_prediction,
                 candidates_prediction, dynamics_prediction) = torch.split(
                     items_prediction,
                     [batch_targets.size(1),
                      batch_negatives.size(1),
                      batch_candidates.size(1),
                      batch_dynamics.size(1)],
                     dim=1)

                self._optimizer.zero_grad()

                if epoch_num > self._dynamic_start_epoch:
                    # compute dynamic weight
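                    # for each of the teacher's top-K items, estimate the student's rank of that
                    # item from its score relative to the dynamically sampled negatives, then set
                    # the weight to tanh(mu * (estimated_rank - position)), clamped at zero so
                    # positions the student already ranks high enough receive no weight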
                    dynamic_weights = list()
                    for col in range(self._K):
                        col_prediction = candidates_prediction[:, col].unsqueeze(1)

                        num_smaller_than = torch.sum(
                            col_prediction < dynamics_prediction,
                            dim=1).float()
                        relative_rank = num_smaller_than / self._num_dynamic_samples
                        predicted_rank = torch.floor(self._num_items *
                                                     relative_rank)

                        dynamic_weight = torch.tanh(self._mu *
                                                    (predicted_rank - col))
                        dynamic_weight = torch.clamp(dynamic_weight, min=0.0)

                        dynamic_weights.append(dynamic_weight)
                    weight_dynamic = torch.stack(dynamic_weights, 1)

                    # combine the two weights
                    weight = weight_dynamic * weight_static
                    if self._weight_renormalize:
                        weight = F.normalize(weight, p=1, dim=1)
                else:
                    weight = weight_static

                # detach the weight to stop the gradient flow to the weight
                weight = weight.detach()

                loss, regular_loss = weighted_sigmoid_log_loss(
                    targets_prediction, negatives_prediction,
                    candidates_prediction, weight, self._teach_alpha)

                epoch_loss += loss.item()
                epoch_regular_loss += regular_loss.item()

                loss.backward()

                # assert False
                self._optimizer.step()

            epoch_loss /= minibatch_num + 1
            epoch_regular_loss /= minibatch_num + 1

            t2 = time()

            if verbose and epoch_num % 10 == 0:
                precision, recall, ndcg, mean_aps = evaluate_ranking(
                    self, test, train, k=[3, 5, 10])

                str_precs = "precisions=%.4f,%.4f,%.4f" % tuple(
                    [np.mean(a) for a in precision])
                str_recalls = "recalls=%.4f,%.4f,%.4f" % tuple(
                    [np.mean(a) for a in recall])
                str_ndcgs = "ndcgs=%.4f,%.4f,%.4f" % tuple(
                    [np.mean(a) for a in ndcg])

                output_str = "Epoch %d [%.1f s]\tloss=%.4f, regular_loss=%.4f, " \
                             "map=%.4f, %s, %s, %s[%.1f s]" % (epoch_num, t2 - t1,
                                                               epoch_loss, epoch_regular_loss,
                                                               mean_aps, str_precs, str_recalls, str_ndcgs,
                                                               time() - t2)
                print(output_str)
            else:
                output_str = "Epoch %d [%.1f s]\tloss=%.4f, regular_loss=%.4f[%.1f s]" % (
                    epoch_num, t2 - t1, epoch_loss, epoch_regular_loss,
                    time() - t2)
                print(output_str)

    def _get_teacher_topk(self, sequences, users, targets, k):
        """
        Pre-compute and cache teacher's top-K ranking for each training instance.
        By doing this we can make training with distillation much faster.

        Parameters
        ----------

        sequences: array of np.int64
            sequences of items
        users: array of np.int64
            users associated with each sequence
        targets: array of np.int64
            target items the user interacted with given the sequence
        k: int
            length of teacher's exemplary ranking
        """
        with_targets = False

        n_train = sequences.shape[0]
        indices = np.arange(n_train)

        users, sequences = torch.from_numpy(users).long(), torch.from_numpy(
            sequences).long()

        # teacher topk results
        teacher_topk = np.zeros((n_train, k), dtype=np.int64)

        for (batch_indices, batch_users, batch_sequences,
             batch_targets) in minibatch(indices,
                                         users,
                                         sequences,
                                         targets,
                                         batch_size=16):

            cur_batch_size = batch_users.shape[0]
            all_items = torch.arange(start=0, end=self._num_items).repeat(
                cur_batch_size, 1).long()

            teacher_prediction = self._teacher_net(batch_sequences,
                                                   batch_users,
                                                   all_items).detach()

            _, tops = teacher_prediction.topk(
                k * 2, dim=1)  # take the 2k highest-scoring items in each row
            tops = tops.cpu().numpy()

            new_tops = np.concatenate((batch_targets, tops), axis=1)
            topks = np.zeros((cur_batch_size, k), dtype=np.int64)
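            # np.unique(..., return_index=True) returns the index of the first occurrence of each
            # item; sorting those indices recovers the row's items in order of first appearance
            # with duplicates removed. Because the targets were prepended to each row, slicing
            # from self.T skips them when with_targets is False.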

            for i, row in enumerate(new_tops):
                _, idx = np.unique(row, return_index=True)
                # whether the teacher's top-k ranking should include the target items
                if with_targets:
                    topk = row[np.sort(idx)][:k]
                else:
                    topk = row[np.sort(idx)][self.T:k + self.T]
                topks[i, :] = topk
            teacher_topk[batch_indices, :] = topks
        np.save(
            'gowalla-teacher-dim=%d-top=%d.npy' %
            (self._teacher_model_args.d, k), teacher_topk)
        return teacher_topk

    def predict(self, user_id, item_ids=None, model=None):
        return super(DistilledRecommender,
                     self).predict(user_id, item_ids, model=self._student_net)
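
The training loop in fit calls weighted_sigmoid_log_loss, which is also not part of this listing. A minimal sketch of what such a helper could look like, assuming the ranking-distillation objective described in the class docstring: the regular ranking loss on targets and negatives plus teach_alpha times a weighted log-loss that pushes the student to score the teacher's top-K items highly. The epsilon constant and the exact reductions are assumptions, not necessarily the repository's implementation.

import torch

def weighted_sigmoid_log_loss(targets_prediction, negatives_prediction,
                              candidates_prediction, weight, teach_alpha):
    # regular ranking loss on observed targets vs. sampled negatives
    regular_loss = (
        -torch.mean(torch.log(torch.sigmoid(targets_prediction) + 1e-8)) -
        torch.mean(torch.log(1.0 - torch.sigmoid(negatives_prediction) + 1e-8)))
    # distillation loss: encourage high scores on the teacher's top-K items,
    # weighting position r by the (hybrid) weight computed in fit()
    distillation_loss = -torch.mean(
        torch.sum(weight *
                  torch.log(torch.sigmoid(candidates_prediction) + 1e-8),
                  dim=1))
    loss = regular_loss + teach_alpha * distillation_loss
    return loss, regular_loss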