Example #1
    def _init_model(self):
        if self._model is not None:
            if isinstance(self._model, str):
                model_dict = self._load
                sparse = model_dict["sparse"]
                n_factors = model_dict["n_factors"]
                n_items = model_dict["n_items"]
                n_users = model_dict["n_users"]
                self._model = SimpleMatrixFactorizationModel(
                    n_users,
                    n_items,
                    n_factors,
                    sparse)
                self._model.load_state_dict(model_dict['state_dict'])
            elif not (issubclass(type(self._model), torch.nn.Module) or
                      isinstance(self._model, torch.nn.DataParallel)):
                raise ValueError("Model must be an instance "
                                 "of torch.nn.Module")
        else:
            self._model = SimpleMatrixFactorizationModel(self.n_users,
                                                         self.n_items,
                                                         self.n_factors,
                                                         self._sparse)
        if not isinstance(self._model, torch.nn.DataParallel):
            if self.use_cuda and torch.cuda.device_count() > 1 and \
                    self._device_id is None:
                self._model = torch.nn.DataParallel(gpu(self._model,
                                                        self.use_cuda))
            else:
                self._model = gpu(self._model,
                                  self.use_cuda,
                                  self._device_id)
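Every example below moves tensors and modules around with gpu()/cpu() helpers. A minimal sketch of what such helpers can look like, assuming the gpu(obj, use_cuda, device_id) signature used above (an illustrative reimplementation, not divmachines' actual code):

    import torch

    def gpu(obj, use_cuda=False, device_id=None):
        # Move a tensor or module onto the GPU only when CUDA is requested;
        # device_id selects a specific card, None means the current one.
        if use_cuda:
            return obj.cuda(device_id) if device_id is not None else obj.cuda()
        return obj

    def cpu(obj, use_cuda=False):
        # Bring data back to host memory, e.g. before .numpy() conversion.
        return obj.cpu() if use_cuda else obj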
Example #2
    def _estimate_variance(self, x):
        train = index_dataset(x, self._user_index, self._item_index)
        x = self._model.x
        y = self._model.y

        self._var = np.zeros((self.n_users, self._n_factors), dtype=np.float32)

        for i, (u, g) in tqdm(enumerate(pd.DataFrame(train).groupby(0)),
                              desc="Var. Estimate",
                              leave=False,
                              disable=not self._verbose):
            user_profile = g.values[:, 1].astype(np.int64)
            upl = user_profile.shape[0]
            user_idx = Variable(
                gpu(torch.from_numpy(np.array([u])), self._use_cuda,
                    self._device_id))
            item_idx = Variable(
                gpu(torch.from_numpy(user_profile), self._use_cuda,
                    self._device_id))
            diff = x(user_idx) - y(item_idx)

            prod = torch.pow(diff, 2).sum(0)

            var = torch.div(prod, upl)
            self._var[i, :] = var.cpu().data.numpy()
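The loop above stores, for each user, the mean squared elementwise gap between the user's factors and the factors of the items in the user's profile. A small numpy sketch of the same quantity, with illustrative shapes:

    import numpy as np

    n_factors, profile_len = 10, 5
    x_u = np.random.rand(n_factors).astype(np.float32)             # user factors
    y_items = np.random.rand(profile_len, n_factors).astype(np.float32)

    # Matches torch.pow(diff, 2).sum(0) / upl in _estimate_variance:
    var_u = ((x_u - y_items) ** 2).sum(axis=0) / profile_len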
Example #3
    def _compute_delta_f(self, x, y, k, b, var, rank, users):
        # Initialize Variables
        # and other coefficients
        u_idx = Variable(
            gpu(torch.from_numpy(users), self._use_cuda, self._device_id))
        i_idx = Variable(
            gpu(torch.from_numpy(rank), self._use_cuda, self._device_id))

        wk = 1 / (2**k)
        wm = Variable(gpu(torch.from_numpy(
            np.array([1 / (2 ** m) for m in range(k)],
                     dtype=np.float32)),
            self._use_cuda,
            self._device_id)) \
            .unsqueeze(1).expand(k, self._n_factors)

        i_ranked = (y(i_idx[:, :k]) * wm).transpose(0, 1).unsqueeze(0)
        i_unranked = y(i_idx[:, k:]).transpose(0, 1)

        term0 = self._first_term(i_unranked, u_idx, x)

        term1 = self._second_term(b, i_unranked, u_idx, var, wk)

        term2 = self._third_term(b, k, u_idx, var, i_ranked, i_unranked,
                                 len(users), rank.shape[1])

        delta_f = torch.mul(term0 - term1 - term2, wk) \
            .sum(2).transpose(0, 1)

        return delta_f.cpu().data.numpy()
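The wk and wm coefficients appear to be geometric position discounts: the item already ranked at position m contributes with weight 1/2^m, and the candidate for position k is weighted by 1/2^k. For example, with k=4:

    import numpy as np

    k = 4
    wm = np.array([1 / (2 ** m) for m in range(k)], dtype=np.float32)
    # -> [1.0, 0.5, 0.25, 0.125], one weight per already-ranked item
    wk = 1 / (2 ** k)
    # -> 0.0625, the weight of the position currently being filled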
Example #4
    def _init_model(self):
        if self._model is not None:
            if isinstance(self._model, str):
                # load the saved dict (tensors mapped to CPU)
                model_dict = self._load
                n_features = model_dict["n_features"]
                n_factors = model_dict["n_factors"]
                self._model = FactorizationMachine(n_features,
                                                   n_factors=n_factors)
                # strip the 'module.' prefix that torch.nn.DataParallel
                # adds to state_dict keys
                dic = {}
                for m in model_dict['state_dict']:
                    if m.startswith('module.'):
                        dic[m[7:]] = model_dict['state_dict'][m]
                    else:
                        dic[m] = model_dict['state_dict'][m]
                self._model.load_state_dict(dic)
            elif not (isinstance(self._model, FactorizationMachine) or
                      isinstance(self._model, torch.nn.DataParallel)):
                raise ValueError("Model must be an instance "
                                 "of FactorizationMachine")
        else:
            self._model = FactorizationMachine(self.n_features,
                                               n_factors=self.n_factors)
        if not isinstance(self._model, torch.nn.DataParallel):
            if self.use_cuda and torch.cuda.device_count() > 1 and \
                    self._device_id is None:
                self._model = torch.nn.DataParallel(gpu(self._model,
                                                        self.use_cuda))
            else:
                self._model = gpu(self._model,
                                  self.use_cuda,
                                  self._device_id)
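The copy loop over model_dict['state_dict'] is needed because torch.nn.DataParallel registers the wrapped network under the attribute module, so every key in its state_dict carries a 'module.' prefix. A self-contained sketch of the round trip:

    import torch

    net = torch.nn.Linear(4, 2)
    wrapped = torch.nn.DataParallel(net)

    state = wrapped.state_dict()          # keys: 'module.weight', 'module.bias'
    clean = {k[7:] if k.startswith('module.') else k: v
             for k, v in state.items()}
    net.load_state_dict(clean)            # fits the unwrapped model again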
Example #5
    def _init_model(self):

        if self._model is not None:
            if isinstance(self._model, FactorizationMachine):
                self._model = gpu(self._model, self._use_cuda)
            else:
                raise ValueError("Model must be an instance of FactorizationMachine")

        else:
            self._model = gpu(FactorizationMachine(self._n_features,
                                                   self._n_factors),
                              self._use_cuda)
Example #6
    def fit(self, x, y, dic=None):
        """
        Fit the model.
        When called repeatedly, model fitting will resume from
        the point at which training stopped in the previous fit
        call.

        Parameters
        ----------
        x: ndarray
            Training samples
        y: ndarray
            Target values for samples
        dic: dict, optional
            dic indicates the columns to vectorize
            if training samples are in raw format.
        """

        if not self._initialized:
            self._initialize(x, y=y, dic=dic)

        loader = DataLoader(self._dataset,
                            batch_size=self._batch_size,
                            num_workers=self._n_jobs,
                            shuffle=True)

        for epoch in range(self._n_iter):
            for mini_batch_num, (batch_tensor, batch_ratings) in enumerate(loader):
                batch_tensor = gpu(batch_tensor, self._use_cuda)
                batch_ratings = gpu(batch_ratings, self._use_cuda)

                observations_var = Variable(batch_tensor)
                rating_var = Variable(batch_ratings)

                # forward step
                predictions = self._model(observations_var)

                # Zeroing Embeddings' gradients
                self._optimizer.zero_grad()

                # Compute Loss
                loss = self._loss_func(predictions, rating_var)

                # logging
                self._logger.log(loss, epoch, batch=mini_batch_num)

                # backward step
                loss.backward()
                # optimization step
                self._optimizer.step()
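The loop body is the standard PyTorch training cycle. A minimal self-contained sketch of the same forward / zero_grad / loss / backward / step sequence with a toy model:

    import torch
    from torch.autograd import Variable

    model = torch.nn.Linear(8, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    loss_func = torch.nn.MSELoss()

    batch = Variable(torch.rand(32, 8))
    target = Variable(torch.rand(32, 1))

    predictions = model(batch)       # forward step
    optimizer.zero_grad()            # clear stale gradients
    loss = loss_func(predictions, target)
    loss.backward()                  # backward step
    optimizer.step()                 # parameter update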
Example #7
    def predict(self, x, **kwargs):
        """
        Make predictions: given a user id, compute the recommendation
        scores for items.
        Parameters
        ----------
        x: ndarray or :class:`divmachines.fm.dataset`
            samples for which predict the ratings/rank score
        Returns
        -------
        predictions: np.array
            Predicted scores for each sample in x
        """

        self._model.train(False)
        if len(x.shape) == 1:
            x = np.array([x])

        self._init_dataset(x)

        loader = DataLoader(self._dataset,
                            batch_size=len(x),
                            shuffle=False,
                            num_workers=self._n_jobs)
        out = None
        # batch_size equals len(x), so the loader yields a single batch
        for samples in loader:
            var = Variable(gpu(samples, self._use_cuda))
            out = self._model(var)

        return cpu(out.data).numpy().flatten()
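Since batch_size is set to len(x), the DataLoader yields exactly one batch, so out is assigned once. A toy illustration of that single-batch behavior:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    data = TensorDataset(torch.rand(5, 3), torch.rand(5))
    loader = DataLoader(data, batch_size=len(data), shuffle=False)

    batches = [b for b, _ in loader]
    assert len(batches) == 1 and batches[0].shape[0] == 5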
Example #8
    def _sparse_correlation(self, v, k, x, n_users, n_items, rank):

        corr = np.zeros((n_users, k, n_items - k), dtype=np.float32)

        ranking = None
        for u in tqdm(range(n_users),
                      desc="User Correlation",
                      leave=False,
                      disable=not self._verbose):
            prod_numpy = np.zeros((n_items, self._n_factors), dtype=np.float32)
            prod = gpu(torch.from_numpy(prod_numpy), self._use_cuda,
                       self._device_id)

            if ranking is not None:
                # reuse the dataset and reposition it on user u
                ranking(u)
            else:
                ranking = Rank(x, rank, u, n_items, self.n_users)

            dataloader = DataLoader(ranking,
                                    pin_memory=self._pin_memory,
                                    batch_size=self._batch_size,
                                    num_workers=self._n_jobs)

            for batch, i in tqdm(dataloader,
                                 disable=not self._verbose,
                                 desc="Rank",
                                 leave=False):
                batch = gpu(batch, self._use_cuda, self._device_id)
                i = gpu(i, self._use_cuda, self._device_id)
                batch_size = list(batch.shape)[0]
                prod[i, :] = (batch.squeeze().unsqueeze(-1).expand(
                    batch_size, self.n_features, self._n_factors) * v).sum(1)

            unranked = prod[k:, :]
            ranked = prod[:k, :]

            e_corr = (
                unranked.unsqueeze(0).expand(k, n_items - k, self._n_factors) *
                ranked.unsqueeze(1).expand(k, n_items - k,
                                           self._n_factors)).sum(2)
            corr[u, :, :] = e_corr.cpu().numpy()

        return corr
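The expand-multiply-sum that fills e_corr is an outer table of dot products between ranked and unranked factor rows, i.e. a plain matrix product. A numpy sketch with illustrative shapes:

    import numpy as np

    k, n_items, n_factors = 3, 8, 4
    prod = np.random.rand(n_items, n_factors).astype(np.float32)

    ranked, unranked = prod[:k], prod[k:]
    # e_corr[m, j] = <ranked[m], unranked[j]>, shape (k, n_items - k)
    e_corr = ranked @ unranked.T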
Example #9
    def _compute_delta_f(self, x, y, k, b, rank, users):
        # Initialize Variables
        # and other coefficients
        u_idx = Variable(
            gpu(torch.from_numpy(users), self._use_cuda, self._device_id))
        i_idx = Variable(
            gpu(torch.from_numpy(rank), self._use_cuda, self._device_id))

        n_users = rank.shape[0]
        n_items = rank.shape[1]

        wk = 1 / (2**k)
        wm = Variable(gpu(torch.from_numpy(
            np.array([1 / (2 ** m) for m in range(k)],
                     dtype=np.float32)), self._use_cuda,
            self._device_id)) \
            .unsqueeze(1).expand(k, self._n_factors)

        i_ranked = (y(i_idx[:, :k]) * wm).unsqueeze(2)
        i_unranked = y(i_idx[:, k:]).transpose(0, 1)

        users_batch = x(u_idx)
        term0 = (users_batch * i_unranked).sum(2).transpose(0, 1)

        # This block of code computes the third term of the DeltaF
        e_ranked = i_ranked.expand(n_users, k, n_items - k, self._n_factors)
        e_unranked = i_unranked \
            .transpose(0, 1) \
            .unsqueeze(1).expand(n_users,
                                 k,
                                 n_items - k,
                                 self._n_factors)

        term2 = torch.mul((e_ranked * e_unranked).sum(3).sum(1), 2 * b)

        delta_f = torch.mul(term0 - term2, wk)
        return delta_f.cpu().data.numpy()
Example #10
    def predict(self, x, **kwargs):
        """
        Make predictions: given a user id, compute the recommendation
        scores for items.
        Parameters
        ----------
        x: ndarray or :class:`divmachines.fm.dataset`
            samples for which predict the ratings/rank score
        Returns
        -------
        predictions: np.array
            Predicted scores for each sample in x
        """
        if len(x.shape) == 1:
            x = np.array([x])

        self.init_predict(x)

        disable_batch = self._disable or self.batch_size is None
        if self.batch_size is None:
            self.batch_size = len(self._dataset)
        loader = DataLoader(self._dataset,
                            batch_size=self.batch_size,
                            shuffle=False,
                            num_workers=self._n_jobs)

        out = np.zeros(len(x))
        i = 0
        for batch_data in tqdm(loader,
                               desc="Prediction",
                               leave=False,
                               disable=disable_batch):
            var = Variable(gpu(batch_data,
                               self.use_cuda,
                               self._device_id))
            batch_size = batch_data.shape[0]
            out[i: i + batch_size] = \
                cpu(self._model(var), self.use_cuda).data.numpy()
            i += batch_size

        return out
Example #11
    def fit(self, x, y, dic=None, n_users=None, n_items=None):
        """
        Fit the model.
        When called repeatedly, model fitting will resume from
        the point at which training stopped in the previous fit
        call.

        Parameters
        ----------
        x: ndarray
            Training samples
        y: ndarray
            Target values for samples
        dic: dict, optional
            dic indicates the columns to make indexable.
        n_users: int, optional
            Total number of users. The model will have `n_users` rows.
            Default is None, `n_users` will be inferred from `x`.
        n_items: int, optional
            Total number of items. The model will have `n_items` columns.
            Default is None, `n_items` will be inferred from `x`.
        """

        self._initialize(x, y=y, dic=dic,
                         n_users=n_users,
                         n_items=n_items)

        disable_batch = self._disable or self.batch_size is None
        loader = DataLoader(self._dataset,
                            shuffle=True,
                            batch_size=self.batch_size,
                            num_workers=self._n_jobs)

        for epoch in tqdm(range(self.iter),
                          desc='Fitting',
                          leave=False,
                          disable=self._disable):
            mini_batch_num = 0
            acc_loss = 0.0
            if epoch > 0:
                self._dataset._initialize(x, y)
            for batch_users, batch_pos, batch_neg in tqdm(loader,
                                                          desc='Batches',
                                                          leave=False,
                                                          disable=disable_batch):
                batch_size = batch_users.shape[0]
                user_var = Variable(gpu(batch_users,
                                        self.use_cuda,
                                        self._device_id),
                                    requires_grad=False)
                pos_var = Variable(gpu(batch_pos,
                                       self.use_cuda,
                                       self._device_id),
                                   requires_grad=False)
                neg_var = Variable(gpu(batch_neg,
                                       self.use_cuda,
                                       self._device_id),
                                   requires_grad=False)

                # forward step
                pos = self._model(user_var, pos_var)
                neg = self._model(user_var, neg_var)

                # Zeroing Embeddings' gradients
                self._optimizer.zero_grad()

                # Compute Loss
                loss = self._loss_func(pos, neg)

                acc_loss += loss.data.cpu().numpy()[0] * batch_size

                self._logger.log(loss, epoch=epoch, batch=mini_batch_num, cpu=self.use_cuda)

                # backward step
                loss.backward()

                # optimization step
                self._optimizer.step()

                mini_batch_num += 1

            acc_loss /= len(self._dataset)

            self._update_no_improvement_count(acc_loss)

            if self._no_improvement_count > self._n_iter_no_change:
                if self._stopping:
                    warnings.warn("Stopping at epoch: %s with loss %s" % (epoch, acc_loss))
                    break
                else:
                    self._no_improvement_count = 0

        if self._early_stopping:
            self._prepare(dic)
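The pairwise call self._loss_func(pos, neg) suggests a BPR-style ranking loss. A minimal sketch of such a loss (an assumption about its definition, not the library's code):

    import torch.nn.functional as F

    def bpr_loss(pos, neg):
        # Push positive-item scores above negative-item scores;
        # -log sigmoid(pos - neg) is the classic BPR objective.
        return -F.logsigmoid(pos - neg).mean()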
Example #12
    def torch_variance(self):
        # wrap the precomputed variance table in an Embedding so it can
        # be indexed like the factor matrices
        var_v = torch.from_numpy(self._var)
        var = torch.nn.Embedding(var_v.size(0), var_v.size(1))
        var.weight = torch.nn.Parameter(var_v)
        var = gpu(var, self._use_cuda, self._device_id)
        return var
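Packing the table into an Embedding lets variance rows be gathered on the GPU with the same index tensors used for the factor matrices. A usage sketch with illustrative shapes:

    import numpy as np
    import torch

    var_np = np.random.rand(100, 10).astype(np.float32)   # (n_users, n_factors)
    var = torch.nn.Embedding(100, 10)
    var.weight = torch.nn.Parameter(torch.from_numpy(var_np))

    idx = torch.LongTensor([0, 3, 7])
    rows = var(idx)        # the three users' variance rows, shape (3, 10)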
Example #13
    def fit(self,
            x,
            y,
            dic=None,
            n_users=None,
            n_items=None,
            lengths=None):
        """
        Fit the model.
        When called repeatedly, model fitting will resume from
        the point at which training stopped in the previous fit
        call.

        Parameters
        ----------
        x: ndarray
            Training samples
        y: ndarray
            Target values for samples
        dic: dict, optional
            dic indicates the columns to vectorize
            if training samples are in raw format.
        n_users: int, optional
            Total number of users. The model will have `n_users` rows.
            Default is None, `n_users` will be inferred from `x`.
        n_items: int, optional
            Total number of items. The model will have `n_items` columns.
            Default is None, `n_items` will be inferred from `x`.
        lengths: dict, optional
            Dictionary of lengths of each feature in dic except for
            users and items.
        """

        self._initialize(x, y=y, dic=dic,
                         n_users=n_users,
                         n_items=n_items,
                         lengths=lengths)

        disable_batch = self._disable or self.batch_size is None

        loader = DataLoader(self._dataset,
                            pin_memory=self._pin_memory,
                            batch_size=self.batch_size,
                            num_workers=self._n_jobs,
                            shuffle=True)

        for epoch in tqdm(range(self.n_iter),
                          desc='Fitting',
                          leave=False,
                          disable=self._disable):
            mini_batch_num = 0
            acc_loss = 0.0
            for batch_tensor, batch_ratings in tqdm(loader,
                                                    desc='Batches',
                                                    leave=False,
                                                    disable=disable_batch):
                batch_size = batch_tensor.shape[0]
                batch_tensor = gpu(batch_tensor,
                                   self.use_cuda,
                                   self._device_id)
                batch_ratings = gpu(batch_ratings,
                                    self.use_cuda,
                                    self._device_id)

                observations_var = Variable(batch_tensor, requires_grad=False)
                rating_var = Variable(batch_ratings, requires_grad=False)

                # forward step
                predictions = self._model(observations_var)

                # Zeroing Embeddings' gradients
                self._optimizer.zero_grad()

                # Compute Loss
                loss = self._loss_func(predictions, rating_var)

                acc_loss += loss.data.cpu().numpy()[0] * batch_size

                # logging
                self._logger.log(loss, epoch, batch=mini_batch_num,
                                 cpu=self.use_cuda)

                # backward step
                loss.backward()

                # optimization step
                self._optimizer.step()

                mini_batch_num += 1

            acc_loss /= len(self._dataset)

            self._update_no_improvement_count(acc_loss)

            if self._no_improvement_count > self._n_iter_no_change:
                if self._stopping:
                    warnings.warn("Stopping at epoch: %s with loss %s" % (epoch, acc_loss))
                    break
                else:
                    self._no_improvement_count = 0

        if self._early_stopping:
            self._prepare(dic, n_users, n_items, lengths)
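Both fit loops depend on _update_no_improvement_count for early stopping. A sketch of the bookkeeping it plausibly performs, in the spirit of scikit-learn's n_iter_no_change logic (self._best_loss and self._tol are assumed attribute names):

    def _update_no_improvement_count(self, acc_loss):
        # Count consecutive epochs whose loss failed to improve on the
        # best loss seen so far (within a small tolerance).
        if acc_loss > self._best_loss - self._tol:
            self._no_improvement_count += 1
        else:
            self._no_improvement_count = 0
        if acc_loss < self._best_loss:
            self._best_loss = acc_loss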