Example #1
    def split(self, data):
        indices = np.arange(len(data.raw_ratings))

        if self.shuffle:
            get_rng(self.random_state).shuffle(indices)

        start, stop = 0, 0
        for fold_i in range(self.n_splits):
            start = stop
            stop += len(indices) // self.n_splits

            # distribute the len(indices) % n_splits leftover ratings over
            # the first folds, one extra rating each
            if fold_i < len(indices) % self.n_splits:
                stop += 1

            raw_trainset = [
                data.raw_ratings[i]
                for i in chain(indices[:start], indices[stop:])
            ]
            raw_testset = [data.raw_ratings[i] for i in indices[start:stop]]

            trainset = data.construct_trainset(raw_trainset)
            testset = data.construct_testset(raw_testset)

            yield trainset, testset
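If this splitter is surprise's KFold (which its structure matches), a minimal usage sketch looks like this; ml-100k is downloaded on first use:

from surprise import SVD, Dataset, accuracy
from surprise.model_selection import KFold

data = Dataset.load_builtin('ml-100k')
kf = KFold(n_splits=3, shuffle=True, random_state=0)

for trainset, testset in kf.split(data):
    algo = SVD(random_state=0)
    algo.fit(trainset)
    accuracy.rmse(algo.test(testset), verbose=True)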
Example #2
    def split(self, data):
        '''Generator function to iterate over trainsets and testsets.

        Args:
            data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
                ratings that will be divided into trainsets and testsets.

        Yields:
            tuple of (trainset, testset)
        '''

        test_size, train_size = self.validate_train_test_sizes(
            self.test_size, self.train_size, len(data.raw_ratings))
        rng = get_rng(self.random_state)

        for _ in range(self.n_splits):

            if self.shuffle:
                permutation = rng.permutation(len(data.raw_ratings))
            else:
                permutation = np.arange(len(data.raw_ratings))

            raw_testset = [
                data.raw_ratings[i] for i in permutation[:test_size]
            ]
            raw_trainset = [
                data.raw_ratings[i]
                for i in permutation[test_size:(test_size + train_size)]
            ]

            trainset = data.construct_trainset(raw_trainset)
            testset = data.construct_testset(raw_testset)

            yield trainset, testset
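This is the splitter behind surprise's train_test_split helper; a short usage sketch:

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
# one shuffled split with 25% of the ratings held out for testing
trainset, testset = train_test_split(data, test_size=.25, random_state=0)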
Example #3
    def split(self, small_data):
        """docstring
        """
        if (small_data.reader.rating_scale !=
                self.large_data.reader.rating_scale):
            raise ValueError('Rating scales of the large and small '
                             'datasets must match')

        if self.n_splits > len(small_data.raw_ratings) or self.n_splits < 2:
            raise ValueError('Incorrect value for n_splits={0}. '
                             'Must be >=2 and less than the number '
                             'of ratings in the small dataset.'.format(
                                 self.n_splits))

        # We use indices to avoid shuffling the original raw_ratings lists.
        small_indices = np.arange(len(small_data.raw_ratings))
        large_indices = np.arange(len(self.large_data.raw_ratings))

        if self.shuffle:
            # a single RNG, so the two shuffles are independent
            rng = get_rng(self.random_state)
            rng.shuffle(small_indices)
            rng.shuffle(large_indices)

        large_raw_ratings = [
            self.large_data.raw_ratings[i] for i in large_indices
        ]

        start, stop = 0, 0
        for fold_i in range(self.n_splits):
            start = stop
            stop += len(small_indices) // self.n_splits
            if fold_i < len(small_indices) % self.n_splits:
                stop += 1

            raw_testset = [
                small_data.raw_ratings[i]
                for i in chain(small_indices[:start], small_indices[stop:])
            ]
            raw_trainset = [
                small_data.raw_ratings[i] for i in small_indices[start:stop]
            ]
            raw_trainset += large_raw_ratings

            trainset = small_data.construct_trainset(raw_trainset)
            testset = small_data.construct_testset(raw_testset)

            yield trainset, testset
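This custom splitter insists that both datasets share a rating scale. A sketch of building two compatible datasets with surprise's Reader (the DataFrame contents are illustrative):

import pandas as pd
from surprise import Dataset, Reader

reader = Reader(rating_scale=(1, 5))  # must be identical for both datasets

small_df = pd.DataFrame({'userID': [1, 1, 2], 'itemID': [10, 11, 10],
                         'rating': [3.0, 4.0, 5.0]})
large_df = pd.DataFrame({'userID': [8, 8, 9], 'itemID': [10, 11, 11],
                         'rating': [2.0, 1.0, 4.0]})

small_data = Dataset.load_from_df(small_df, reader)
large_data = Dataset.load_from_df(large_df, reader)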
Example #4
    def split(self, data):
        '''Generator function to iterate over trainsets and testsets.

        Args:
            data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
                ratings that will be divided into trainsets and testsets.

        Yields:
            tuple of (trainset, testset)
        '''

        if self.n_splits > len(data.raw_ratings) or self.n_splits < 2:
            raise ValueError('Incorrect value for n_splits={0}. '
                             'Must be >=2 and less than the number '
                             'of ratings'.format(self.n_splits))

        # We use indices to avoid shuffling the original data.raw_ratings list.
        indices = np.arange(len(data.raw_ratings))

        if self.shuffle:
            get_rng(self.random_state).shuffle(indices)

        start, stop = 0, 0
        for fold_i in range(self.n_splits):
            start = stop
            stop += len(indices) // self.n_splits
            if fold_i < len(indices) % self.n_splits:
                stop += 1

            raw_trainset = [
                data.raw_ratings[i]
                for i in chain(indices[:start], indices[stop:])
            ]
            raw_testset = [data.raw_ratings[i] for i in indices[start:stop]]

            trainset = data.construct_trainset(raw_trainset)
            testset = data.construct_testset(raw_testset)

            yield trainset, testset
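The start/stop bookkeeping hands the len(indices) % n_splits leftover ratings to the first folds, one each. A quick worked check of that arithmetic in isolation:

# 10 ratings split into 3 folds: sizes come out as [4, 3, 3]
n, k = 10, 3
sizes = [n // k + (1 if i < n % k else 0) for i in range(k)]
assert sizes == [4, 3, 3] and sum(sizes) == n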
Example #5
    def split(self, data):
        '''Generator function to iterate over trainsets and testsets.

        Args:
            data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
                ratings that will be divided into trainsets and testsets.

        Yields:
            tuple of (trainset, testset)
        '''

        rng = get_rng(self.random_state)

        for _ in range(self.n_repeats):
            cv = KFold(n_splits=self.n_splits, random_state=rng, shuffle=True)
            for trainset, testset in cv.split(data):
                yield trainset, testset
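Because every repeat draws from the same rng, the folds differ across repeats. Usage sketch, assuming surprise's RepeatedKFold:

from surprise import Dataset
from surprise.model_selection import RepeatedKFold

data = Dataset.load_builtin('ml-100k')
rkf = RepeatedKFold(n_splits=3, n_repeats=2, random_state=0)

folds = list(rkf.split(data))
assert len(folds) == 6  # n_repeats * n_splits (trainset, testset) pairs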
Example #6
def test_get_rng():

    # two RNGs seeded with the same int must produce the same sequence
    rng_a = get_rng(12)
    rng_b = get_rng(12)
    a = [rng_a.rand() for _ in range(10)]
    b = [rng_b.rand() for _ in range(10)]
    assert a == b

    # assert passing an int returns the corresponding numpy rng instance
    rng_a = get_rng(12)
    rng_b = np.random.RandomState(12)

    a = [rng_a.rand() for _ in range(10)]
    b = [rng_b.rand() for _ in range(10)]
    assert a == b

    # passing None is also valid (falls back to a default RNG)
    get_rng(None)

    with pytest.raises(ValueError):
        get_rng(23.2)
    with pytest.raises(ValueError):
        get_rng('bad')
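The assertions pin down get_rng's contract: None is accepted, an int seeds a np.random.RandomState, and anything else raises ValueError. A sketch consistent with the test (surprise's real implementation also accepts an existing RandomState):

import numbers
import numpy as np

def get_rng(random_state):
    """Return a numpy RNG consistent with the behaviour tested above."""
    if random_state is None:
        return np.random.mtrand._rand  # numpy's global RandomState
    if isinstance(random_state, (numbers.Integral, np.integer)):
        return np.random.RandomState(random_state)
    if isinstance(random_state, np.random.RandomState):
        return random_state
    raise ValueError('Wrong random state. Expecting None, an int or a '
                     'numpy RandomState instance.')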
Example #7
    def split(self, data):
        '''Generator function to iterate over trainsets and testsets.

        Args:
            data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
                ratings that will be divided into trainsets and testsets.

        Yields:
            tuple of (trainset, testset)
        '''

        # map each user id to the list of their ratings
        user_ratings = defaultdict(list)
        for uid, iid, r_ui, _ in data.raw_ratings:
            user_ratings[uid].append((uid, iid, r_ui, None))

        rng = get_rng(self.random_state)

        for _ in range(self.n_splits):
            # for each user, randomly choose a rating and put it in the
            # testset.
            raw_trainset, raw_testset = [], []
            for uid, ratings in user_ratings.items():
                if len(ratings) > self.min_n_ratings:
                    i = rng.randint(0, len(ratings))
                    raw_testset.append(ratings[i])
                    raw_trainset += [
                        rating for (j, rating) in enumerate(ratings) if j != i
                    ]

            if not raw_trainset:
                raise ValueError('Could not build any trainset. Maybe '
                                 'min_n_ratings is too high?')
            trainset = data.construct_trainset(raw_trainset)
            testset = data.construct_testset(raw_testset)

            yield trainset, testset
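Usage sketch, assuming this is surprise's LeaveOneOut: each user with more than min_n_ratings ratings contributes exactly one rating per testset:

from surprise import Dataset
from surprise.model_selection import LeaveOneOut

data = Dataset.load_builtin('ml-100k')
loo = LeaveOneOut(n_splits=2, random_state=0, min_n_ratings=1)

for trainset, testset in loo.split(data):
    uids = [uid for (uid, _, _) in testset]
    assert len(uids) == len(set(uids))  # at most one test rating per user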
Example #8
    def SGD_momentum(self, trainset):
        momentum_all = 0.1
        lr_bu = self.lr_bu
        lr_bi = self.lr_bi
        lr_pu = self.lr_pu
        lr_qi = self.lr_qi
        lr_yj = self.lr_yj
        reg_bu = self.reg_bu
        reg_bi = self.reg_bi
        reg_pu = self.reg_pu
        reg_qi = self.reg_qi
        reg_yj = self.reg_yj
        mom_pu = momentum_all
        mom_qi = momentum_all
        mom_yj = momentum_all
        mom_bu = momentum_all
        mom_bi = momentum_all

        bu = np.zeros(trainset.n_users, np.double)
        bi = np.zeros(trainset.n_items, np.double)
        global_mean = self.trainset.global_mean

        # initialize factors randomly
        rng = get_rng(self.random_state)
        pu = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_users, self.n_factors))
        qi = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_items, self.n_factors))
        yj = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_items, self.n_factors))

        for current_epoch in range(self.n_epochs):
            if self.verbose:
                print(" processing epoch {}".format(current_epoch))
            v_bu = 0
            v_bi = 0
            v_pu = 0
            v_qi = 0
            v_yj = 0
            # u = user, i = item = movie, r = rating
            for u, i, r in trainset.all_ratings():
                # items rated by u. This is COSTLY
                Iu = [j for (j, _) in trainset.ur[u]]
                sqrt_Iu = np.sqrt(len(Iu))

                # compute user implicit feedback
                u_impl_fdb = np.zeros(self.n_factors, np.double)
                for j in Iu:
                    for f in range(self.n_factors):
                        u_impl_fdb[f] += yj[j, f] / sqrt_Iu

                # compute current residual of the SVD++ estimate
                residual = r - global_mean - bu[u] - bi[i] - np.dot(
                    qi[i, :], pu[u, :] + u_impl_fdb)

                # update biases
                v_bu_prior = v_bu
                v_bu = mom_bu * v_bu_prior + (1 - mom_bu) * residual
                bu[u] += lr_bu * (v_bu - reg_bu * bu[u])
                v_bi_prior = v_bi
                v_bi = mom_bi * v_bi_prior + (1 - mom_bi) * residual
                bi[i] += lr_bi * (v_bi - reg_bi * bi[i])

                # update factors
                v_pu_prior = v_pu
                v_pu = mom_pu * v_pu_prior + (1 - mom_pu) * residual
                pu[u, :] += lr_pu * (v_pu * qi[i, :] - reg_pu * pu[u, :])
                v_qi_prior = v_qi
                v_qi = mom_qi * v_qi_prior + (1 - mom_qi) * residual
                qi[i, :] += lr_qi * (v_qi * pu[u, :] - reg_qi * qi[i, :])
                v_yj_prior = v_yj
                v_yj = mom_yj * v_yj_prior + (1 - mom_yj) * residual
                # update the implicit-feedback factors of every item rated
                # by u, not just the last j of the loop above
                for j in Iu:
                    yj[j, :] += lr_yj * (v_yj * qi[i, :] / sqrt_Iu -
                                         reg_yj * yj[j, :])

        self.bu = bu
        self.bi = bi
        self.pu = pu
        self.qi = qi
        self.yj = yj
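The residual above is the error of the SVD++ estimate. The same prediction as a standalone helper (svdpp_predict is written here for illustration; the names mirror the method's locals):

import numpy as np

def svdpp_predict(u, i, Iu, global_mean, bu, bi, pu, qi, yj):
    """SVD++ estimate: mu + b_u + b_i + q_i . (p_u + |I_u|^-1/2 sum_j y_j)."""
    u_impl_fdb = yj[Iu].sum(axis=0) / np.sqrt(len(Iu))
    return global_mean + bu[u] + bi[i] + np.dot(qi[i], pu[u] + u_impl_fdb)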
Example #9
    def SGD_momentum(self, trainset):
        momentum_all = 0.1
        lr_pu = self.lr_pu
        lr_qi = self.lr_qi
        reg_pu = self.reg_pu
        reg_qi = self.reg_qi
        mom_pu = momentum_all
        mom_qi = momentum_all

        # initialize biases to 0; global_mean and the bias hyperparameters
        # are only used when self.biased is set
        bu = np.zeros(trainset.n_users, np.double)
        bi = np.zeros(trainset.n_items, np.double)
        global_mean = 0
        lr_bu = 0
        lr_bi = 0
        reg_bu = 0
        reg_bi = 0
        mom_bu = momentum_all
        mom_bi = momentum_all
        if self.biased:
            global_mean = self.trainset.global_mean
            lr_bu = self.lr_bu
            lr_bi = self.lr_bi
            reg_bu = self.reg_bu
            reg_bi = self.reg_bi

        # initialize factors randomly
        rng = get_rng(self.random_state)
        pu = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_users, self.n_factors))
        qi = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_items, self.n_factors))

        for current_epoch in range(self.n_epochs):
            if self.verbose:
                print("Processing epoch {}".format(current_epoch))
            # u = user, i = item = movie, r = rating
            v_bu = 0
            v_bi = 0
            v_pu = 0
            v_qi = 0
            for u, i, r in trainset.all_ratings():
                # compute current residual
                residual = r - global_mean - bu[u] - bi[i] - np.dot(
                    pu[u, :], qi[i, :])

                # update biases
                if self.biased:
                    v_bu_prior = v_bu
                    v_bu = mom_bu * v_bu_prior + (1 - mom_bu) * residual
                    bu[u] += lr_bu * (v_bu - reg_bu * bu[u])
                    v_bi_prior = v_bi
                    v_bi = mom_bi * v_bi_prior + (1 - mom_bi) * residual
                    bi[i] += lr_bi * (v_bi - reg_bi * bi[i])

                # update factors
                v_pu_prior = v_pu
                v_pu = mom_pu * v_pu_prior + (1 - mom_pu) * residual
                pu[u, :] += lr_pu * (v_pu * qi[i, :] - reg_pu * pu[u, :])
                v_qi_prior = v_qi
                v_qi = mom_qi * v_qi_prior + (1 - mom_qi) * residual
                qi[i, :] += lr_qi * (v_qi * pu[u, :] - reg_qi * qi[i, :])

        self.bu = bu
        self.bi = bi
        self.pu = pu
        self.qi = qi
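Both momentum variants apply the same per-parameter rule. In isolation (momentum_step is an illustrative helper, not library code):

def momentum_step(param, v, residual, lr, reg, m=0.1):
    """One momentum-SGD step: v smooths the residual, then param moves."""
    v = m * v + (1 - m) * residual          # exponential average of the signal
    param = param + lr * (v - reg * param)  # step with L2 shrinkage
    return param, v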
Example #10
    def sgd(self, trainset):

        global_mean = self.trainset.global_mean

        lr_bu = self.lr_bu

        lr_bi = self.lr_bi

        lr_pu = self.lr_pu

        lr_qi = self.lr_qi

        reg_bu = self.reg_bu

        reg_bi = self.reg_bi

        reg_pu = self.reg_pu

        reg_qi = self.reg_qi

        rng = get_rng(self.random_state)

        bu = np.zeros(trainset.n_users, np.double)
        bi = np.zeros(trainset.n_items, np.double)
        pu = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_users, self.n_factors))
        qi = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_items, self.n_factors))

        if not self.biased:
            global_mean = 0

        for current_epoch in range(self.n_epochs):
            print("im here")
            if self.verbose:
                print("Processing epoch {}".format(current_epoch))
            for u, i, r in trainset.all_ratings():
                print("hey")

                # compute current error
                dot = 0  # <q_i, p_u>
                for f in range(self.n_factors):
                    print("yo")
                    dot += qi[i, f] * pu[u, f]
                err = r - (global_mean + bu[u] + bi[i] + dot)

                # update biases
                if self.biased:
                    bu[u] += lr_bu * (err - reg_bu * bu[u])
                    bi[i] += lr_bi * (err - reg_bi * bi[i])

                # update factors
                for f in range(self.n_factors):
                    puf = pu[u, f]
                    qif = qi[i, f]
                    pu[u, f] += lr_pu * (err * qif - reg_pu * puf)
                    qi[i, f] += lr_qi * (err * puf - reg_qi * qif)

        self.bu = bu
        self.bi = bi
        self.pu = pu
        self.qi = qi
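The per-factor loop can be collapsed into NumPy vector operations. A sketch of an equivalent in-place update (sgd_step is illustrative and uses a single lr and reg for brevity):

import numpy as np

def sgd_step(u, i, r, bu, bi, pu, qi, global_mean, lr, reg):
    """One SGD update over all factors at once (arrays modified in place)."""
    err = r - (global_mean + bu[u] + bi[i] + np.dot(qi[i], pu[u]))
    bu[u] += lr * (err - reg * bu[u])
    bi[i] += lr * (err - reg * bi[i])
    pu_u = pu[u].copy()  # keep the old user factors for qi's update
    pu[u] += lr * (err * qi[i] - reg * pu[u])
    qi[i] += lr * (err * pu_u - reg * qi[i])
    return err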
Example #11
    def partial_fit(self, new_ratings, random_state=None, verbose=True):
        rng = get_rng(random_state=random_state)

        # start with the last trained model
        bu = self.bu
        bi = self.bi
        pu = self.pu
        qi = self.qi

        if not self.biased:
            global_mean = 0
        else:
            global_mean = self.trainset.global_mean

        for current_epoch in range(self.n_epochs):
            for u, i, r in new_ratings:
                # compute current error
                dot = 0  # <q_i, p_u>

                # if the user is new, append a new row to `pu` and a new
                # entry to `bu`
                if u > len(pu) - 1:
                    pu = np.concatenate(
                        (pu,
                         rng.normal(self.init_mean, self.init_std_dev,
                                    (1, self.n_factors))),
                        axis=0)
                    bu = np.append(bu, 0)

                # same for a new item
                if i > len(qi) - 1:
                    qi = np.concatenate(
                        (qi,
                         rng.normal(self.init_mean, self.init_std_dev,
                                    (1, self.n_factors))),
                        axis=0)
                    bi = np.append(bi, 0)

                for f in range(self.n_factors):
                    dot += qi[i, f] * pu[u, f]

                # compute the error
                err = r - (global_mean + bu[u] + bi[i] + dot)

                if verbose:
                    print(f'Epoch {current_epoch + 1}/{self.n_epochs}: '
                          f'loss: {abs(err)}')

                # update biases
                if self.biased:
                    bu[u] += self.lr_bu * (err - self.reg_bu * bu[u])
                    bi[i] += self.lr_bi * (err - self.reg_bi * bi[i])

                # update factors
                for f in range(self.n_factors):
                    puf = pu[u, f]
                    qif = qi[i, f]
                    pu[u, f] += self.lr_pu * (err * qif - self.reg_pu * puf)
                    qi[i, f] += self.lr_qi * (err * puf - self.reg_qi * qif)

        self.bu = bu
        self.bi = bi
        self.pu = pu
        self.qi = qi

        return self
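partial_fit is a custom extension rather than part of scikit-surprise. A hypothetical usage sketch (the IncrementalSVD class name and the inner-id convention for new ratings are assumptions):

from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

algo = IncrementalSVD(n_factors=50, random_state=0)  # hypothetical class
algo.fit(trainset)

new_ratings = [(trainset.n_users, 0, 4.0),  # brand-new user rates item 0
               (3, trainset.n_items, 2.5)]  # known user rates a new item
algo.partial_fit(new_ratings, random_state=0, verbose=False)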