Ejemplo n.º 1
0
    def split(self, dataset):
        """Split ``dataset`` into ``self.n_folds`` (train, test) pairs with disjoint item sets.

        Every item is randomly assigned to exactly one fold and all of its
        interactions go to that fold. For the i-th pair, fold ``i`` is used as
        test and the sum of all the other folds as train, so the test items
        have no interaction in train.

        Test interactions whose value in the default URM is
        ``<= self.test_rating_threshold`` are discarded. When
        ``self.allow_cold_users`` is false, users without any train
        interaction are removed from both train and test.

        Args:
            dataset: Dataset to split. Its ``n_items`` must match the number
                of columns of every URM it holds.

        Returns:
            list: ``self.n_folds`` tuples ``(train, test)`` of ``Dataset``.
        """

        super(ColdItemsKFold, self).split(dataset)

        folds = []
        # Fold index drawn independently (with replacement) for every item.
        split_belonging = np.random.choice(self.n_folds, dataset.n_items, replace=True)

        for i in range(self.n_folds):

            urm = {}
            urm_mappers = {}
            # True for the items that do NOT belong to fold i.
            mask = split_belonging != i
            for URM_name in dataset.get_URM_names():
                URM = dataset.get_URM(URM_name).tocsc(copy=True)
                # Expand the per-item mask to one flag per stored value: in CSC
                # the values of column j are contiguous and np.diff(indptr)[j]
                # is their count. This wipes all foreign columns in one shot
                # instead of looping over the items in Python.
                drop = np.repeat(mask, np.diff(URM.indptr))
                # Slice to the addressed prefix so spare capacity in `data`
                # (if any) cannot cause a length mismatch.
                URM.data[:drop.size][drop] = 0.0
                URM.eliminate_zeros()
                urm[URM_name] = URM.tocsr()
                urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)

            folds.append(
                Dataset(dataset.get_name(), base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=urm, URM_mappers_dict=urm_mappers,
                        ICM_dict=dataset.get_ICM_dict(), ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(), UCM_mappers_dict=dataset.get_UCM_mappers_dict()
                        )
            )

        r = []
        for i in range(self.n_folds):
            urm = {}
            urm_mappers = {}
            for URM_name in folds[i].get_URM_names():
                # Keep i-th fold as test and merge the others as train
                urm[URM_name] = folds[(i + 1) % self.n_folds].get_URM(URM_name)
                urm_mappers[URM_name] = folds[(i + 1) % self.n_folds].get_URM_mapper(URM_name)
                for j in range(2, self.n_folds):
                    urm[URM_name] += folds[(i + j) % self.n_folds].get_URM(URM_name)

            train = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                            postprocessings=folds[i].get_postprocessings(),
                            URM_dict=urm, URM_mappers_dict=urm_mappers,
                            ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                            UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            urm = {}
            test_urm = folds[i].get_URM()
            test_urm.sort_indices()
            # Entries of the default URM that are too low to count as test
            # interactions. The same positional mask is applied to every URM
            # below, which assumes all URMs of a fold store their entries in
            # the same positions -- TODO confirm against Dataset.
            mask = test_urm.data <= self.test_rating_threshold
            for URM_name in folds[i].get_URM_names():
                # Work on a copy: fold i is merged into the train set of the
                # other splits, so it must not lose its low-valued entries here
                # (matters if get_URM hands back the fold's own matrix).
                urm[URM_name] = folds[i].get_URM(URM_name).copy()
                urm[URM_name].sort_indices()
                urm[URM_name].data[mask] = 0.0
                urm[URM_name].eliminate_zeros()

            test = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                           postprocessings=folds[i].get_postprocessings(),
                           URM_dict=urm, URM_mappers_dict=folds[i].get_URM_mappers_dict(),
                           ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                           UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            if not self.allow_cold_users:
                # Users whose CSR row in the default train URM stores no entry.
                users_to_remove = np.arange(train.n_users)[np.ediff1d(train.get_URM().tocsr().indptr) <= 0]
                train.remove_users(users_to_remove)
                test.remove_users(users_to_remove)

            r.append((train, test))

        return r
Ejemplo n.º 2
0
    def split(self, dataset):
        """Split ``dataset`` into ``self.n_folds`` (train, test) pairs by interaction.

        Every stored interaction is randomly assigned to one fold. For the
        i-th pair, fold ``i`` is used as test and the sum of all the other
        folds as train, so the same item can appear in both.

        If ``self.percentage_initial_data_to_split`` is below 1.0, only the
        first part returned by a ``Holdout`` split of that size is k-folded.

        Test interactions whose value in the default URM is
        ``<= self.test_rating_threshold`` are discarded. When
        ``self.allow_cold_users`` is false, users without any train
        interaction are removed from both train and test.

        Args:
            dataset: Dataset to split. Every URM it holds must store as many
                entries as the default URM, since one assignment vector is
                shared by all of them.

        Returns:
            list: ``self.n_folds`` tuples ``(train, test)`` of ``Dataset``.
        """

        super(WarmItemsKFold, self).split(dataset)

        # I can do the kfold of a slice of the initial URM!
        if self.percentage_initial_data_to_split < 1.0:
            h = Holdout(train_perc=self.percentage_initial_data_to_split, test_perc=1-self.percentage_initial_data_to_split)
            dataset = h.split(dataset)[0]

        folds = []
        URM = dataset.get_URM().tocoo()
        # Fold index drawn independently (with replacement) for every stored
        # interaction of the default URM.
        split_belonging = np.random.choice(self.n_folds, URM.data.size, replace=True)

        # Sort the entries of every URM by (row, col) once, so that position k
        # refers to the same interaction in all URMs and the shared assignment
        # stays consistent. The order does not depend on the fold, hence it is
        # computed outside the fold loop; np.lexsort is stable and also copes
        # with an empty URM.
        sorted_entries = {}
        for URM_name in dataset.get_URM_names():
            URM = dataset.get_URM(URM_name).tocoo()
            order = np.lexsort((URM.col, URM.row))
            sorted_entries[URM_name] = (URM.row[order], URM.col[order], URM.data[order], URM.shape)

        for i in range(self.n_folds):

            urm = {}
            urm_mappers = {}
            # True for the interactions that belong to fold i.
            mask = split_belonging == i
            for URM_name in dataset.get_URM_names():
                row, col, data, shape = sorted_entries[URM_name]
                urm[URM_name] = sps.csr_matrix((data[mask], (row[mask], col[mask])), shape=shape)
                urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)

            folds.append(
                Dataset(dataset.get_name(), base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=urm, URM_mappers_dict=urm_mappers,
                        ICM_dict=dataset.get_ICM_dict(), ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(), UCM_mappers_dict=dataset.get_UCM_mappers_dict()
                        )
            )

        r = []
        for i in range(self.n_folds):
            urm = {}
            urm_mappers = {}
            for URM_name in folds[i].get_URM_names():
                # Keep i-th fold as test and merge the others as train
                urm[URM_name] = folds[(i + 1) % self.n_folds].get_URM(URM_name)
                urm_mappers[URM_name] = folds[(i + 1) % self.n_folds].get_URM_mapper(URM_name)
                for j in range(2, self.n_folds):
                    urm[URM_name] += folds[(i + j) % self.n_folds].get_URM(URM_name)

            train = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                            postprocessings=folds[i].get_postprocessings(),
                            URM_dict=urm, URM_mappers_dict=urm_mappers,
                            ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                            UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            urm = {}
            test_urm = folds[i].get_URM()
            test_urm.sort_indices()
            # Entries of the default URM that are too low to count as test
            # interactions; the same positional mask is applied to every URM.
            mask = test_urm.data <= self.test_rating_threshold
            for URM_name in folds[i].get_URM_names():
                # Work on a copy: fold i is merged into the train set of the
                # other splits, so it must not lose its low-valued entries here
                # (matters if get_URM hands back the fold's own matrix).
                urm[URM_name] = folds[i].get_URM(URM_name).copy()
                urm[URM_name].sort_indices()
                urm[URM_name].data[mask] = 0.0
                urm[URM_name].eliminate_zeros()

            test = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                           postprocessings=folds[i].get_postprocessings(),
                           URM_dict=urm, URM_mappers_dict=folds[i].get_URM_mappers_dict(),
                           ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                           UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            if not self.allow_cold_users:
                # Users whose CSR row in the default train URM stores no entry.
                users_to_remove = np.arange(train.n_users)[np.ediff1d(train.get_URM().tocsr().indptr) <= 0]
                train.remove_users(users_to_remove)
                test.remove_users(users_to_remove)

            r.append((train, test))
        return r