Example #1
    def split(self, dataset):

        super(Holdout, self).split(dataset)

        URM = sps.csr_matrix(dataset.get_URM())

        n_users, n_items = dataset.n_users, dataset.n_items
        user_indices = []
        URM_train, URM_test, URM_validation = {}, {}, {}

        # Decide a priori how each user's interactions are randomly split among train, validation and test
        users_to_remove = []
        for user_id in range(n_users):
            assignment = np.random.choice(
                3,
                URM.indptr[user_id + 1] - URM.indptr[user_id],
                replace=True,
                p=[self.train_perc, self.validation_perc, self.test_perc])
            assignments = [assignment == i for i in range(3)]
            #if assignments[2].sum() <= 0:
            #No interactions in test
            #    users_to_remove.append(user_id)
            #if self.with_validation and assignments[1].sum() <= 0:
            #No interactions in validation
            #    users_to_remove.append(user_id)
            if not self.allow_cold_users and assignments[0].sum() <= 0:
                #No interactions in train
                users_to_remove.append(user_id)
            user_indices.append(assignments)

        for URM_name in dataset.get_URM_names():

            URM = dataset.get_URM(URM_name)
            URM = sps.csr_matrix(URM)

            URM_train_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            URM_test_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            if self.with_validation:
                URM_validation_builder = IncrementalSparseMatrix(
                    auto_create_row_mapper=False,
                    n_rows=n_users,
                    auto_create_col_mapper=False,
                    n_cols=n_items)

            users_to_remove_index = 0
            for user_id in range(n_users):

                if (users_to_remove_index < len(users_to_remove)
                        and user_id == users_to_remove[users_to_remove_index]):
                    users_to_remove_index += 1
                    continue

                indices = user_indices[user_id]

                start_user_position = URM.indptr[user_id]
                end_user_position = URM.indptr[user_id + 1]

                user_interaction_items = URM.indices[
                    start_user_position:end_user_position]
                user_interaction_data = URM.data[
                    start_user_position:end_user_position]

                # Test interactions
                user_interaction_items_test = user_interaction_items[
                    indices[2]]
                user_interaction_data_test = user_interaction_data[indices[2]]

                # Remove from test interactions below a given threshold
                mask = user_interaction_data_test > self.test_rating_threshold
                user_interaction_items_test = user_interaction_items_test[mask]
                user_interaction_data_test = user_interaction_data_test[mask]

                URM_test_builder.add_data_lists(
                    [user_id] * len(user_interaction_data_test),
                    user_interaction_items_test, user_interaction_data_test)

                # validation interactions
                if self.with_validation:
                    user_interaction_items_validation = user_interaction_items[
                        indices[1]]
                    user_interaction_data_validation = user_interaction_data[
                        indices[1]]

                    # Remove from validation interactions below a given threshold
                    mask = user_interaction_data_validation > self.test_rating_threshold
                    user_interaction_items_validation = user_interaction_items_validation[
                        mask]
                    user_interaction_data_validation = user_interaction_data_validation[
                        mask]

                    URM_validation_builder.add_data_lists(
                        [user_id] * len(user_interaction_data_validation),
                        user_interaction_items_validation,
                        user_interaction_data_validation)

                # Train interactions
                user_interaction_items_train = user_interaction_items[
                    indices[0]]
                user_interaction_data_train = user_interaction_data[indices[0]]

                URM_train_builder.add_data_lists(
                    [user_id] * len(user_interaction_items_train),
                    user_interaction_items_train, user_interaction_data_train)

            URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
            URM_test[URM_name] = URM_test_builder.get_SparseMatrix()

            if self.with_validation:
                URM_validation[URM_name] = URM_validation_builder.get_SparseMatrix()

        train = Dataset(dataset.get_name(),
                        base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=URM_train,
                        URM_mappers_dict=dataset.get_URM_mappers_dict(),
                        ICM_dict=dataset.get_ICM_dict(),
                        ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(),
                        UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        train.remove_users(users_to_remove)

        test = Dataset(dataset.get_name(),
                       base_folder=dataset.get_base_folder(),
                       postprocessings=dataset.get_postprocessings(),
                       URM_dict=URM_test,
                       URM_mappers_dict=dataset.get_URM_mappers_dict(),
                       ICM_dict=dataset.get_ICM_dict(),
                       ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                       UCM_dict=dataset.get_UCM_dict(),
                       UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        test.remove_users(users_to_remove)

        if self.with_validation:
            validation = Dataset(
                dataset.get_name(),
                base_folder=dataset.get_base_folder(),
                postprocessings=dataset.get_postprocessings(),
                URM_dict=URM_validation,
                URM_mappers_dict=dataset.get_URM_mappers_dict(),
                ICM_dict=dataset.get_ICM_dict(),
                ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                UCM_dict=dataset.get_UCM_dict(),
                UCM_mappers_dict=dataset.get_UCM_mappers_dict())
            validation.remove_users(users_to_remove)
            return train, test, validation
        else:
            return train, test
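
The splitter above decides the partition at the interaction level: for every user, np.random.choice labels each interaction as train, validation, or test, and the three IncrementalSparseMatrix builders collect the corresponding slices. The sketch below illustrates the same idea on a plain scipy.sparse matrix, outside the framework; the function name holdout_split and the mask-and-rebuild approach are illustrative only, not part of the API shown above.

import numpy as np
import scipy.sparse as sps

def holdout_split(URM, train_perc=0.8, val_perc=0.1, test_perc=0.1, seed=42):
    """Randomly assign every stored interaction to train, validation or test."""
    rng = np.random.default_rng(seed)
    URM = sps.csr_matrix(URM)
    # One label per nonzero entry: 0 = train, 1 = validation, 2 = test
    labels = rng.choice(3, size=URM.nnz, p=[train_perc, val_perc, test_perc])

    splits = []
    for label in range(3):
        data = URM.data.copy()
        data[labels != label] = 0.0  # keep only this split's interactions
        split = sps.csr_matrix((data, URM.indices.copy(), URM.indptr.copy()),
                               shape=URM.shape)
        split.eliminate_zeros()
        splits.append(split)
    return splits  # [URM_train, URM_validation, URM_test]

# Tiny usage example: the three splits partition the original interactions
URM = sps.random(5, 8, density=0.4, format="csr", random_state=0)
URM_train, URM_val, URM_test = holdout_split(URM)
assert URM_train.nnz + URM_val.nnz + URM_test.nnz == URM.nnz
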
Example #2
    def split(self, dataset):

        super(ColdItemsHoldout, self).split(dataset)

        n_users, n_items = dataset.n_users, dataset.n_items
        URM_train, URM_test, URM_validation = {}, {}, {}

        items_split = np.random.choice(
            3,
            n_items,
            replace=True,
            p=[self.train_perc, self.validation_perc, self.test_perc])
        train_items = np.arange(n_items)[items_split == 0]
        validation_items = np.arange(n_items)[items_split == 1]
        test_items = np.arange(n_items)[items_split == 2]

        # Track users to remove (those left without train interactions when cold users are not allowed)
        users_to_remove = []

        for URM_name in dataset.get_URM_names():

            URM = dataset.get_URM(URM_name)
            URM = sps.csr_matrix(URM)

            URM_train_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            URM_test_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            if self.with_validation:
                URM_validation_builder = IncrementalSparseMatrix(
                    auto_create_row_mapper=False,
                    n_rows=n_users,
                    auto_create_col_mapper=False,
                    n_cols=n_items)

            for user_id in range(n_users):

                start_user_position = URM.indptr[user_id]
                end_user_position = URM.indptr[user_id + 1]

                user_interaction_items = URM.indices[
                    start_user_position:end_user_position]
                user_interaction_data = URM.data[
                    start_user_position:end_user_position]

                # Test interactions
                indices = np.in1d(user_interaction_items,
                                  test_items,
                                  assume_unique=True)
                user_interaction_items_test = user_interaction_items[indices]
                user_interaction_data_test = user_interaction_data[indices]

                # Remove from test interactions below a given threshold
                mask = user_interaction_data_test > self.test_rating_threshold
                user_interaction_items_test = user_interaction_items_test[mask]
                user_interaction_data_test = user_interaction_data_test[mask]

                URM_test_builder.add_data_lists(
                    [user_id] * len(user_interaction_data_test),
                    user_interaction_items_test, user_interaction_data_test)

                # validation interactions
                if self.with_validation:
                    indices = np.in1d(user_interaction_items,
                                      validation_items,
                                      assume_unique=True)
                    user_interaction_items_validation = user_interaction_items[
                        indices]
                    user_interaction_data_validation = user_interaction_data[
                        indices]

                    # Remove from validation interactions below a given threshold
                    mask = user_interaction_data_validation > self.test_rating_threshold
                    user_interaction_items_validation = user_interaction_items_validation[
                        mask]
                    user_interaction_data_validation = user_interaction_data_validation[
                        mask]

                    URM_validation_builder.add_data_lists(
                        [user_id] * len(user_interaction_data_validation),
                        user_interaction_items_validation,
                        user_interaction_data_validation)

                    #if len(user_interaction_items_validation) <= 0:
                    #    users_to_remove.append(user_id)

                # Train interactions
                indices = np.in1d(user_interaction_items,
                                  train_items,
                                  assume_unique=True)
                user_interaction_items_train = user_interaction_items[indices]
                user_interaction_data_train = user_interaction_data[indices]

                URM_train_builder.add_data_lists(
                    [user_id] * len(user_interaction_items_train),
                    user_interaction_items_train, user_interaction_data_train)

                #if len(user_interaction_items_test) <= 0:
                #    users_to_remove.append(user_id)

                if (not self.allow_cold_users
                        and len(user_interaction_items_train) == 0):
                    users_to_remove.append(user_id)

            URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
            URM_test[URM_name] = URM_test_builder.get_SparseMatrix()

            if self.with_validation:
                URM_validation[URM_name] = URM_validation_builder.get_SparseMatrix()

        train = Dataset(dataset.get_name(),
                        base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=URM_train,
                        URM_mappers_dict=dataset.get_URM_mappers_dict(),
                        ICM_dict=dataset.get_ICM_dict(),
                        ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(),
                        UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        train.remove_users(users_to_remove)

        test = Dataset(dataset.get_name(),
                       base_folder=dataset.get_base_folder(),
                       postprocessings=dataset.get_postprocessings(),
                       URM_dict=URM_test,
                       URM_mappers_dict=dataset.get_URM_mappers_dict(),
                       ICM_dict=dataset.get_ICM_dict(),
                       ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                       UCM_dict=dataset.get_UCM_dict(),
                       UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        test.remove_users(users_to_remove)

        if self.with_validation:
            validation = Dataset(
                dataset.get_name(),
                base_folder=dataset.get_base_folder(),
                postprocessings=dataset.get_postprocessings(),
                URM_dict=URM_validation,
                URM_mappers_dict=dataset.get_URM_mappers_dict(),
                ICM_dict=dataset.get_ICM_dict(),
                ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                UCM_dict=dataset.get_UCM_dict(),
                UCM_mappers_dict=dataset.get_UCM_mappers_dict())
            validation.remove_users(users_to_remove)
            return train, test, validation
        else:
            return train, test
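
In this variant the random choice happens once per item rather than per interaction, so test and validation items are completely unseen ("cold") at training time; the per-user np.in1d calls then route each interaction to the split its item belongs to. The stand-alone sketch below vectorizes that membership test over the whole indices array with np.isin, which is equivalent to the per-user loop; cold_item_split is an illustrative name, not framework API, and the rating-threshold and cold-user handling above are omitted.

import numpy as np
import scipy.sparse as sps

def cold_item_split(URM, train_perc=0.8, val_perc=0.1, test_perc=0.1, seed=42):
    """Partition the item set and keep each partition's interactions in its own URM."""
    rng = np.random.default_rng(seed)
    URM = sps.csr_matrix(URM)
    # One label per item (column): 0 = train, 1 = validation, 2 = test
    item_labels = rng.choice(3, size=URM.shape[1], p=[train_perc, val_perc, test_perc])

    splits = []
    for label in range(3):
        keep_items = np.flatnonzero(item_labels == label)
        data = URM.data.copy()
        # Vectorized membership test, equivalent to the per-user np.in1d calls above
        data[~np.isin(URM.indices, keep_items)] = 0.0
        split = sps.csr_matrix((data, URM.indices.copy(), URM.indptr.copy()),
                               shape=URM.shape)
        split.eliminate_zeros()
        splits.append(split)
    return splits  # [URM_train, URM_validation, URM_test]
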
Example #3
    def split(self, dataset):

        super(WarmItemsKFold, self).split(dataset)

        # Optionally run the k-fold split on just a slice of the initial URM
        if self.percentage_initial_data_to_split < 1.0:
            h = Holdout(train_perc=self.percentage_initial_data_to_split, test_perc=1-self.percentage_initial_data_to_split)
            dataset = h.split(dataset)[0]

        folds = []
        URM = dataset.get_URM().tocoo()
        split_belonging = np.random.choice(self.n_folds, URM.data.size, replace=True)

        for i in range(self.n_folds):

            urm = {}
            urm_mappers = {}
            mask = split_belonging == i
            for URM_name in dataset.get_URM_names():
                URM = dataset.get_URM(URM_name).tocoo()
                # Sort nonzero values by row and column indices so the splits of different URMs stay consistent
                row, col, data = zip(*sorted(zip(URM.row, URM.col, URM.data), key=lambda x: (x[0], x[1])))
                urm[URM_name] = sps.csr_matrix((np.array(data)[mask], (np.array(row)[mask], np.array(col)[mask])),
                                               shape=URM.shape)
                urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)

            folds.append(
                Dataset(dataset.get_name(), base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=urm, URM_mappers_dict=urm_mappers,
                        ICM_dict=dataset.get_ICM_dict(), ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(), UCM_mappers_dict=dataset.get_UCM_mappers_dict()
                        )
            )

        r = []
        for i in range(self.n_folds):
            urm = {}
            urm_mappers = {}
            for URM_name in folds[i].get_URM_names():
                # Keep i-th fold as test and merge the others as train
                urm[URM_name] = folds[(i + 1) % self.n_folds].get_URM(URM_name)
                urm_mappers[URM_name] = folds[(i + 1) % self.n_folds].get_URM_mapper(URM_name)
                for j in range(2, self.n_folds):
                    urm[URM_name] += folds[(i + j) % self.n_folds].get_URM(URM_name)

            train = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                            postprocessings=folds[i].get_postprocessings(),
                            URM_dict=urm, URM_mappers_dict=urm_mappers,
                            ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                            UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            urm = {}
            test_urm = folds[i].get_URM()
            test_urm.sort_indices()
            mask = test_urm.data <= self.test_rating_threshold
            for URM_name in folds[i].get_URM_names():
                urm[URM_name] = folds[i].get_URM(URM_name)
                urm[URM_name].sort_indices()
                urm[URM_name].data[mask] = 0.0
                urm[URM_name].eliminate_zeros()

            test = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                           postprocessings=folds[i].get_postprocessings(),
                           URM_dict=urm, URM_mappers_dict=folds[i].get_URM_mappers_dict(),
                           ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                           UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            if not self.allow_cold_users:
                users_to_remove = np.arange(train.n_users)[np.ediff1d(train.get_URM().tocsr().indptr) <= 0]
                train.remove_users(users_to_remove)
                test.remove_users(users_to_remove)

            r.append((train, test))
        return r
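
Here the folds are drawn over individual interactions: every nonzero of the COO matrix gets a fold label, fold i becomes the test URM, and the remaining folds are merged back into the train URM. A compact stand-alone sketch of that interaction-level k-fold follows; the generator warm_interaction_kfold is illustrative and skips the rating-threshold filtering and cold-user removal shown above.

import numpy as np
import scipy.sparse as sps

def warm_interaction_kfold(URM, n_folds=5, seed=42):
    """Yield (URM_train, URM_test) pairs; each interaction lands in exactly one test fold."""
    rng = np.random.default_rng(seed)
    URM = sps.coo_matrix(URM)
    fold_of_interaction = rng.integers(0, n_folds, size=URM.nnz)

    for fold in range(n_folds):
        test_mask = fold_of_interaction == fold
        URM_test = sps.csr_matrix(
            (URM.data[test_mask], (URM.row[test_mask], URM.col[test_mask])),
            shape=URM.shape)
        # All other folds merged together form the train matrix
        URM_train = sps.csr_matrix(
            (URM.data[~test_mask], (URM.row[~test_mask], URM.col[~test_mask])),
            shape=URM.shape)
        yield URM_train, URM_test

# Usage: every interaction appears in exactly one test fold
URM = sps.random(50, 30, density=0.1, format="csr", random_state=0)
for URM_train, URM_test in warm_interaction_kfold(URM, n_folds=5):
    assert URM_train.nnz + URM_test.nnz == URM.nnz
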
Example #4
    def split(self, dataset):

        super(ColdItemsKFold, self).split(dataset)

        folds = []
        split_belonging = np.random.choice(self.n_folds, dataset.n_items, replace=True)

        for i in range(self.n_folds):

            urm = {}
            urm_mappers = {}
            mask = split_belonging != i
            for URM_name in dataset.get_URM_names():
                URM = dataset.get_URM(URM_name).tocsc(copy=True)
                # Empty the columns of items that do not belong to this fold, so each fold keeps only its own items
                for j in np.arange(URM.shape[1])[mask].tolist():
                    URM.data[URM.indptr[j]:URM.indptr[j + 1]] = 0.0
                URM.eliminate_zeros()
                urm[URM_name] = URM.tocsr()
                urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)

            folds.append(
                Dataset(dataset.get_name(), base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=urm, URM_mappers_dict=urm_mappers,
                        ICM_dict=dataset.get_ICM_dict(), ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(), UCM_mappers_dict=dataset.get_UCM_mappers_dict()
                        )
            )

        r = []
        for i in range(self.n_folds):
            urm = {}
            urm_mappers = {}
            for URM_name in folds[i].get_URM_names():
                # Keep i-th fold as test and merge the others as train
                urm[URM_name] = folds[(i + 1) % self.n_folds].get_URM(URM_name)
                urm_mappers[URM_name] = folds[(i + 1) % self.n_folds].get_URM_mapper(URM_name)
                for j in range(2, self.n_folds):
                    urm[URM_name] += folds[(i + j) % self.n_folds].get_URM(URM_name)

            train = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                            postprocessings=folds[i].get_postprocessings(),
                            URM_dict=urm, URM_mappers_dict=urm_mappers,
                            ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                            UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            urm = {}
            test_urm = folds[i].get_URM()
            test_urm.sort_indices()
            mask = test_urm.data <= self.test_rating_threshold
            for URM_name in folds[i].get_URM_names():
                urm[URM_name] = folds[i].get_URM(URM_name)
                urm[URM_name].sort_indices()
                urm[URM_name].data[mask] = 0.0
                urm[URM_name].eliminate_zeros()

            test = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                           postprocessings=folds[i].get_postprocessings(),
                           URM_dict=urm, URM_mappers_dict=folds[i].get_URM_mappers_dict(),
                           ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                           UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            if not self.allow_cold_users:
                users_to_remove = np.arange(train.n_users)[np.ediff1d(train.get_URM().tocsr().indptr) <= 0]
                train.remove_users(users_to_remove)
                test.remove_users(users_to_remove)

            r.append((train, test))

        return r
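
This k-fold partitions the item set instead of the interactions: fold i keeps only the columns of its own items, which the code implements by zeroing CSC column slices and calling eliminate_zeros(). The sketch below isolates that column-masking step and wraps it into a simple item-level k-fold; keep_item_columns and cold_item_kfold are illustrative helpers, not part of the framework, and they omit the rating-threshold and cold-user handling above.

import numpy as np
import scipy.sparse as sps

def keep_item_columns(URM, items_to_keep):
    """Return a copy of URM where every column outside items_to_keep is emptied."""
    URM = URM.tocsc(copy=True)
    keep = np.zeros(URM.shape[1], dtype=bool)
    keep[items_to_keep] = True
    for item in np.flatnonzero(~keep):
        # Zero the CSC slice of this column, as in the example above
        URM.data[URM.indptr[item]:URM.indptr[item + 1]] = 0.0
    URM.eliminate_zeros()
    return URM.tocsr()

def cold_item_kfold(URM, n_folds=5, seed=42):
    """Yield (URM_train, URM_test) pairs where each fold holds out a disjoint item set."""
    rng = np.random.default_rng(seed)
    URM = sps.csr_matrix(URM)
    fold_of_item = rng.integers(0, n_folds, size=URM.shape[1])
    for fold in range(n_folds):
        train_items = np.flatnonzero(fold_of_item != fold)
        test_items = np.flatnonzero(fold_of_item == fold)
        yield keep_item_columns(URM, train_items), keep_item_columns(URM, test_items)
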
Example #5
    def split(self, dataset):

        super(LeaveKOut, self).split(dataset)

        URM = sps.csr_matrix(dataset.get_URM())
        URM.sort_indices()

        split_number = 2
        if self.with_validation:
            split_number += 1

        # Minimum interactions: at least self.k_value for each split, plus one for train
        min_user_interactions = split_number * (self.k_value - 1) + 1

        users_to_preserve = np.arange(URM.shape[0])
        if not self.allow_cold_users:
            urm_threshold = URM.copy()
            urm_threshold.data[
                urm_threshold.data <= self.test_rating_threshold] = 0
            urm_threshold.eliminate_zeros()

            user_interactions = np.ediff1d(urm_threshold.tocsr().indptr)
            users_to_preserve = users_to_preserve[
                user_interactions >= min_user_interactions]

            print(
                "DataSplitterLeaveKOut: Removing {} of {} users because they have fewer than the {} interactions required for {} splits"
                .format(URM.shape[0] - len(users_to_preserve), URM.shape[0],
                        min_user_interactions, split_number))
        users_to_remove = np.setdiff1d(np.arange(URM.shape[0]),
                                       users_to_preserve)

        n_users, n_items = URM.shape
        user_indices = []
        URM_train, URM_test, URM_validation = {}, {}, {}

        # Decide a priori which of each user's eligible interactions are held out for test and validation
        for user_id in users_to_preserve.tolist():
            start_user_position = URM.indptr[user_id]
            end_user_position = URM.indptr[user_id + 1]
            user_profile = (URM.data[start_user_position:end_user_position]
                            > self.test_rating_threshold)
            test_and_val = np.random.permutation(
                np.arange(end_user_position - start_user_position)[user_profile])

            limit = self.k_value
            if self.with_validation:
                limit = self.k_value * 2

            # Train, Test and Validation
            user_indices.append((np.setdiff1d(np.arange(len(user_profile)),
                                              test_and_val[:limit]),
                                 test_and_val[:self.k_value],
                                 test_and_val[self.k_value:limit]))

        for URM_name in dataset.get_URM_names():

            URM = dataset.get_URM(URM_name).tocsr()
            URM.sort_indices()

            URM_train_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            URM_test_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            if self.with_validation:
                URM_validation_builder = IncrementalSparseMatrix(
                    auto_create_row_mapper=False,
                    n_rows=n_users,
                    auto_create_col_mapper=False,
                    n_cols=n_items)

            for i, user_id in enumerate(users_to_preserve.tolist()):
                start_user_position = URM.indptr[user_id]
                end_user_position = URM.indptr[user_id + 1]

                indices = user_indices[i]
                user_interaction_items = URM.indices[
                    start_user_position:end_user_position]
                user_interaction_data = URM.data[
                    start_user_position:end_user_position]

                # Test interactions
                user_interaction_items_test = user_interaction_items[
                    indices[1]]
                user_interaction_data_test = user_interaction_data[indices[1]]

                URM_test_builder.add_data_lists([user_id] * self.k_value,
                                                user_interaction_items_test,
                                                user_interaction_data_test)

                train_start = self.k_value
                # validation interactions
                if self.with_validation:
                    user_interaction_items_validation = user_interaction_items[
                        indices[2]]
                    user_interaction_data_validation = user_interaction_data[
                        indices[2]]

                    URM_validation_builder.add_data_lists(
                        [user_id] * self.k_value,
                        user_interaction_items_validation,
                        user_interaction_data_validation)
                    train_start = self.k_value * 2

                # Train interactions
                user_interaction_items_train = user_interaction_items[
                    indices[0]]
                user_interaction_data_train = user_interaction_data[indices[0]]

                URM_train_builder.add_data_lists(
                    [user_id] * len(user_interaction_items_train),
                    user_interaction_items_train, user_interaction_data_train)

            URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
            URM_test[URM_name] = URM_test_builder.get_SparseMatrix()

            if self.with_validation:
                URM_validation[URM_name] = URM_validation_builder.get_SparseMatrix()

        train = Dataset(dataset.get_name(),
                        base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=URM_train,
                        URM_mappers_dict=dataset.get_URM_mappers_dict(),
                        ICM_dict=dataset.get_ICM_dict(),
                        ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(),
                        UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        train.remove_users(users_to_remove)

        test = Dataset(dataset.get_name(),
                       base_folder=dataset.get_base_folder(),
                       postprocessings=dataset.get_postprocessings(),
                       URM_dict=URM_test,
                       URM_mappers_dict=dataset.get_URM_mappers_dict(),
                       ICM_dict=dataset.get_ICM_dict(),
                       ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                       UCM_dict=dataset.get_UCM_dict(),
                       UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        test.remove_users(users_to_remove)

        if self.with_validation:
            validation = Dataset(
                dataset.get_name(),
                base_folder=dataset.get_base_folder(),
                postprocessings=dataset.get_postprocessings(),
                URM_dict=URM_validation,
                URM_mappers_dict=dataset.get_URM_mappers_dict(),
                ICM_dict=dataset.get_ICM_dict(),
                ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                UCM_dict=dataset.get_UCM_dict(),
                UCM_mappers_dict=dataset.get_UCM_mappers_dict())
            validation.remove_users(users_to_remove)
            return train, test, validation
        else:
            return train, test
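
The leave-k-out split reduces to permuting each user's interactions that pass the rating threshold and holding out the first k for test (and, when enabled, the next k for validation), with everything else kept for training. The sketch below shows that per-user selection on a bare CSR matrix; leave_k_out_split is an illustrative, simplified two-way version without the framework's Dataset wrapper or validation split.

import numpy as np
import scipy.sparse as sps

def leave_k_out_split(URM, k=1, rating_threshold=0.0, seed=42):
    """Move k randomly chosen interactions per user (above the threshold) into a test URM."""
    rng = np.random.default_rng(seed)
    URM = sps.csr_matrix(URM)
    URM.sort_indices()
    in_test = np.zeros(URM.nnz, dtype=bool)

    for user_id in range(URM.shape[0]):
        start, end = URM.indptr[user_id], URM.indptr[user_id + 1]
        # Only interactions above the rating threshold are eligible for the test set
        eligible = np.flatnonzero(URM.data[start:end] > rating_threshold)
        if len(eligible) <= k:  # too few eligible interactions: keep the user in train only
            continue
        held_out = rng.permutation(eligible)[:k]
        in_test[start + held_out] = True

    def rebuild(mask):
        data = np.where(mask, URM.data, 0.0)
        matrix = sps.csr_matrix((data, URM.indices.copy(), URM.indptr.copy()),
                                shape=URM.shape)
        matrix.eliminate_zeros()
        return matrix

    return rebuild(~in_test), rebuild(in_test)  # URM_train, URM_test
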