Code example #1
    # Assumes module-level imports of os, pandas as pd, scipy.sparse as sps,
    # and the repo helpers used below (load_data_dict, save_data_dict,
    # reshapeSparse, split_train_validation_test_VAE_CF,
    # NetflixPrizeReader_DataManager)
    def __init__(self):

        super(NetflixPrizeReader, self).__init__()

        pre_splitted_path = "Data_manager_split_datasets/NetflixPrize/WWW/MultiVAE_our_interface/"

        pre_splitted_filename = "splitted_data"

        # Create the directory if it does not exist
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("NetflixPrizeReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("NetflixPrizeReader: Pre-splitted data not found, building new one")

            data_reader = NetflixPrizeReader_DataManager()
            data_reader.load_data()

            URM_all = data_reader.get_URM_all()

            # Binarize the data: keep only ratings >= 4; the comparison turns the
            # data array into booleans, and eliminate_zeros() drops the False entries
            URM_all.data = URM_all.data >= 4.0
            URM_all.eliminate_zeros()


            # Convert to COO format to expose the row/col/data index arrays
            URM_all = sps.coo_matrix(URM_all)

            dict_for_dataframe = {"userId": URM_all.row,
                                  "movieId": URM_all.col,
                                  "rating": URM_all.data
                                }

            URM_all_dataframe = pd.DataFrame(data = dict_for_dataframe)


            self.URM_train, self.URM_train_all, self.URM_validation, self.URM_test = split_train_validation_test_VAE_CF(URM_all_dataframe,
                                                                                                                         n_heldout_users = 40000)


            n_rows = max(self.URM_train.shape[0], self.URM_train_all.shape[0], self.URM_validation.shape[0], self.URM_test.shape[0])
            n_cols = max(self.URM_train.shape[1], self.URM_train_all.shape[1], self.URM_validation.shape[1], self.URM_test.shape[1])

            newShape = (n_rows, n_cols)

            # Reshape all splits, including the validation one, to the common shape
            self.URM_train = reshapeSparse(self.URM_train, newShape)
            self.URM_train_all = reshapeSparse(self.URM_train_all, newShape)
            self.URM_validation = reshapeSparse(self.URM_validation, newShape)
            self.URM_test = reshapeSparse(self.URM_test, newShape)

            data_dict = {
                "URM_train": self.URM_train,
                "URM_train_all": self.URM_train_all,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("NetflixPrizeReader: Dataset loaded")
Code example #2
    # Assumes module-level imports of os, numpy as np, pandas as pd,
    # scipy.sparse as sps, and the repo helpers used below (load_data_dict,
    # save_data_dict, reshapeSparse, the two split functions, and
    # Movielens20MReader_DataManager)
    def __init__(self, split_type="cold_user"):

        super(Movielens20MReader, self).__init__()

        assert split_type in ["cold_user", "warm_user"]

        pre_splitted_path = "Data_manager_split_datasets/Movielens20M/WWW/MultiVAE_our_interface/"

        pre_splitted_filename = "splitted_data" + "_" + split_type

        # Create the directory if it does not exist
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Movielens20MReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("Movielens20MReader: Pre-splitted data not found, building new one")

            data_reader = Movielens20MReader_DataManager()
            data_reader.load_data()

            URM_all = data_reader.get_URM_all()

            # Binarize the data: keep only ratings >= 4; the comparison turns the
            # data array into booleans, and eliminate_zeros() drops the False entries
            URM_all.data = URM_all.data >= 4.0
            URM_all.eliminate_zeros()

            if split_type == "cold_user":

                # Convert to COO format to expose the row/col/data index arrays
                URM_all = sps.coo_matrix(URM_all)

                dict_for_dataframe = {"userId": URM_all.row,
                                      "movieId": URM_all.col,
                                      "rating": URM_all.data
                                      }

                URM_all_dataframe = pd.DataFrame(data=dict_for_dataframe)

                self.URM_train, self.URM_train_all, self.URM_validation, self.URM_test = split_train_validation_test_VAE_CF(
                    URM_all_dataframe,
                    n_heldout_users=10000)

                n_rows = max(self.URM_train.shape[0], self.URM_train_all.shape[0], self.URM_validation.shape[0],
                             self.URM_test.shape[0])
                n_cols = max(self.URM_train.shape[1], self.URM_train_all.shape[1], self.URM_validation.shape[1],
                             self.URM_test.shape[1])

                newShape = (n_rows, n_cols)

                # Reshape all splits, including the validation one, to the common shape
                self.URM_train = reshapeSparse(self.URM_train, newShape)
                self.URM_train_all = reshapeSparse(self.URM_train_all, newShape)
                self.URM_validation = reshapeSparse(self.URM_validation, newShape)
                self.URM_test = reshapeSparse(self.URM_test, newShape)

                data_dict = {
                    "URM_train": self.URM_train,
                    "URM_train_all": self.URM_train_all,
                    "URM_test": self.URM_test,
                    "URM_validation": self.URM_validation,
                }



            elif split_type == "warm_user":

                # np.ediff1d on the CSR indptr gives the interaction count per user;
                # keep only users with at least 4 interactions
                URM_all = sps.csr_matrix(URM_all)
                users_to_keep = np.ediff1d(URM_all.indptr) >= 4
                URM_all = URM_all[users_to_keep, :]

                # Same trick on the CSC indptr: keep only items with at least one interaction
                URM_all = sps.csc_matrix(URM_all)
                items_to_keep = np.ediff1d(URM_all.indptr) >= 1
                URM_all = URM_all[:, items_to_keep]

                # Back to CSR for the user-wise split
                URM_all = sps.csr_matrix(URM_all)

                self.URM_train, self.URM_validation, self.URM_test, _ = split_train_validation_test_negative_leave_one_out_user_wise(
                    URM_all)

                data_dict = {
                    "URM_train": self.URM_train,
                    "URM_test": self.URM_test,
                    "URM_validation": self.URM_validation,
                }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("Movielens20MReader: Dataset loaded")