    def _load_data_file(self, filePath, separator=" "):

        URM_builder = IncrementalSparseMatrix(auto_create_row_mapper=False,
                                              auto_create_col_mapper=False)

        fileHandle = open(filePath, "r")
        user_index = 0

        for line in fileHandle:

            if user_index % 1000000 == 0:
                print("Processed {} rows".format(user_index))

            if len(line) > 1:

                line = line.replace("\n", "")
                line = line.split(separator)

                if len(line) > 0:

                    if line[0] != "0":

                        line = [int(line[i]) for i in range(len(line))]

                        URM_builder.add_single_row(user_index,
                                                   line[1:],
                                                   data=1.0)

            user_index += 1

        fileHandle.close()

        return URM_builder
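
A minimal usage sketch for the loader above. It is hypothetical: 'reader' stands for an instance of the data-reader class this method belongs to, and the file path is only illustrative.

# Hypothetical usage of the private loader; 'reader' and the path are placeholders
URM_builder = reader._load_data_file("Data/train_interactions.txt", separator=" ")
URM_train = URM_builder.get_SparseMatrix().tocsr()
print("Loaded URM with shape {} and {} interactions".format(URM_train.shape, URM_train.nnz))
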
Example 2
def split_train_validation_percentage_random_holdout(URM_train,
                                                     train_percentage=0.8):

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users,
                                                n_cols=num_items)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users,
                                                     n_cols=num_items)

    URM_train = sps.coo_matrix(URM_train)

    train_mask = np.random.rand(URM_train.nnz) <= train_percentage
    validation_mask = np.logical_not(train_mask)

    URM_train_builder.add_data_lists(URM_train.row[train_mask],
                                     URM_train.col[train_mask],
                                     URM_train.data[train_mask])
    URM_validation_builder.add_data_lists(URM_train.row[validation_mask],
                                          URM_train.col[validation_mask],
                                          URM_train.data[validation_mask])

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
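
A minimal usage sketch for the random holdout above, assuming the function is importable from the repository's split utilities (the exact module path may differ):

import numpy as np
import scipy.sparse as sps

# Small synthetic implicit-feedback matrix to demonstrate the 80/20 holdout
URM_all = sps.random(1000, 2000, density=0.05, format="csr")
URM_all.data = np.ones_like(URM_all.data)

URM_train, URM_validation = split_train_validation_percentage_random_holdout(URM_all, train_percentage=0.8)

# The two matrices partition the original interactions
assert URM_train.nnz + URM_validation.nnz == URM_all.nnz
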
Example 3
    def __init__(self, path):
        '''
        Constructor
        '''
        trainMatrix = self.load_rating_file_as_matrix(path + ".train.rating")
        testRatings = self.load_rating_file_as_matrix(path + ".test.rating")
        testNegatives = self.load_negative_file(path + ".test.negative")
        assert len(testRatings) == len(testNegatives)

        self.num_users, self.num_items = trainMatrix.shape

        from Base.Recommender_utils import reshapeSparse

        self.URM_train = trainMatrix.tocsr()
        self.URM_test = testRatings.tocsr()

        shape = (max(self.URM_train.shape[0], self.URM_test.shape[0]),
                 max(self.URM_train.shape[1], self.URM_test.shape[1]))

        self.URM_train = reshapeSparse(self.URM_train, shape)
        self.URM_test = reshapeSparse(self.URM_test, shape)

        URM_test_negatives_builder = IncrementalSparseMatrix(n_rows=shape[0],
                                                             n_cols=shape[1])

        for user_index in range(len(testNegatives)):
            user_test_items = testNegatives[user_index]

            URM_test_negatives_builder.add_single_row(user_index,
                                                      user_test_items,
                                                      data=1.0)

        self.URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
Example 4
    def _loadICM(self, filePath, header=False, separator="|"):

        ICM_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)

        fileHandle = open(filePath, "r", encoding="latin1")
        numCells = 0

        if header:
            fileHandle.readline()

        for line in fileHandle:
            numCells += 1
            if (numCells % 1000000 == 0):
                print("Processed {} cells".format(numCells))

            if len(line) > 1:
                line = line.split(separator)

                line[-1] = line[-1].replace("\n", "")

                # line[5:] is assumed to hold binary genre flags (ML-100K u.item layout):
                # keep the index of each active flag as the feature column
                genre_list = [genre_index for genre_index, genre_bit in enumerate(line[5:]) if int(genre_bit) == 1]
                item_id = int(line[0])-1

                ICM_builder.add_data_lists([item_id]*len(genre_list), genre_list, [1.0]*len(genre_list))


        fileHandle.close()

        return ICM_builder
Example 5
    def test_IncrementalSparseMatrix_add_rows(self):

        import numpy as np

        n_rows = 100
        n_cols = 200

        randomMatrix = sps.random(n_rows, n_cols, density=0.01, format="csr")

        incrementalMatrix = IncrementalSparseMatrix(n_rows=n_rows,
                                                    n_cols=n_cols)

        for row in range(n_rows):

            row_data = randomMatrix.indices[randomMatrix.indptr[row]:randomMatrix.indptr[row + 1]]

            incrementalMatrix.add_single_row(row, row_data, 5.0)

        randomMatrix.data = np.ones_like(randomMatrix.data) * 5.0

        randomMatrix_incremental = incrementalMatrix.get_SparseMatrix()

        assert sparse_are_equals(randomMatrix, randomMatrix_incremental)
Example 6
def split_train_validation_percentage_user_wise(URM_train, train_percentage=0.1, verbose=True):
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    # Ensure CSR format, otherwise the indptr/indices access below will fail
    URM_train = URM_train.tocsr()

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)

    user_no_item_train = 0
    user_no_item_validation = 0

    for user_id in range(URM_train.shape[0]):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = round(user_profile_length * train_percentage)

        if n_train_items == len(user_profile_items) and n_train_items > 1:
            n_train_items -= 1

        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose: print("User {} has 0 train items".format(user_id))
            user_no_item_train += 1

        if len(validation_items) == 0:
            if verbose: print("User {} has 0 validation items".format(user_id))
            user_no_item_validation += 1

        URM_train_builder.add_data_lists([user_id] * len(train_items), train_items, train_ratings)
        URM_validation_builder.add_data_lists([user_id] * len(validation_items), validation_items, validation_ratings)

    if user_no_item_train != 0:
        print("Warning split: {} users with 0 train items ({} total users)".format(user_no_item_train,
                                                                                   URM_train.shape[0]))
    if user_no_item_validation != 0:
        print("Warning split: {} users with 0 validation items ({} total users)".format(user_no_item_validation,
                                                                                        URM_train.shape[0]))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
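
A usage sketch for the user-wise split above, reusing the synthetic URM_all and the np/sps imports from the earlier sketch. Here the holdout is sampled independently inside each user profile:

URM_train, URM_validation = split_train_validation_percentage_user_wise(URM_all, train_percentage=0.8, verbose=False)

# Each user keeps roughly 80% of their profile in train, and no interaction is lost
train_counts = np.ediff1d(sps.csr_matrix(URM_train).indptr)
validation_counts = np.ediff1d(sps.csr_matrix(URM_validation).indptr)
original_counts = np.ediff1d(sps.csr_matrix(URM_all).indptr)
assert np.all(train_counts + validation_counts == original_counts)
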
Example 7
def split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=50, verbose=True,
                                                                 at_least_n_train_items_test=0,
                                                                 at_least_n_train_items_validation=0):
    """
    This function creates a Train, Test, Validation split with negative items sampled
    The split is perfomed user-wise, hold 1 out for validation and test
    :param URM_all:
    :param negative_items_per_positive:
    :return:
    """

    URM_all = sps.csr_matrix(URM_all)

    n_rows, n_cols = URM_all.shape

    print('Creating test split...')
    URM_train_all, URM_test = split_train_validation_leave_one_out_user_wise(URM_all,
                                                                             at_least_n_train_items=at_least_n_train_items_test,
                                                                             verbose=verbose)

    print('Creating validation split...')
    URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train_all,
                                                                               at_least_n_train_items=at_least_n_train_items_validation,
                                                                               verbose=verbose)

    URM_negative_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)

    all_items = np.arange(0, n_cols, dtype=int)

    for user_index in range(URM_train_all.shape[0]):

        if user_index % 10000 == 0:
            print("split_data_train_validation_test_negative: user {} of {}".format(user_index, URM_all.shape[0]))

        start_pos = URM_all.indptr[user_index]
        end_pos = URM_all.indptr[user_index + 1]

        user_profile = URM_all.indices[start_pos:end_pos]

        unobserved_index = np.in1d(all_items, user_profile, assume_unique=True, invert=True)

        unobserved_items = all_items[unobserved_index]
        np.random.shuffle(unobserved_items)

        n_test_items = URM_test.indptr[user_index + 1] - URM_test.indptr[user_index]

        num_negative_items = n_test_items * negative_items_per_positive

        if num_negative_items > len(unobserved_items):
            print(
                "split_data_train_validation_test_negative: WARNING the number of negative items to sample ({}) for user {} exceeds the available negative items ({})".format(
                    num_negative_items, user_index, len(unobserved_items)))
            num_negative_items = min(num_negative_items, len(unobserved_items))

        URM_negative_builder.add_single_row(user_index, unobserved_items[:num_negative_items], 1.0)

    URM_negative = URM_negative_builder.get_SparseMatrix()

    return URM_train, URM_validation, URM_test, URM_negative
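
A usage sketch for the leave-one-out split with sampled negatives, again reusing the synthetic URM_all and the np/sps imports from the earlier sketches:

URM_train, URM_validation, URM_test, URM_negative = \
    split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=99, verbose=False)

# URM_negative holds, for each user, the unobserved items sampled for ranking evaluation
negatives_per_user = np.ediff1d(sps.csr_matrix(URM_negative).indptr)
print("Average sampled negatives per user: {:.1f}".format(negatives_per_user.mean()))
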
def split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.1):
    """
    The function splits an URM in two matrices selecting the number of interactions globally
    :param URM_all:
    :param train_percentage:
    :param verbose:
    :return:
    """

    assert train_percentage >= 0.0 and train_percentage<=1.0, "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)


    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_all.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)


    URM_train = sps.coo_matrix(URM_all)

    indices_for_sampling = np.arange(0, URM_all.nnz, dtype=int)
    np.random.shuffle(indices_for_sampling)

    n_train_interactions = round(URM_all.nnz * train_percentage)

    indices_for_train = indices_for_sampling[0:n_train_interactions]
    indices_for_validation = indices_for_sampling[n_train_interactions:]


    URM_train_builder.add_data_lists(URM_train.row[indices_for_train],
                                     URM_train.col[indices_for_train],
                                     URM_train.data[indices_for_train])

    URM_validation_builder.add_data_lists(URM_train.row[indices_for_validation],
                                          URM_train.col[indices_for_validation],
                                          URM_train.data[indices_for_validation])


    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    URM_train = sps.csr_matrix(URM_train)
    URM_validation = sps.csr_matrix(URM_validation)

    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)
    user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(user_no_item_train, user_no_item_train/num_users*100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(user_no_item_validation, user_no_item_validation/num_users*100, num_users))


    return URM_train, URM_validation
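
A usage sketch for the global-sample split above (np, sps and URM_all as in the earlier sketches):

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.8)

# Interactions are sampled globally, so individual users may end up with an empty train or validation profile,
# which is what the warnings at the end of the function report
assert URM_train.nnz + URM_validation.nnz == URM_all.nnz
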
Example 9
def split_train_validation_leave_one_out_user_wise(URM_train, verbose=True, at_least_n_train_items=0):
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)

    count_train = 0
    count_validation = 0
    for user_id in range(URM_train.shape[0]):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = user_profile_length

        if n_train_items > at_least_n_train_items:
            n_train_items -= 1

        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose: print("User {} has 0 train items".format(user_id))
            count_train += 1

        if len(validation_items) == 0:
            if verbose: print("User {} has 0 validation items".format(user_id))
            count_validation += 1

        URM_train_builder.add_data_lists([user_id] * len(train_items), train_items, train_ratings)
        URM_validation_builder.add_data_lists([user_id] * len(validation_items), validation_items, validation_ratings)

    if count_train > 0:
        print("{} users with 0 train items".format(count_train))
    if count_validation > 0:
        print("{} users with 0 validation items".format(count_validation))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
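
A usage sketch for the leave-one-out split above; its defining property is that each user contributes at most one interaction to the validation matrix:

URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_all, verbose=False)

# At most one held-out interaction per user
validation_counts = np.ediff1d(sps.csr_matrix(URM_validation).indptr)
assert np.all(validation_counts <= 1)
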
Example 10
def split_data_train_validation_test_negative_user_wise(URM_all, negative_items_per_positive=50):
    """
    This function creates a Train, Test, Validation split with sampled negative items
    The split is performed user-wise: 20% is test, 80% is train. Train is further divided into 90% final train and 10% validation
    :param URM_all:
    :param negative_items_per_positive:
    :return:
    """

    URM_all = sps.csr_matrix(URM_all)

    n_rows, n_cols = URM_all.shape

    URM_train_all, URM_test = split_train_validation_percentage_user_wise(URM_all, train_percentage=0.8)

    URM_train, URM_validation = split_train_validation_percentage_user_wise(URM_train_all, train_percentage=0.9)

    URM_negative_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)

    all_items = np.arange(0, n_cols, dtype=int)

    for user_index in range(URM_train_all.shape[0]):

        if user_index % 10000 == 0:
            print("split_data_train_validation_test_negative: user {} of {}".format(user_index, URM_all.shape[0]))

        start_pos = URM_all.indptr[user_index]
        end_pos = URM_all.indptr[user_index + 1]

        user_profile = URM_all.indices[start_pos:end_pos]

        unobserved_index = np.in1d(all_items, user_profile, assume_unique=True, invert=True)

        unobserved_items = all_items[unobserved_index]
        np.random.shuffle(unobserved_items)

        n_test_items = URM_test.indptr[user_index + 1] - URM_test.indptr[user_index]

        num_negative_items = n_test_items * negative_items_per_positive

        if num_negative_items > len(unobserved_items):
            print(
                "split_data_train_validation_test_negative: WARNING the number of negative items to sample ({}) for user {} exceeds the available negative items ({})".format(
                    num_negative_items, user_index, len(unobserved_items)))
            num_negative_items = min(num_negative_items, len(unobserved_items))

        URM_negative_builder.add_single_row(user_index, unobserved_items[:num_negative_items], 1.0)

    URM_negative = URM_negative_builder.get_SparseMatrix()

    return URM_train, URM_validation, URM_test, URM_negative
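
A usage sketch for the percentage-based split with sampled negatives defined above (URM_all as in the earlier sketches):

URM_train, URM_validation, URM_test, URM_negative = \
    split_data_train_validation_test_negative_user_wise(URM_all, negative_items_per_positive=50)

# Roughly 20% of each profile goes to test, the rest is split 90/10 into train and validation,
# with 50 sampled negatives per positive test interaction
print("Train: {}, Validation: {}, Test: {}, Negatives: {}".format(
    URM_train.nnz, URM_validation.nnz, URM_test.nnz, URM_negative.nnz))
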
Example 11
    def test_IncrementalSparseMatrix_add_lists(self):

        n_rows = 100
        n_cols = 200

        randomMatrix = sps.random(n_rows, n_cols, density=0.01, format="coo")

        incrementalMatrix = IncrementalSparseMatrix(n_rows=n_rows,
                                                    n_cols=n_cols)

        incrementalMatrix.add_data_lists(randomMatrix.row.copy(),
                                         randomMatrix.col.copy(),
                                         randomMatrix.data.copy())

        randomMatrix_incremental = incrementalMatrix.get_SparseMatrix()

        assert sparse_are_equals(randomMatrix, randomMatrix_incremental)
Example 12
def load_CSV_into_SparseBuilder(filePath, header=False, separator="::"):

    matrixBuilder = IncrementalSparseMatrix(auto_create_col_mapper=True,
                                            auto_create_row_mapper=True)

    fileHandle = open(filePath, "r")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if (numCells % 1000000 == 0):
            print("Processed {} cells".format(numCells))

        if len(line) > 1:
            line = line.split(separator)

            line[-1] = line[-1].replace("\n", "")

            try:
                user_id = line[0]
                item_id = line[1]

                try:
                    value = float(line[2])

                    if value != 0.0:

                        matrixBuilder.add_data_lists([user_id], [item_id],
                                                     [value])

                except ValueError:
                    print(
                        "load_CSV_into_SparseBuilder: Cannot parse as float value '{}'"
                        .format(line[2]))

            except IndexError:
                print(
                    "load_CSV_into_SparseBuilder: Index out of bound in line '{}'"
                    .format(line))

    fileHandle.close()

    return matrixBuilder.get_SparseMatrix(), matrixBuilder.get_column_token_to_id_mapper(), matrixBuilder.get_row_token_to_id_mapper()
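
A usage sketch for the CSV loader above; the file path is only illustrative, and the returned mappers translate the original string ids into matrix row/column indices:

# Hypothetical MovieLens-style '::'-separated ratings file
URM_all, item_original_ID_to_index, user_original_ID_to_index = \
    load_CSV_into_SparseBuilder("ml-1m/ratings.dat", header=False, separator="::")

print("Users: {}, Items: {}".format(len(user_original_ID_to_index), len(item_original_ID_to_index)))
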
    def __init__(self, pre_splitted_path):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            compressed_file_folder = "Conferences/IJCAI/ConvNCF_github/Data/"
            decompressed_file_folder = "Data_manager_split_datasets/Gowalla/"

            # compressed_file = tarfile.open(compressed_file_folder + "gowalla.test.negative.gz", "r:gz")
            # compressed_file.extract("yelp.test.negative", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "gowalla.test.rating.gz", "r:gz")
            # compressed_file.extract("yelp.test.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "gowalla.train.rating.gz", "r:gz")
            # compressed_file.extract("yelp.train.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()

            # if original:

            # Monkey-patch the original Dataset class so the test ratings file is loaded as a sparse matrix instead of a list
            Dataset_github.load_rating_file_as_list = Dataset_github.load_training_file_as_matrix

            try:
                dataset = Dataset_github(compressed_file_folder + "gowalla")

            except FileNotFoundError as exc:

                print(
                    "Dataset_{}: Gowalla files not found, please download them and put them in this folder '{}', url: {}"
                    .format(self.DATASET_NAME, compressed_file_folder,
                            self.DATASET_URL))
                print(
                    "Dataset_{}: Uncompressed files not found, please manually decompress the *.gz files in this folder: '{}'"
                    .format(self.DATASET_NAME, compressed_file_folder))

                raise exc

            URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

            n_users = max(URM_train_original.shape[0], URM_test.shape[0])
            n_items = max(URM_train_original.shape[1], URM_test.shape[1])

            URM_train_original = sps.csr_matrix(URM_train_original,
                                                shape=(n_users, n_items))
            URM_test = sps.csr_matrix(URM_test, shape=(n_users, n_items))

            URM_train_original.data = np.ones_like(URM_train_original.data)
            URM_test.data = np.ones_like(URM_test.data)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users, n_cols=n_items)

            n_negative_samples = 999
            for user_index in range(len(dataset.testNegatives)):
                user_test_items = dataset.testNegatives[user_index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead {}".format(
                            user_index, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix().tocsr()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train_original.copy(), verbose=False)

            #
            #
            # # NOT USED
            # # elif not time_split: #create from full dataset with random leave one out from LINKED dateset in the article since timestamp is not present.
            # #
            # #     data_reader = GowallaGithubReader_DataManager()
            # #     loaded_dataset = data_reader.load_data()
            # #
            # #     URM_all = loaded_dataset.get_URM_all()
            # #
            # #     URM_all.eliminate_zeros()
            # #
            # #     URM_all.data = np.ones_like(URM_all.data)
            # #
            # #     #use this function 2 time because the order could change slightly the number of final interactions
            # #     #with this order we get the same number of interactions as in the paper
            # #     URM_all = filter_urm(URM_all, user_min_number_ratings=0, item_min_number_ratings=10)
            # #     URM_all = filter_urm(URM_all, user_min_number_ratings=2, item_min_number_ratings=0)
            # #
            # #     URM_train, URM_validation, URM_test, URM_negative = split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=999,
            # #                                                                                                                                          at_least_n_train_items_test=0, at_least_n_train_items_validation=0,
            # #                                                                                                                                          verbose=True)
            # #     URM_timestamp = sps.csc_matrix(([],([],[])), shape=URM_train.shape)
            #
            # else: # create from full dataset with leave out one time wise from ORIGINAL full dateset
            #     data_reader = GowallaReader_DataManager()
            #     loaded_dataset = data_reader.load_data()
            #
            #     URM_all = loaded_dataset.get_URM_all()
            #
            #     # use this function 2 time because the order could change slightly the number of final interactions
            #     # with this order we get the same number of interactions as in the paper
            #     URM_all = filter_urm(URM_all, user_min_number_ratings=0, item_min_number_ratings=10)
            #     URM_all = filter_urm(URM_all, user_min_number_ratings=2, item_min_number_ratings=0)
            #
            #     URM_timestamp = URM_all.copy()
            #     URM_all.data = np.ones_like(URM_all.data)
            #
            #     URM_train, URM_validation, URM_test, URM_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=999)
            #     URM_train = URM_train + URM_validation
            #     URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train, verbose=False)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
Example 14
def split_train_in_two_percentage_user_wise(URM_train,
                                            train_percentage=0.1,
                                            verbose=False):
    """
    The function splits an URM in two matrices, sampling the interactions to hold out independently for each user
    :param URM_train:
    :param train_percentage:
    :param verbose:
    :return:
    """

    assert train_percentage >= 0.0 and train_percentage <= 1.0, "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(
        train_percentage)

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    # Ensure CSR format, otherwise the indptr/indices access below will fail
    URM_train = URM_train.tocsr()

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users,
                                                n_cols=num_items,
                                                auto_create_col_mapper=False,
                                                auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(
        n_rows=num_users,
        n_cols=num_items,
        auto_create_col_mapper=False,
        auto_create_row_mapper=False)

    user_no_item_train = 0
    user_no_item_validation = 0

    for user_id in range(URM_train.shape[0]):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = round(user_profile_length * train_percentage)

        if n_train_items == len(user_profile_items) and n_train_items > 1:
            n_train_items -= 1

        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose: print("User {} has 0 train items".format(user_id))
            user_no_item_train += 1

        if len(validation_items) == 0:
            if verbose: print("User {} has 0 validation items".format(user_id))
            user_no_item_validation += 1

        URM_train_builder.add_data_lists([user_id] * len(train_items),
                                         train_items, train_ratings)
        URM_validation_builder.add_data_lists(
            [user_id] * len(validation_items), validation_items,
            validation_ratings)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(
            user_no_item_train, user_no_item_train / num_users * 100,
            num_users))
    if user_no_item_validation != 0:
        print(
            "Warning: {} ({:.2f} %) of {} users have no sampled items".format(
                user_no_item_validation,
                user_no_item_validation / num_users * 100, num_users))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
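
A usage sketch for the user-wise percentage split above (np, sps and URM_all as in the earlier sketches):

URM_train, URM_validation = split_train_in_two_percentage_user_wise(URM_all, train_percentage=0.8, verbose=False)

# Each user profile is split independently; the global proportion ends up close to 80/20
print("Train interactions: {} ({:.1f} %), validation interactions: {}".format(
    URM_train.nnz, URM_train.nnz / URM_all.nnz * 100, URM_validation.nnz))
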
Example 15
    def __init__(self, pre_splitted_path, original=True):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            if original:

                URM_path = 'Conferences/IJCAI/DMF_original/data_www/Amazon_ratings_Digital_Music_pruned.txt'
                #
                # dataFile = open(URM_path, "r")
                #
                # # textData = dataFile.readlines()
                # dataFile.close()
                #
                # u_map = {}
                # discarded = 0
                # for line in tqdm(textData):
                #     line = line.split(' ')
                #     u, i, rating, new_time = int(line[0]), int(line[1]), float(line[2]), int(line[3])
                #
                #     # convert u id and i id in integer starting from 0 and initialize u_map
                #     if u not in u_map:
                #         u_map[u] = {}
                #
                #     if i not in u_map[u]:
                #         u_map[u][i] = [rating, new_time]
                #     else:  # rating already exist, keep the most recent timestamp
                #         discarded += 1
                #         current_time = u_map[u][i][1]
                #         if new_time > current_time:
                #             u_map[u][i] = [rating, new_time]
                #
                # print('Merged {} interactions, kept the most recent timestamps'.format(discarded))
                #
                # UTM_builder = IncrementalSparseMatrix()
                # URM_builder = IncrementalSparseMatrix()
                #
                # for u in u_map:
                #     items, ratings, timestamps = [], [], []
                #     for i in u_map[u]:
                #         items.append(i)
                #         timestamps.append(u_map[u][i][1])
                #         ratings.append(u_map[u][i][0])
                #     UTM_builder.add_data_lists(row_list_to_add=np.full(len(items), int(u)), col_list_to_add=items, data_list_to_add=timestamps)
                #     URM_builder.add_data_lists(row_list_to_add=np.full(len(items), int(u)), col_list_to_add=items, data_list_to_add=ratings)
                #

                URM_rating_builder = IncrementalSparseMatrix(
                    auto_create_col_mapper=True, auto_create_row_mapper=True)
                URM_timestamp_builder = IncrementalSparseMatrix(
                    auto_create_col_mapper=True, auto_create_row_mapper=True)

                # URM_duplicate_assert_builder = IncrementalSparseMatrix( auto_create_col_mapper = True, auto_create_row_mapper = True)

                df_original = pd.read_csv(filepath_or_buffer=URM_path,
                                          sep=" ",
                                          header=None,
                                          dtype={
                                              0: int,
                                              1: int,
                                              2: float,
                                              3: int
                                          })

                df_original.columns = [
                    'userId', 'itemId', 'rating', 'timestamp'
                ]

                userId_list = df_original['userId'].values
                itemId_list = df_original['itemId'].values
                rating_list = df_original['rating'].values
                timestamp_list = df_original['timestamp'].values

                URM_rating_builder.add_data_lists(userId_list, itemId_list,
                                                  rating_list)
                URM_timestamp_builder.add_data_lists(userId_list, itemId_list,
                                                     timestamp_list)

                # URM_duplicate_assert_builder.add_data_lists(userId_list, itemId_list, np.ones_like(rating_list))
                # URM_duplicate_assert = URM_duplicate_assert_builder.get_SparseMatrix()
                #
                # assert np.all(URM_duplicate_assert.data == 1.0), "Duplicates detected"

                # Check if duplicates exist
                num_unique_user_item_ids = df_original.drop_duplicates(
                    ['userId', 'itemId'], keep='first', inplace=False).shape[0]
                assert num_unique_user_item_ids == len(
                    userId_list), "Duplicate (user, item) values found"

                URM_timestamp = URM_timestamp_builder.get_SparseMatrix()
                URM_all = URM_rating_builder.get_SparseMatrix()

                URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(
                    URM_all, URM_timestamp, negative_items_per_positive=99)

                # We want the validation to be sampled at random, not as the last interaction
                URM_train = URM_train + URM_validation
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_train, verbose=False)

            else:
                # Create from the full dataset with a leave-one-out time-wise split from the ORIGINAL full dataset
                data_reader = AmazonMusicReader_DataManager()
                loaded_dataset = data_reader.load_data()

                URM_all = loaded_dataset.get_URM_from_name("URM_all")
                URM_timestamp = loaded_dataset.get_URM_from_name(
                    "URM_timestamp")

                # Apply the filter twice: the order of the two filtering steps slightly changes the number of final interactions
                URM_all = filter_urm(URM_all,
                                     user_min_number_ratings=1,
                                     item_min_number_ratings=5)
                URM_all = filter_urm(URM_all,
                                     user_min_number_ratings=20,
                                     item_min_number_ratings=1)
                URM_timestamp = filter_urm(URM_timestamp,
                                           user_min_number_ratings=1,
                                           item_min_number_ratings=5)
                URM_timestamp = filter_urm(URM_timestamp,
                                           user_min_number_ratings=20,
                                           item_min_number_ratings=1)


                URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(
                    URM_all, URM_timestamp, negative_items_per_positive=99)

                # We want the validation to be sampled at random, not as the last interaction
                URM_train = URM_train + URM_validation
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_train, verbose=False)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
                "URM_timestamp": URM_timestamp,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        print_stat_datareader(self)
Example 16
    def __init__(self):

        test_percentage = 0.2
        validation_percentage = 0.2

        pre_splitted_path = "Data_manager_split_datasets/AmazonInstantVideo/RecSys/SpectralCF_our_interface/"
        pre_splitted_filename = "splitted_data"

        ratings_file_name = "ratings_Amazon_Instant_Video.csv"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print(
                "Dataset_AmazonInstantVideo: Attempting to load pre-splitted data"
            )

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_AmazonInstantVideo: Pre-splitted data not found, building new one"
            )

            folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

            downloadFromURL(self.DATASET_URL, folder_path, ratings_file_name)

            # read Amazon Instant Video
            df = pd.read_csv(folder_path + ratings_file_name,
                             sep=',',
                             header=None,
                             names=['user', 'item', 'rating',
                                    'timestamp'])[['user', 'item', 'rating']]

            URM_train_builder = IncrementalSparseMatrix(
                auto_create_col_mapper=True, auto_create_row_mapper=True)
            URM_train_builder.add_data_lists(df['user'].values,
                                             df['item'].values,
                                             df['rating'].values)
            URM_all = URM_train_builder.get_SparseMatrix()

            # Keep only ratings equal to 5 as positive interactions
            URM_all.data = URM_all.data == 5
            URM_all.eliminate_zeros()

            # keep only users with at least 5 ratings
            URM_all = ut.filter_urm(URM_all,
                                    user_min_number_ratings=5,
                                    item_min_number_ratings=1)

            # create train - test - validation

            URM_train_original, self.URM_test = split_train_validation_percentage_user_wise(
                URM_all, train_percentage=1 - test_percentage, verbose=False)

            self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
                URM_train_original,
                train_percentage=1 - validation_percentage,
                verbose=False)

            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

        print("Dataset_AmazonInstantVideo: Dataset loaded")

        ut.print_stat_datareader(self)
Example 17
    def __init__(self, pre_splitted_path, original=True):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            compressed_file_folder = "Conferences/IJCAI/ConvNCF_github/Data/"
            decompressed_file_folder = "Data_manager_split_datasets/Yelp/"

            # compressed_file = tarfile.open(compressed_file_folder + "yelp.test.negative.gz", "r:gz")
            # compressed_file.extract("yelp.test.negative", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "yelp.test.rating.gz", "r:gz")
            # compressed_file.extract("yelp.test.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "yelp.train.rating.gz", "r:gz")
            # compressed_file.extract("yelp.train.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()

            # if original:

            # Monkey-patch the original Dataset class so the test ratings file is loaded as a sparse matrix instead of a list
            Dataset_github.load_rating_file_as_list = Dataset_github.load_training_file_as_matrix

            try:
                dataset = Dataset_github(compressed_file_folder + "yelp")

            except FileNotFoundError as exc:

                print(
                    "Dataset_{}: Uncompressed files not found, please manually decompress the *.gz files in this folder: '{}'"
                    .format(self.DATASET_NAME, compressed_file_folder))

                raise exc

            URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

            n_users = max(URM_train_original.shape[0], URM_test.shape[0])
            n_items = max(URM_train_original.shape[1], URM_test.shape[1])

            URM_train_original = sps.csr_matrix(URM_train_original,
                                                shape=(n_users, n_items))
            URM_test = sps.csr_matrix(URM_test, shape=(n_users, n_items))

            URM_train_original.data = np.ones_like(URM_train_original.data)
            URM_test.data = np.ones_like(URM_test.data)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users, n_cols=n_items)

            n_negative_samples = 999
            for user_index in range(len(dataset.testNegatives)):
                user_test_items = dataset.testNegatives[user_index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead {}".format(
                            user_index, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train_original.copy(), verbose=False)

            #
            # else:
            #     data_reader = YelpReader_DataManager()
            #     loaded_dataset = data_reader.load_data()
            #
            #     URM_all = loaded_dataset.get_URM_all()
            #
            #     URM_timestamp = URM_all.copy()
            #
            #     URM_all.data = np.ones_like(URM_all.data)
            #
            #     URM_train, URM_validation, URM_test, URM_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=999)
            #     URM_train = URM_train + URM_validation
            #     URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train, verbose=False)

            shutil.rmtree(decompressed_file_folder + "decompressed/",
                          ignore_errors=True)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
Example 18
    def __init__(self):

        super(PinterestICCVReader, self).__init__()

        pre_splitted_path = "Data_manager_split_datasets/PinterestICCV/WWW/NeuMF_our_interface/"

        pre_splitted_filename = "splitted_data"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Dataset_Pinterest: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_Pinterest: Pre-splitted data not found, building new one"
            )

            # Ensure file is loaded as matrix
            Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

            dataset = Dataset_github(
                "Conferences/WWW/NeuMF_github/Data/pinterest-20")

            self.URM_train_original, self.URM_test = dataset.trainMatrix, dataset.testRatings

            self.URM_train_original = self.URM_train_original.tocsr()
            self.URM_test = self.URM_test.tocsr()

            from Base.Recommender_utils import reshapeSparse

            shape = (max(self.URM_train_original.shape[0],
                         self.URM_test.shape[0]),
                     max(self.URM_train_original.shape[1],
                         self.URM_test.shape[1]))

            self.URM_train_original = reshapeSparse(self.URM_train_original,
                                                    shape)
            self.URM_test = reshapeSparse(self.URM_test, shape)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=shape[0], n_cols=shape[1])

            for user_index in range(len(dataset.testNegatives)):

                user_test_items = dataset.testNegatives[user_index]

                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            self.URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()

            self.URM_train, self.URM_validation = split_train_validation_leave_one_out_user_wise(
                self.URM_train_original.copy())

            data_dict = {
                "URM_train_original": self.URM_train_original,
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_test_negative": self.URM_test_negative,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

        print("Dataset_Pinterest: Dataset loaded")

        print("N_items {}, n_users {}".format(self.URM_train.shape[1],
                                              self.URM_train.shape[0]))
Example 19
    def _loadURM(self):

        from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

        numCells = 0
        URM_builder = IncrementalSparseMatrix(auto_create_col_mapper=True,
                                              auto_create_row_mapper=True)

        for current_split in [1, 2, 3, 4]:

            current_split_path = self.dataFile.extract(
                "combined_data_{}.txt".format(current_split),
                path=self.decompressed_zip_file_folder + "decompressed/")

            fileHandle = open(current_split_path, "r")

            print("NetflixPrizeReader: loading split {}".format(current_split))

            currentMovie_id = None

            for line in fileHandle:

                if numCells % 1000000 == 0 and numCells != 0:
                    print("Processed {} cells".format(numCells))

                if len(line) > 1:

                    line_split = line.split(",")

                    # If line has 3 components, it is a 'user_id,rating,date' row
                    if len(line_split) == 3 and currentMovie_id is not None:

                        user_id = line_split[0]

                        URM_builder.add_data_lists([user_id],
                                                   [currentMovie_id],
                                                   [float(line_split[1])])

                        numCells += 1

                    # If line has 1 component, it MIGHT be an 'item_id:' row
                    elif len(line_split) == 1:
                        line_split = line.split(":")

                        # Confirm it is an 'item_id:' row
                        if len(line_split) == 2:
                            currentMovie_id = line_split[0]

                        else:
                            print("Unexpected row: '{}'".format(line))

                    else:
                        print("Unexpected row: '{}'".format(line))

            fileHandle.close()

            print("NetflixPrizeReader: cleaning temporary files")

            shutil.rmtree(self.decompressed_zip_file_folder + "decompressed/",
                          ignore_errors=True)

        return URM_builder.get_SparseMatrix(), URM_builder.get_column_token_to_id_mapper(), URM_builder.get_row_token_to_id_mapper()
def load_CSV_into_SparseBuilder(filePath, header=False, separator="::", timestamp=False, remove_duplicates=False,
                                custom_user_item_rating_columns=None):

    URM_all_builder = IncrementalSparseMatrix(auto_create_col_mapper = True, auto_create_row_mapper = True)
    URM_timestamp_builder = IncrementalSparseMatrix(auto_create_col_mapper = True, auto_create_row_mapper = True)

    if timestamp:
        dtype={0:str, 1:str, 2:float, 3:float}
        columns = ['userId', 'itemId', 'interaction', 'timestamp']

    else:
        dtype={0:str, 1:str, 2:float}
        columns = ['userId', 'itemId', 'interaction']

    df_original = pd.read_csv(filepath_or_buffer=filePath, sep=separator, header= 0 if header else None,
                    dtype=dtype, usecols=custom_user_item_rating_columns)

    # Rename the loaded columns; any additional columns in the original file should be excluded via custom_user_item_rating_columns
    df_original.columns = columns


    user_id_list = df_original['userId'].values
    item_id_list = df_original['itemId'].values
    interaction_list = df_original['interaction'].values

    # Check if duplicates exist
    num_unique_user_item_ids = df_original.drop_duplicates(['userId', 'itemId'], keep='first', inplace=False).shape[0]
    contains_duplicates_flag = num_unique_user_item_ids != len(user_id_list)

    if contains_duplicates_flag:
        if remove_duplicates:
            # # Remove duplicates.

            # This way of removing the duplicates, keeping the last timestamp without touching other columns,
            # would be the simplest, but it is so slow as to be unusable on any dataset but ML100k
            # idxs = df_original.groupby(by=['userId', 'itemId'], as_index=False)["timestamp"].idxmax()
            # df_original = df_original.loc[idxs]

            # Alternative faster way:
            # 1 - Sort in ascending order so that the last (bigger) timestamp is in the last position. Set Nan to be in the first position, to remove them if possible
            # 2 - Then remove duplicates for user-item keeping the last row, which will be the last timestamp.

            if timestamp:
                sort_by = ["userId", "itemId", "timestamp"]
            else:
                sort_by = ["userId", "itemId", 'interaction']

            df_original.sort_values(by=sort_by, ascending=True, inplace=True, kind="quicksort", na_position="first")
            df_original.drop_duplicates(["userId", "itemId"], keep='last', inplace=True)

            user_id_list = df_original['userId'].values
            item_id_list = df_original['itemId'].values
            interaction_list = df_original['interaction'].values

            assert num_unique_user_item_ids == len(user_id_list), "load_CSV_into_SparseBuilder: duplicate (user, item) values found"

        else:
            assert num_unique_user_item_ids == len(user_id_list), "load_CSV_into_SparseBuilder: duplicate (user, item) values found"




    URM_all_builder.add_data_lists(user_id_list, item_id_list, interaction_list)

    if timestamp:
        timestamp_list = df_original['timestamp'].values
        URM_timestamp_builder.add_data_lists(user_id_list, item_id_list, timestamp_list)

        return  URM_all_builder.get_SparseMatrix(), URM_timestamp_builder.get_SparseMatrix(), \
                URM_all_builder.get_column_token_to_id_mapper(), URM_all_builder.get_row_token_to_id_mapper()



    return  URM_all_builder.get_SparseMatrix(), \
            URM_all_builder.get_column_token_to_id_mapper(), URM_all_builder.get_row_token_to_id_mapper()
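
A usage sketch for the extended loader above. With timestamp=True it also returns the timestamp matrix, and remove_duplicates keeps only the most recent interaction of each (user, item) pair; the file path is only illustrative:

URM_all, URM_timestamp, item_original_ID_to_index, user_original_ID_to_index = load_CSV_into_SparseBuilder(
    "ml-1m/ratings.dat", header=False, separator="::", timestamp=True, remove_duplicates=True)

print("Loaded {} interactions for {} users and {} items".format(
    URM_all.nnz, len(user_original_ID_to_index), len(item_original_ID_to_index)))
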
Example 21
    def __init__(self, pre_splitted_path, type='original'):
        assert type in ["original", "ours"]

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            from Conferences.IJCAI.CoupledCF_original import LoadMovieDataCnn as DatareaderOriginal
            path = "Conferences/IJCAI/CoupledCF_original/ml-1m/"

            n_users, gender, age, occupation = DatareaderOriginal.load_user_attributes(
                path=path, split=True)
            n_items, items_genres_mat = DatareaderOriginal.load_itemGenres_as_matrix(
                path=path)
            ratings = DatareaderOriginal.load_rating_train_as_matrix(path=path)
            testRatings = DatareaderOriginal.load_rating_file_as_list(
                path=path)
            testNegatives = DatareaderOriginal.load_negative_file(path=path)

            URM_all = ratings.tocsr()

            UCM_gender = gender.tocsr()
            UCM_age = age.tocsr()
            UCM_occupation = occupation.tocsr()
            UCM_all = sps.hstack((UCM_gender, UCM_age, UCM_occupation)).tocsr()

            ICM_all = sps.csr_matrix(items_genres_mat)

            testRatings = np.array(testRatings).T
            URM_test_builder = IncrementalSparseMatrix(n_rows=n_users + 1,
                                                       n_cols=n_items + 1)
            URM_test_builder.add_data_lists(testRatings[0], testRatings[1],
                                            np.ones(len(testRatings[0])))

            URM_test = URM_test_builder.get_SparseMatrix()

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users + 1, n_cols=n_items + 1)

            # Careful: the test negatives list is 0-indexed, but position 0 refers to user index 1 (user indices start from 1)
            n_negative_samples = 99
            for index in range(len(testNegatives)):
                user_test_items = testNegatives[index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead {}".format(
                            index + 1, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(index + 1,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            if type == 'original':
                URM_test = URM_test    # keep the original test split unchanged
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_all.copy(), verbose=False)

            else:  # redo the split
                URM_full = URM_all + URM_test
                URM_temp, URM_test = split_train_validation_leave_one_out_user_wise(
                    URM_full.copy(), verbose=False)
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_temp.copy(), verbose=False)

            self.ICM_DICT = {
                "UCM_gender": UCM_gender,
                "UCM_occupation": UCM_occupation,
                "UCM_age": UCM_age,
                "UCM_all": UCM_all,
                "ICM_all": ICM_all,
            }

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
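
A hedged usage sketch: assuming the __init__ above belongs to a dataset reader class exposing URM_DICT and ICM_DICT as attributes (the class name and path below are placeholders), the pre-split matrices can be retrieved as follows.

# Hypothetical class name and data path for the reader whose __init__ is shown above
dataset_reader = Movielens1MReader("Data_manager_split_datasets/CoupledCF/", type="original")

URM_train = dataset_reader.URM_DICT["URM_train"]
URM_validation = dataset_reader.URM_DICT["URM_validation"]
URM_test = dataset_reader.URM_DICT["URM_test"]
URM_test_negative = dataset_reader.URM_DICT["URM_test_negative"]
UCM_all = dataset_reader.ICM_DICT["UCM_all"]

print("Train interactions: {}".format(URM_train.nnz))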
Example No. 22
def split_train_leave_k_out_user_wise(
    URM, k_out=1, use_validation_set=True, leave_random_out=True
):
    """
    The function splits an URM in two matrices selecting the k_out interactions one user at a time
    :param URM:
    :param k_out:
    :param use_validation_set:
    :param leave_random_out:
    :return:
    """

    assert k_out > 0, "k_out must be a value greater than 0, provided was '{}'".format(
        k_out
    )

    URM = sps.csr_matrix(URM)
    n_users, n_items = URM.shape

    URM_train_builder = IncrementalSparseMatrix(
        auto_create_row_mapper=False,
        n_rows=n_users,
        auto_create_col_mapper=False,
        n_cols=n_items,
    )

    URM_test_builder = IncrementalSparseMatrix(
        auto_create_row_mapper=False,
        n_rows=n_users,
        auto_create_col_mapper=False,
        n_cols=n_items,
    )

    if use_validation_set:
        URM_validation_builder = IncrementalSparseMatrix(
            auto_create_row_mapper=False,
            n_rows=n_users,
            auto_create_col_mapper=False,
            n_cols=n_items,
        )

    for user_id in range(n_users):

        start_user_position = URM.indptr[user_id]
        end_user_position = URM.indptr[user_id + 1]

        user_profile = URM.indices[start_user_position:end_user_position]

        if leave_random_out:
            indices_to_shuffle = np.arange(len(user_profile), dtype=int)

            np.random.shuffle(indices_to_shuffle)

            user_interaction_items = user_profile[indices_to_shuffle]
            user_interaction_data = URM.data[start_user_position:end_user_position][
                indices_to_shuffle
            ]

        else:

            # The held-out interactions are taken from the start of the arrays, so sort by descending
            # data value to place the largest (most recent) interaction first
            user_interaction_values = URM.data[start_user_position:end_user_position]

            sort_interaction_index = np.argsort(-user_interaction_values)

            user_interaction_items = user_profile[sort_interaction_index]
            user_interaction_data = URM.data[start_user_position:end_user_position][
                sort_interaction_index
            ]

        # Test interactions
        user_interaction_items_test = user_interaction_items[0:k_out]
        user_interaction_data_test = user_interaction_data[0:k_out]

        URM_test_builder.add_data_lists(
            [user_id] * len(user_interaction_items_test),
            user_interaction_items_test,
            user_interaction_data_test,
        )

        # validation interactions
        if use_validation_set:
            user_interaction_items_validation = user_interaction_items[
                k_out : k_out * 2
            ]
            user_interaction_data_validation = user_interaction_data[k_out : k_out * 2]

            URM_validation_builder.add_data_lists(
                [user_id] * k_out,
                user_interaction_items_validation,
                user_interaction_data_validation,
            )

        # Train interactions
        user_interaction_items_train = user_interaction_items[k_out * 2 :]
        user_interaction_data_train = user_interaction_data[k_out * 2 :]

        URM_train_builder.add_data_lists(
            [user_id] * len(user_interaction_items_train),
            user_interaction_items_train,
            user_interaction_data_train,
        )

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_test = URM_test_builder.get_SparseMatrix()

    URM_train = sps.csr_matrix(URM_train)
    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)

    if user_no_item_train != 0:
        print(
            "Warning: {} ({:.2f} %) of {} users have no Train items".format(
                user_no_item_train, user_no_item_train / n_users * 100, n_users
            )
        )

    if use_validation_set:
        URM_validation = URM_validation_builder.get_SparseMatrix()

        URM_validation = sps.csr_matrix(URM_validation)
        user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

        if user_no_item_validation != 0:
            print(
                "Warning: {} ({:.2f} %) of {} users have no Validation items".format(
                    user_no_item_validation,
                    user_no_item_validation / n_users * 100,
                    n_users,
                )
            )

        return URM_train, URM_validation, URM_test

    return URM_train, URM_test
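
A self-contained sketch showing how split_train_leave_k_out_user_wise might be called; the toy matrix and its sizes are illustrative only.

import numpy as np
import scipy.sparse as sps

# Toy implicit URM, only to illustrate the call (values are arbitrary)
URM_all = sps.random(100, 50, density=0.10, format="csr")
URM_all.data = np.ones_like(URM_all.data)

# Hold out 1 random interaction per user for test and 1 more for validation
URM_train, URM_validation, URM_test = split_train_leave_k_out_user_wise(
    URM_all, k_out=1, use_validation_set=True, leave_random_out=True)

# The three splits partition the original interactions
assert URM_train.nnz + URM_validation.nnz + URM_test.nnz == URM_all.nnz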
Example No. 23
    def __init__(self, pre_splitted_path, type='original'):
        assert type in ["original", "ours"]

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            from Conferences.IJCAI.CoupledCF_original import LoadTafengDataCnn as DatareaderOriginal
            path = "Conferences/IJCAI/CoupledCF_original/tafeng/"

            n_users, user_attributes_mat = DatareaderOriginal.load_user_attributes(
                path=path)
            n_items, items_genres_mat = DatareaderOriginal.load_itemGenres_as_matrix(
                path=path)
            ratings = DatareaderOriginal.load_rating_train_as_matrix(path=path)
            testRatings = DatareaderOriginal.load_rating_file_as_list(
                path=path)
            testNegatives = DatareaderOriginal.load_negative_file(path=path)

            URM_all = ratings.tocsr()

            UCM_all = sps.csc_matrix(user_attributes_mat)
            UCM_age = UCM_all[:, 0:11].tocsr()
            UCM_region = UCM_all[:, 11:19].tocsr()
            UCM_all = UCM_all.tocsr()

            # col: 0->category, 2->asset(0-1), 1->price(0-1)
            ICM_original = sps.csc_matrix(items_genres_mat)

            # The category column is expanded into a one-hot matrix rather than kept as a single column
            ICM_sub_class = ICM_original[:, 0:1].tocsr()
            n_icm_rows = ICM_sub_class.shape[0]
            rows, cols, data = [], [], []
            for idx in range(n_icm_rows):
                # only column 0 is present in this single-column slice
                data_vect = ICM_sub_class.data[
                    ICM_sub_class.indptr[idx]:ICM_sub_class.indptr[idx + 1]]
                if len(data_vect) == 0:
                    # category value 0 is not stored explicitly in a CSR matrix, treat the missing entry as category 0
                    cols.append(int(0))
                else:
                    cols.append(int(data_vect[0]))
                rows.append(idx)
                data.append(1.0)

            ICM_sub_class = sps.csr_matrix((data, (rows, cols)))
            ICM_asset = ICM_original[:, 1:2].tocsr()
            ICM_price = ICM_original[:, 2:3].tocsr()

            ICM_original = ICM_original.tocsc()
            ICM_all = sps.hstack((ICM_sub_class, ICM_asset, ICM_price))

            testRatings = np.array(testRatings).T
            URM_test_builder = IncrementalSparseMatrix(n_rows=n_users + 1,
                                                       n_cols=n_items + 1)
            URM_test_builder.add_data_lists(testRatings[0], testRatings[1],
                                            np.ones(len(testRatings[0])))
            URM_test = URM_test_builder.get_SparseMatrix()

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users + 1, n_cols=n_items + 1)

            # Careful: the test negatives list is 0-indexed, but position 0 refers to user index 1 (user indices start from 1)
            n_negative_samples = 99
            for index in range(len(testNegatives)):
                user_test_items = testNegatives[index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead {}".format(
                            index + 1, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(index + 1,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            if type == 'original':
                URM_test = URM_test    # keep the original test split unchanged
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_all.copy(), verbose=False)
            else:  # redo the split
                URM_full = URM_all + URM_test
                URM_temp, URM_test = split_train_validation_leave_one_out_user_wise(
                    URM_full.copy(), verbose=False)
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_temp.copy(), verbose=False)

            self.ICM_DICT = {
                "UCM_age": UCM_age,
                "UCM_region": UCM_region,
                "UCM_all": UCM_all,
                "ICM_all": ICM_all,
                "ICM_original": ICM_original,
                "ICM_sub_class": ICM_sub_class,
                "ICM_asset": ICM_asset,
                "ICM_price": ICM_price,
            }

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
Example No. 24
def split_train_validation_cold_start_user_wise(URM_train,
                                                full_train_percentage=0.0,
                                                cold_items=1,
                                                verbose=True):

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    # Ensure a CSR matrix is used, otherwise the indptr-based slicing below would fail
    URM_train = URM_train.tocsr()

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users,
                                                n_cols=num_items)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users,
                                                     n_cols=num_items)

    user_no_item_train = 0
    user_no_item_validation = 0

    # If we split twice (train-test and then train-validation) we could end up with users having no items in the second split.
    # To obtain a test set with enough non-empty users, sample the held-out users only among those with more than <cold_items> interactions
    nnz_per_row = URM_train.getnnz(axis=1)

    users_enough_items = np.where(nnz_per_row > cold_items)[0]
    users_no_enough_items = np.where(nnz_per_row <= cold_items)[0]

    np.random.shuffle(users_enough_items)

    n_train_users = round(len(users_enough_items) * full_train_percentage)

    print("Users enough items: {}".format(len(users_enough_items)))
    print("Users no enough items: {}".format(len(users_no_enough_items)))

    # Users kept entirely in train (no cold-start treatment)
    for user_id in np.concatenate(
        (users_enough_items[0:n_train_users], users_no_enough_items), axis=0):
        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]
        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)
        URM_train_builder.add_data_lists([user_id] * user_profile_length,
                                         user_profile_items,
                                         user_profile_ratings)

    # For the cold-start users, keep at most <cold_items> interactions in train and move the rest to validation
    for user_id in users_enough_items[n_train_users:]:

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = min(cold_items, user_profile_length)

        if n_train_items == len(user_profile_items) and n_train_items > 1:
            n_train_items -= 1

        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[
            indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[
            indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[
            indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose: print("User {} has 0 train items".format(user_id))
            user_no_item_train += 1

        if len(validation_items) == 0:
            if verbose: print("User {} has 0 validation items".format(user_id))
            user_no_item_validation += 1

        URM_train_builder.add_data_lists([user_id] * len(train_items),
                                         train_items, train_ratings)
        URM_validation_builder.add_data_lists(
            [user_id] * len(validation_items), validation_items,
            validation_ratings)

    if user_no_item_train != 0:
        print("Warning split: {} users with 0 train items ({} total users)".
              format(user_no_item_train, URM_train.shape[0]))
    if user_no_item_validation != 0:
        print(
            "Warning split: {} users with 0 validation items ({} total users)".
            format(user_no_item_validation, URM_train.shape[0]))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
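
A short, self-contained sketch of a possible call to split_train_validation_cold_start_user_wise; the toy matrix, the percentage and the cold_items value are illustrative assumptions.

import numpy as np
import scipy.sparse as sps

# Toy implicit URM, only to illustrate the call
URM_all = sps.random(1000, 200, density=0.05, format="csr")
URM_all.data = np.ones_like(URM_all.data)

# 80% of the users with enough interactions stay entirely in train; the remaining 20%
# become cold-start users with at most cold_items=2 interactions left in train
URM_train, URM_validation = split_train_validation_cold_start_user_wise(
    URM_all, full_train_percentage=0.8, cold_items=2, verbose=False)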
Example No. 25
def load_CSV_into_SparseBuilder(
    filePath,
    header=False,
    separator="::",
    timestamp=False,
    remove_duplicates=False,
    custom_user_item_rating_columns=None,
    create_mapper=True,
    preinitialized_row_mapper=None,
    preinitialized_col_mapper=None,
    on_new_col="add",
    on_new_row="add",
):
    """
    The function loads a CSV file into a URM
    :param filePath:
    :param header:      True/False the file does have a header
    :param separator:
    :param timestamp:   True/False load the timestamp as well
    :param remove_duplicates:   Remove row/column duplicates, if the timestamp is provided it kees the most recent one,
                                otherwise the highest rating or interaction value.
    :param custom_user_item_rating_columns:     Column names for the user_id, item_id and rating value as in the file header
    :param create_mapper:       True map the IDs into a new interger value, False use the original value
    :param preinitialized_row_mapper:      Dictionary {originalID: matrix index}  to translate rowIDs into row indices (e.g., userID into user index)
    :param preinitialized_col_mapper:      Dictionary {originalID: matrix index} to translate rowIDs into row indices (e.g., ItemID into item index)
    :return:
    """

    if preinitialized_row_mapper is not None or preinitialized_col_mapper is not None:
        URM_all_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=preinitialized_col_mapper,
            preinitialized_row_mapper=preinitialized_row_mapper,
            on_new_col=on_new_col,
            on_new_row=on_new_row,
        )
        URM_timestamp_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=preinitialized_col_mapper,
            preinitialized_row_mapper=preinitialized_row_mapper,
            on_new_col=on_new_col,
            on_new_row=on_new_row,
        )

    else:
        URM_all_builder = IncrementalSparseMatrix(
            auto_create_col_mapper=create_mapper,
            auto_create_row_mapper=create_mapper)
        URM_timestamp_builder = IncrementalSparseMatrix(
            auto_create_col_mapper=create_mapper,
            auto_create_row_mapper=create_mapper)

    if timestamp:
        dtype = {0: str, 1: str, 2: float, 3: float}
        columns = ["userId", "itemId", "interaction", "timestamp"]

    else:
        dtype = {0: str, 1: str, 2: float}
        columns = ["userId", "itemId", "interaction"]

    df_original = pd.read_csv(
        filepath_or_buffer=filePath,
        sep=separator,
        header=0 if header else None,
        dtype=dtype,
        usecols=custom_user_item_rating_columns,
    )

    # If the original file has more columns, keep them but ignore them
    df_original.columns = columns

    user_id_list = df_original["userId"].values
    item_id_list = df_original["itemId"].values
    interaction_list = df_original["interaction"].values

    # Check if duplicates exist
    num_unique_user_item_ids = df_original.drop_duplicates(
        ["userId", "itemId"], keep="first", inplace=False).shape[0]
    contains_duplicates_flag = num_unique_user_item_ids != len(user_id_list)

    if contains_duplicates_flag:
        if remove_duplicates:
            # Remove duplicates.

            # Removing duplicates by keeping the row with the latest timestamp, without dropping other columns,
            # would be the simplest approach, but it is so slow as to be unusable on any dataset larger than ML100k
            # idxs = df_original.groupby(by=['userId', 'itemId'], as_index=False)["timestamp"].idxmax()
            # df_original = df_original.loc[idxs]

            # Alternative, faster approach:
            # 1 - Sort in ascending order so that the largest timestamp ends up in the last position. Put NaN first, so those rows are the ones dropped if possible
            # 2 - Drop duplicates on (userId, itemId) keeping the last row, which carries the latest timestamp.

            if timestamp:
                sort_by = ["userId", "itemId", "timestamp"]
            else:
                sort_by = ["userId", "itemId", "interaction"]

            df_original.sort_values(
                by=sort_by,
                ascending=True,
                inplace=True,
                kind="quicksort",
                na_position="first",
            )
            df_original.drop_duplicates(["userId", "itemId"],
                                        keep="last",
                                        inplace=True)

            user_id_list = df_original["userId"].values
            item_id_list = df_original["itemId"].values
            interaction_list = df_original["interaction"].values

            assert num_unique_user_item_ids == len(
                user_id_list
            ), "load_CSV_into_SparseBuilder: duplicate (user, item) values found"

        else:
            assert num_unique_user_item_ids == len(
                user_id_list
            ), "load_CSV_into_SparseBuilder: duplicate (user, item) values found"

    URM_all_builder.add_data_lists(user_id_list, item_id_list,
                                   interaction_list)

    if timestamp:
        timestamp_list = df_original["timestamp"].values
        URM_timestamp_builder.add_data_lists(user_id_list, item_id_list,
                                             timestamp_list)

        return (
            URM_all_builder.get_SparseMatrix(),
            URM_timestamp_builder.get_SparseMatrix(),
            URM_all_builder.get_column_token_to_id_mapper(),
            URM_all_builder.get_row_token_to_id_mapper(),
        )

    return (
        URM_all_builder.get_SparseMatrix(),
        URM_all_builder.get_column_token_to_id_mapper(),
        URM_all_builder.get_row_token_to_id_mapper(),
    )
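
A minimal usage sketch for load_CSV_into_SparseBuilder; the MovieLens-style path and the '::' separator are assumptions for illustration, not part of the function above.

# Hypothetical MovieLens-1M style file with four columns: userId::itemId::rating::timestamp
URM_all, URM_timestamp, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
    "ml-1m/ratings.dat",
    separator="::",
    header=False,
    timestamp=True,
    remove_duplicates=True)

print("Users: {}, Items: {}".format(len(user_mapper), len(item_mapper)))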
Example No. 26
    def __init__(self, pre_splitted_path):

        super(Movielens1MReader, self).__init__()

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Dataset_Movielens1M: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_Movielens1M: Pre-splitted data not found, building new one"
            )

            # Monkey-patch the original reader so the test ratings file is loaded as a sparse matrix instead of a list
            Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

            dataset = Dataset_github("Conferences/WWW/NeuMF_github/Data/ml-1m")

            URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

            URM_train_original = URM_train_original.tocsr()
            URM_test = URM_test.tocsr()

            from Base.Recommender_utils import reshapeSparse

            shape = (max(URM_train_original.shape[0], URM_test.shape[0]),
                     max(URM_train_original.shape[1], URM_test.shape[1]))

            URM_train_original = reshapeSparse(URM_train_original, shape)
            URM_test = reshapeSparse(URM_test, shape)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=shape[0], n_cols=shape[1])

            for user_index in range(len(dataset.testNegatives)):

                user_test_items = dataset.testNegatives[user_index]

                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()

            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train_original.copy())

            self.URM_DICT = {
                "URM_train_original": URM_train_original,
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_test_negative": URM_test_negative,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("Dataset_Movielens1M: Dataset loaded")
Example No. 27
    def __init__(self, pre_splitted_path, type="original"):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            if type == "original":

                # Monkey-patch the original reader so the test ratings file is loaded as a sparse matrix instead of a list
                Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

                dataset = Dataset_github(
                    "Conferences/IJCAI/DELF_original/Data/ml-1m")

                URM_train, URM_validation, URM_test, testNegatives = dataset.trainMatrix, dataset.validRatings, \
                                                                     dataset.testRatings, dataset.testNegatives

                URM_train = URM_train.tocsr()
                URM_validation = URM_validation.tocsr()
                URM_test = URM_test.tocsr()
                URM_timestamp = "no"

                from Base.Recommender_utils import reshapeSparse

                shape = (max(URM_train.shape[0], URM_validation.shape[0],
                             URM_test.shape[0]),
                         max(URM_train.shape[1], URM_validation.shape[1],
                             URM_test.shape[1]))

                URM_train = reshapeSparse(URM_train, shape)
                URM_validation = reshapeSparse(URM_validation, shape)
                URM_test = reshapeSparse(URM_test, shape)

                URM_test_negatives_builder = IncrementalSparseMatrix(
                    n_rows=shape[0], n_cols=shape[1])

                for user_index in range(len(dataset.testNegatives)):

                    user_test_items = dataset.testNegatives[user_index]

                    URM_test_negatives_builder.add_single_row(user_index,
                                                              user_test_items,
                                                              data=1.0)

                URM_test_negative = URM_test_negatives_builder.get_SparseMatrix(
                )

            elif type == "ours":

                # Create the split from the ORIGINAL full dataset with a time-wise leave-one-out
                data_reader = Movielens1MReader_DataManager()
                loaded_dataset = data_reader.load_data()

                URM_all = loaded_dataset.get_URM_from_name("URM_all")
                URM_timestamp = loaded_dataset.get_URM_from_name(
                    "URM_timestamp")

                # make rating implicit
                URM_all.data = np.ones_like(URM_all.data)

                URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(
                    URM_all, URM_timestamp, negative_items_per_positive=99)

            else:
                assert False, "type must be either 'original' or 'ours'"

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
                "URM_timestamp": URM_timestamp,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        print_stat_datareader(self)
Example No. 28
    def __init__(self):

        super(Movielens100KReader, self).__init__()


        pre_splitted_path = "Data_manager_split_datasets/Movielens100K/KDD/MCRec_our_interface/"

        pre_splitted_filename = "splitted_data"

        original_data_path = "Conferences/KDD/MCRec_github/data/"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Movielens100KReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
                 self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("Movielens100KReader: Pre-splitted data not found, building new one")

            print("Movielens100KReader: loading URM")


            from Conferences.KDD.MCRec_github.code.Dataset import Dataset

            dataset_name = 'ml-100k'

            dataset = Dataset(original_data_path + dataset_name)
            URM_train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives

            # Dataset adds 1 to user and item id, removing it to restore 0 indexing
            URM_train = sps.coo_matrix(URM_train)
            URM_train.row -= 1
            URM_train.col -= 1

            self.URM_train = sps.csr_matrix((np.ones_like(URM_train.data), (URM_train.row, URM_train.col)))


            num_users, num_items = self.URM_train.shape



            # Build sparse matrices from lists
            URM_test_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
            URM_test_negative_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)


            for user_index in range(len(testRatings)):

                user_id = testRatings[user_index][0]
                current_user_test_items = testRatings[user_index][1:]
                current_user_test_negative_items = testNegatives[user_index]

                current_user_test_items = np.array(current_user_test_items) - 1
                current_user_test_negative_items = np.array(current_user_test_negative_items) - 1

                URM_test_builder.add_single_row(user_id - 1, current_user_test_items, 1.0)
                URM_test_negative_builder.add_single_row(user_id - 1, current_user_test_negative_items, 1.0)



            # The test data apparently contains repeated entries
            self.URM_test = URM_test_builder.get_SparseMatrix()

            self.URM_test_negative = URM_test_negative_builder.get_SparseMatrix()


            # Hold out 10% of each user's train interactions as validation
            from Data_manager.split_functions.split_train_validation import split_train_validation_percentage_user_wise

            self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(self.URM_train, train_percentage=0.9)


            # Load features

            data_reader = Movielens100KReader_DataManager()
            data_reader.load_data()

            zipFile_path = data_reader.DATASET_SPLIT_ROOT_FOLDER + data_reader.DATASET_SUBFOLDER
            dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")

            ICM_path = dataFile.extract("ml-100k/u.item", path=zipFile_path + "decompressed/")

            ICM_genre = self._loadICM(ICM_path)
            ICM_genre = ICM_genre.get_SparseMatrix()

            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

            self.ICM_dict = {"ICM_genre": ICM_genre}


            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
                "URM_test_negative": self.URM_test_negative,
                "ICM_dict": self.ICM_dict,

            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("Movielens100KReader: loading complete")
def split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=100):

    URM_all = sps.csr_matrix(URM_all)
    URM_timestamp = sps.csr_matrix(URM_timestamp)

    n_rows, n_cols = URM_all.shape


    URM_train_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)
    URM_test_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)
    URM_negative_builder = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)

    all_items = np.arange(0, n_cols, dtype=int)


    for user_index in range(URM_all.shape[0]):

        if user_index % 10000 == 0:
            print("split_data_on_sequence: user {} of {}".format(user_index, URM_all.shape[0]))

        start_pos = URM_all.indptr[user_index]
        end_pos = URM_all.indptr[user_index+1]

        user_profile = URM_all.indices[start_pos:end_pos]
        user_data = URM_all.data[start_pos:end_pos]
        user_sequence = URM_timestamp.data[start_pos:end_pos]


        unobserved_index = np.in1d(all_items, user_profile, assume_unique=True, invert=True)

        unobserved_items = all_items[unobserved_index]
        np.random.shuffle(unobserved_items)

        URM_negative_builder.add_single_row(user_index, unobserved_items[:negative_items_per_positive], 1.0)


        if len(user_profile) >= 3:



            # The test set gets the most recent interaction, the validation set the second most recent
            most_recent_pos = np.argmax(user_sequence)

            item_index = user_profile[most_recent_pos]
            item_data = user_data[most_recent_pos]

            URM_test_builder.add_data_lists([user_index], [item_index], [item_data])

            user_profile = np.delete(user_profile, most_recent_pos)
            user_data = np.delete(user_data, most_recent_pos)
            user_sequence = np.delete(user_sequence, most_recent_pos)


            most_recent_pos = np.argmax(user_sequence)

            item_index = user_profile[most_recent_pos]
            item_data = user_data[most_recent_pos]

            URM_validation_builder.add_data_lists([user_index], [item_index], [item_data])

            user_profile = np.delete(user_profile, most_recent_pos)
            user_data = np.delete(user_data, most_recent_pos)
            #user_sequence = np.delete(user_sequence, most_recent_pos)


            URM_train_builder.add_data_lists([user_index]*len(user_profile), user_profile, user_data)


    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()
    URM_test = URM_test_builder.get_SparseMatrix()
    URM_negative = URM_negative_builder.get_SparseMatrix()



    return URM_train, URM_validation, URM_test, URM_negative
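
A self-contained sketch of split_data_on_timestamp on a tiny hand-made example; the matrices and the number of negatives per user are illustrative only.

import numpy as np
import scipy.sparse as sps

# Tiny hand-made URM: 3 users x 6 items, with a fake increasing timestamp per interaction
URM_all = sps.csr_matrix(np.array([[1, 1, 1, 0, 0, 1],
                                   [0, 1, 1, 1, 0, 0],
                                   [1, 0, 1, 1, 1, 0]]))
URM_timestamp = URM_all.copy().astype(np.float64)
URM_timestamp.data = np.arange(1, URM_timestamp.nnz + 1, dtype=np.float64)

URM_train, URM_validation, URM_test, URM_negative = split_data_on_timestamp(
    URM_all, URM_timestamp, negative_items_per_positive=2)

# Each user with at least 3 interactions contributes one test and one validation interaction
print(URM_test.nnz, URM_validation.nnz)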