Example #1
from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

def load_CSV_into_SparseBuilder(filePath, header=False, separator="::"):

    matrixBuilder = IncrementalSparseMatrix(auto_create_col_mapper=True,
                                            auto_create_row_mapper=True)

    fileHandle = open(filePath, "r")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if numCells % 1000000 == 0:
            print("Processed {} cells".format(numCells))

        if len(line) > 1:
            line = line.split(separator)

            line[-1] = line[-1].replace("\n", "")

            try:
                user_id = line[0]
                item_id = line[1]

                try:
                    value = float(line[2])

                    if value != 0.0:

                        matrixBuilder.add_data_lists([user_id], [item_id],
                                                     [value])

                except ValueError:
                    print("load_CSV_into_SparseBuilder: Cannot parse value '{}' as float".format(line[2]))

            except IndexError:
                print("load_CSV_into_SparseBuilder: Index out of bounds in line '{}'".format(line))

    fileHandle.close()

    return matrixBuilder.get_SparseMatrix(), \
           matrixBuilder.get_column_token_to_id_mapper(), \
           matrixBuilder.get_row_token_to_id_mapper()
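A minimal usage sketch, assuming a hypothetical MovieLens-style file "ml-1m/ratings.dat" with "user::item::rating" rows (the path and layout are illustrative, not part of the original):

# Hypothetical input file; the function returns the matrix plus the item and user token-to-id mappers.
URM_all, item_token_to_id, user_token_to_id = load_CSV_into_SparseBuilder(
    "ml-1m/ratings.dat", header=False, separator="::")
print("Loaded URM with {} users and {} items".format(URM_all.shape[0], URM_all.shape[1]))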
Example #2
    def _loadURM(self):

        import shutil
        from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

        numCells = 0
        URM_builder = IncrementalSparseMatrix(auto_create_col_mapper=True,
                                              auto_create_row_mapper=True)

        for current_split in [1, 2, 3, 4]:

            current_split_path = self.dataFile.extract(
                "combined_data_{}.txt".format(current_split),
                path=self.decompressed_zip_file_folder + "decompressed/")

            fileHandle = open(current_split_path, "r")

            print("NetflixPrizeReader: loading split {}".format(current_split))

            currentMovie_id = None

            for line in fileHandle:

                if numCells % 1000000 == 0 and numCells != 0:
                    print("Processed {} cells".format(numCells))

                if len(line) > 1:

                    line_split = line.split(",")

                    # If the line has 3 components, it is a 'user_id,rating,date' row
                    if len(line_split) == 3 and currentMovie_id is not None:

                        user_id = line_split[0]

                        URM_builder.add_data_lists([user_id],
                                                   [currentMovie_id],
                                                   [float(line_split[1])])

                        numCells += 1

                    # If the line has 1 component, it might be an 'item_id:' row
                    elif len(line_split) == 1:
                        line_split = line.split(":")

                        # Confirm it is an 'item_id:' row
                        if len(line_split) == 2:
                            currentMovie_id = line_split[0]

                        else:
                            print("Unexpected row: '{}'".format(line))

                    else:
                        print("Unexpected row: '{}'".format(line))

            fileHandle.close()

            print("NetflixPrizeReader: cleaning temporary files")

            shutil.rmtree(self.decompressed_zip_file_folder + "decompressed/",
                          ignore_errors=True)

        return URM_builder.get_SparseMatrix(), \
               URM_builder.get_column_token_to_id_mapper(), \
               URM_builder.get_row_token_to_id_mapper()
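For reference, each combined_data_{N}.txt file interleaves a movie header line ("movie_id:") with the rating rows belonging to that movie ("user_id,rating,date"), which is what the two parsing branches above distinguish. An illustrative fragment (values invented, not taken from the dataset):

1:
822109,5,2005-05-13
885013,4,2005-10-19
2:
893988,3,2005-11-17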
Example #3
import pandas as pd
from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

def load_CSV_into_SparseBuilder(filePath, header=False, separator="::", timestamp=False, remove_duplicates=False,
                                custom_user_item_rating_columns=None):

    URM_all_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)
    URM_timestamp_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)

    if timestamp:
        dtype = {0: str, 1: str, 2: float, 3: float}
        columns = ['userId', 'itemId', 'interaction', 'timestamp']

    else:
        dtype = {0: str, 1: str, 2: float}
        columns = ['userId', 'itemId', 'interaction']

    df_original = pd.read_csv(filepath_or_buffer=filePath, sep=separator, header=0 if header else None,
                              dtype=dtype, usecols=custom_user_item_rating_columns)

    # Rename the loaded columns to the standard names (when usecols selects a subset, the remaining file columns are ignored)
    df_original.columns = columns


    user_id_list = df_original['userId'].values
    item_id_list = df_original['itemId'].values
    interaction_list = df_original['interaction'].values

    # Check if duplicates exist
    num_unique_user_item_ids = df_original.drop_duplicates(['userId', 'itemId'], keep='first', inplace=False).shape[0]
    contains_duplicates_flag = num_unique_user_item_ids != len(user_id_list)

    if contains_duplicates_flag:
        if remove_duplicates:
            # Remove duplicates.

            # Keeping the row with the largest timestamp via groupby + idxmax would be the simplest
            # way to do this without touching the other columns, but it is so slow as to be unusable
            # on any dataset larger than ML100k:
            # idxs = df_original.groupby(by=['userId', 'itemId'], as_index=False)["timestamp"].idxmax()
            # df_original = df_original.loc[idxs]

            # Faster alternative:
            # 1 - Sort in ascending order so that the largest timestamp ends up last; put NaNs first so they are dropped in favour of valid rows.
            # 2 - Drop duplicates on (user, item) keeping the last row, which holds the largest timestamp.

            if timestamp:
                sort_by = ["userId", "itemId", "timestamp"]
            else:
                sort_by = ["userId", "itemId", 'interaction']

            df_original.sort_values(by=sort_by, ascending=True, inplace=True, kind="quicksort", na_position="first")
            df_original.drop_duplicates(["userId", "itemId"], keep='last', inplace=True)

            user_id_list = df_original['userId'].values
            item_id_list = df_original['itemId'].values
            interaction_list = df_original['interaction'].values

            assert num_unique_user_item_ids == len(user_id_list), "load_CSV_into_SparseBuilder: duplicate (user, item) values are still present after removal"

        else:
            assert num_unique_user_item_ids == len(user_id_list), "load_CSV_into_SparseBuilder: duplicate (user, item) values found"

    URM_all_builder.add_data_lists(user_id_list, item_id_list, interaction_list)

    if timestamp:
        timestamp_list = df_original['timestamp'].values
        URM_timestamp_builder.add_data_lists(user_id_list, item_id_list, timestamp_list)

        return URM_all_builder.get_SparseMatrix(), URM_timestamp_builder.get_SparseMatrix(), \
               URM_all_builder.get_column_token_to_id_mapper(), URM_all_builder.get_row_token_to_id_mapper()

    return URM_all_builder.get_SparseMatrix(), \
           URM_all_builder.get_column_token_to_id_mapper(), URM_all_builder.get_row_token_to_id_mapper()
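A minimal usage sketch for this variant, assuming a hypothetical comma-separated "ratings.csv" with a header row and user, item, rating, timestamp columns (the file name and layout are illustrative, not part of the original):

# Hypothetical input file; with timestamp=True the function returns two matrices and the two mappers.
URM_all, URM_timestamp, item_token_to_id, user_token_to_id = load_CSV_into_SparseBuilder(
    "ratings.csv", header=True, separator=",", timestamp=True, remove_duplicates=True)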