Example No. 1
def _loadICM_tags(tags_path,
                  header=True,
                  separator=',',
                  if_new_item="ignore",
                  item_original_ID_to_index=None,
                  preinitialized_col_mapper=None):

    # Tags
    from Data_manager.TagPreprocessing import tagFilterAndStemming

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    ICM_builder = IncrementalSparseMatrix_FilterIDs(
        preinitialized_col_mapper=preinitialized_col_mapper,
        on_new_col="add",
        preinitialized_row_mapper=item_original_ID_to_index,
        on_new_row=if_new_item)

    fileHandle = open(tags_path, "r", encoding="latin1")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if numCells % 100000 == 0:
            print("Processed {} cells".format(numCells))

        if len(line) > 1:
            line = line.split(separator)

            line[-1] = line[-1].replace("\n", "")

            # The movie ID is in the second column, the tag string in the third
            movie_id = line[1]

            tagList = line[2]

            # Remove non-alphabetical characters and split on spaces
            tagList = tagFilterAndStemming(tagList)

            # Rows are movie IDs, columns are tag features
            ICM_builder.add_single_row(movie_id, tagList, data=1.0)

    fileHandle.close()

    return ICM_builder.get_SparseMatrix(), \
           ICM_builder.get_column_token_to_id_mapper(), \
           ICM_builder.get_row_token_to_id_mapper()
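
A minimal usage sketch (the file path and the pre-built item_original_ID_to_index mapper are assumptions for illustration, not part of the original snippet):

# Hypothetical call on a MovieLens-style tags file (userId,movieId,tag,...),
# reusing an item mapper built while loading the URM so indices stay aligned.
ICM_tags, tag_to_index, item_to_index = _loadICM_tags(
    "ml-20m/tags.csv",
    header=True,
    separator=",",
    if_new_item="ignore",
    item_original_ID_to_index=item_original_ID_to_index)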
Example No. 2
def _loadURM_preinitialized_item_id(filePath,
                                    header=False,
                                    separator="::",
                                    if_new_user="add",
                                    if_new_item="ignore",
                                    item_original_ID_to_index=None,
                                    user_original_ID_to_index=None):

    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    URM_builder = IncrementalSparseMatrix_FilterIDs(
        preinitialized_col_mapper=item_original_ID_to_index,
        on_new_col=if_new_item,
        preinitialized_row_mapper=user_original_ID_to_index,
        on_new_row=if_new_user)

    fileHandle = open(filePath, "r")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if numCells % 1000000 == 0:
            print("Processed {} cells".format(numCells))

        if len(line) > 1:
            line = line.split(separator)

            line[-1] = line[-1].replace("\n", "")

            user_id = line[0]
            item_id = line[1]

            try:
                value = float(line[2])

                if value != 0.0:
                    URM_builder.add_data_lists([user_id], [item_id], [value])

            except ValueError:
                # Skip lines with a non-numeric rating value
                pass

    fileHandle.close()

    return URM_builder.get_SparseMatrix(), \
           URM_builder.get_column_token_to_id_mapper(), \
           URM_builder.get_row_token_to_id_mapper()
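
A hypothetical invocation on a MovieLens-1M style ratings.dat (user::item::rating::timestamp); the file path and the reuse of a previously built item mapper are assumptions:

# Keep only items already in the ICM ("ignore" new items), add every new user.
URM_all, item_mapper, user_mapper = _loadURM_preinitialized_item_id(
    "ml-1m/ratings.dat",
    header=False,
    separator="::",
    if_new_user="add",
    if_new_item="ignore",
    item_original_ID_to_index=item_original_ID_to_index,
    user_original_ID_to_index=None)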
Example No. 3
def _loadURM_preinitialized_item_id(URM_path,
                                    header=False,
                                    separator=",",
                                    if_new_user="add",
                                    if_new_item="add",
                                    item_original_ID_to_index=None,
                                    user_original_ID_to_index=None):
    import pandas as pd
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    URM_all_builder = IncrementalSparseMatrix_FilterIDs(
        preinitialized_col_mapper=item_original_ID_to_index,
        on_new_col=if_new_item,
        preinitialized_row_mapper=user_original_ID_to_index,
        on_new_row=if_new_user)

    if header:
        df_original = pd.read_csv(filepath_or_buffer=URM_path,
                                  sep=separator,
                                  header=0,
                                  usecols=['user', 'item', 'rating'],
                                  dtype={
                                      'user': str,
                                      'item': str,
                                      'rating': float
                                  })
    else:
        df_original = pd.read_csv(filepath_or_buffer=URM_path,
                                  sep=separator,
                                  header=None,
                                  dtype={
                                      0: str,
                                      1: str,
                                      2: float
                                  })

        df_original.columns = ['user', 'item', 'rating']

    # Rows with an invalid (zero) rating could be dropped here if needed:
    # df_original.drop(df_original[df_original.rating == 0.0].index, inplace=True)

    user_id_list = df_original['user'].values
    item_id_list = df_original['item'].values
    rating_list = df_original['rating'].values

    URM_all_builder.add_data_lists(user_id_list, item_id_list, rating_list)

    return URM_all_builder.get_SparseMatrix(), \
           URM_all_builder.get_column_token_to_id_mapper(), \
           URM_all_builder.get_row_token_to_id_mapper()
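
Unlike Example No. 2, this variant loads the whole file with pandas and adds all (user, item, rating) triplets in a single add_data_lists call, avoiding the per-line Python loop. A hypothetical invocation (file path assumed):

URM_all, item_mapper, user_mapper = _loadURM_preinitialized_item_id(
    "ratings.csv",
    header=True,
    separator=",",
    item_original_ID_to_index=item_original_ID_to_index,
    user_original_ID_to_index=None)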
Example No. 4
def _loadICM_genres(genres_path,
                    header=True,
                    separator=',',
                    genresSeparator="|"):

    # Genres
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    ICM_builder = IncrementalSparseMatrix_FilterIDs(
        preinitialized_col_mapper=None,
        on_new_col="add",
        preinitialized_row_mapper=None,
        on_new_row="add")

    fileHandle = open(genres_path, "r", encoding="latin1")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if numCells % 1000000 == 0:
            print("Processed {} cells".format(numCells))

        if len(line) > 1:
            line = line.split(separator)

            line[-1] = line[-1].replace("\n", "")

            movie_id = line[0]

            title = line[1]
            # In case the title contains commas it is enclosed in "...",
            # so the genre list will always be the last element
            genreList = line[-1]

            genreList = genreList.split(genresSeparator)

            # Rows are movie IDs, columns are genre features
            ICM_builder.add_single_row(movie_id, genreList, data=1.0)

    fileHandle.close()

    return ICM_builder.get_SparseMatrix(), \
           ICM_builder.get_column_token_to_id_mapper(), \
           ICM_builder.get_row_token_to_id_mapper()
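
A usage sketch on a MovieLens-style movies.csv (movieId,title,genres with "|"-separated genres); the file path is an assumption:

ICM_genres, genre_to_index, movie_to_index = _loadICM_genres(
    "ml-20m/movies.csv",
    header=True,
    separator=",",
    genresSeparator="|")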
Example No. 5
    def _loadURM(self,
                 file_name,
                 header=False,
                 separator=" ",
                 item_original_ID_to_index=None,
                 user_original_ID_to_index=None):

        # IncrementalSparseMatrix_FilterIDs is assumed to be imported at module level
        URM_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=item_original_ID_to_index,
            on_new_col="add",
            preinitialized_row_mapper=user_original_ID_to_index,
            on_new_row="add")

        fileHandle = open(file_name, "r")
        numCells = 0

        if header:
            fileHandle.readline()

        for line in fileHandle:

            if numCells % 100000 == 0 and numCells != 0:
                print("Processed {} cells".format(numCells))

            line = line.split(separator)
            if len(line) > 1:
                if line[0] == '-1':
                    # Negative interactions are skipped
                    numCells += 1
                    continue
                elif line[0] == '1':
                    item = int(line[2].split(':')[0])
                    user = int(line[1].split(':')[0])
                    value = 1.0
                else:
                    print('ERROR READING DATASET')
                    break

                numCells += 1

                URM_builder.add_data_lists([user], [item], [value])

        fileHandle.close()

        return URM_builder.get_SparseMatrix(), \
               URM_builder.get_column_token_to_id_mapper(), \
               URM_builder.get_row_token_to_id_mapper()
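
The line format this parser expects is inferred from the code rather than documented in the source: a label (1 for a positive interaction, -1 for a skipped one) followed by user and item fields in "id:value" form. A tiny illustration of the parsing step:

line = "1 42:1 1337:1".split(" ")
user = int(line[1].split(':')[0])   # 42
item = int(line[2].split(':')[0])   # 1337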
Example No. 6
def _loadUCM(UCM_path, header=True, separator=','):

    # User features
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

    UCM_builder = IncrementalSparseMatrix_FilterIDs(
        preinitialized_col_mapper=None,
        on_new_col="add",
        preinitialized_row_mapper=None,
        on_new_row="add")

    fileHandle = open(UCM_path, "r", encoding="latin1")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if numCells % 1000000 == 0:
            print("Processed {} rows".format(numCells))

        if len(line) > 1:
            line = line.split(separator)

            line[-1] = line[-1].replace("\n", "")

            user_id = line[0]

            token_list = []
            token_list.append("gender_" + str(line[1]))
            token_list.append("age_group_" + str(line[2]))
            token_list.append("occupation_" + str(line[3]))
            token_list.append("zip_code_" + str(line[4]))

            # Rows are user IDs, columns are user features
            UCM_builder.add_single_row(user_id, token_list, data=1.0)

    fileHandle.close()

    return UCM_builder.get_SparseMatrix(), \
           UCM_builder.get_column_token_to_id_mapper(), \
           UCM_builder.get_row_token_to_id_mapper()
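
A hypothetical call on a MovieLens-1M style users.dat (UserID::Gender::Age::Occupation::Zip-code); note the separator must be overridden since the default is ",":

UCM_all, feature_to_index, user_to_index = _loadUCM(
    "ml-1m/users.dat", header=False, separator="::")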
Example No. 7
    def _loadReviews(self, file_path, if_new_item="add"):

        from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

        ICM_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=None,
            on_new_col="add",
            preinitialized_row_mapper=self.item_original_ID_to_index,
            on_new_row=if_new_item)

        from Data_manager.TagPreprocessing import tagFilterAndStemming

        # parse_json is assumed to be available in the enclosing module
        parser_reviews = parse_json(file_path)

        numReviewParsed = 0

        for newReview in parser_reviews:

            numReviewParsed += 1
            if numReviewParsed % 20000 == 0:
                print("Processed {} reviews".format(numReviewParsed))

            user_ID = newReview["reviewerID"]
            item_ID = newReview["asin"]

            reviewText = newReview["reviewText"]
            reviewSummary = newReview["summary"]

            tagList = ' '.join([reviewText, reviewSummary])

            # Remove non-alphabetical characters and split on spaces
            tagList = tagFilterAndStemming(tagList)

            ICM_builder.add_single_row(item_ID, tagList, data=1.0)

        return ICM_builder.get_SparseMatrix(), \
               ICM_builder.get_column_token_to_id_mapper(), \
               ICM_builder.get_row_token_to_id_mapper()
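
The parser assumes Amazon review records with at least the following fields; this representative record follows the layout of the public Amazon review dumps:

example_review = {
    "reviewerID": "A2SUAM1J3GNN3B",
    "asin": "0000013714",
    "reviewText": "Great product, works as described.",
    "summary": "Great product",
}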
Example No. 8
def load_CSV_into_SparseBuilder(
    filePath,
    header=False,
    separator="::",
    timestamp=False,
    remove_duplicates=False,
    custom_user_item_rating_columns=None,
    create_mapper=True,
    preinitialized_row_mapper=None,
    preinitialized_col_mapper=None,
    on_new_col="add",
    on_new_row="add",
):
    """
    The function loads a CSV file into a URM
    :param filePath:
    :param header:      True/False the file does have a header
    :param separator:
    :param timestamp:   True/False load the timestamp as well
    :param remove_duplicates:   Remove row/column duplicates, if the timestamp is provided it kees the most recent one,
                                otherwise the highest rating or interaction value.
    :param custom_user_item_rating_columns:     Column names for the user_id, item_id and rating value as in the file header
    :param create_mapper:       True map the IDs into a new interger value, False use the original value
    :param preinitialized_row_mapper:      Dictionary {originalID: matrix index}  to translate rowIDs into row indices (e.g., userID into user index)
    :param preinitialized_col_mapper:      Dictionary {originalID: matrix index} to translate rowIDs into row indices (e.g., ItemID into item index)
    :return:
    """

    if preinitialized_row_mapper is not None or preinitialized_col_mapper is not None:
        URM_all_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=preinitialized_col_mapper,
            preinitialized_row_mapper=preinitialized_row_mapper,
            on_new_col=on_new_col,
            on_new_row=on_new_row,
        )
        URM_timestamp_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=preinitialized_col_mapper,
            preinitialized_row_mapper=preinitialized_row_mapper,
            on_new_col=on_new_col,
            on_new_row=on_new_row,
        )

    else:
        URM_all_builder = IncrementalSparseMatrix(
            auto_create_col_mapper=create_mapper,
            auto_create_row_mapper=create_mapper)
        URM_timestamp_builder = IncrementalSparseMatrix(
            auto_create_col_mapper=create_mapper,
            auto_create_row_mapper=create_mapper)

    if timestamp:
        dtype = {0: str, 1: str, 2: float, 3: float}
        columns = ["userId", "itemId", "interaction", "timestamp"]

    else:
        dtype = {0: str, 1: str, 2: float}
        columns = ["userId", "itemId", "interaction"]

    df_original = pd.read_csv(
        filepath_or_buffer=filePath,
        sep=separator,
        header=0 if header else None,
        dtype=dtype,
        usecols=custom_user_item_rating_columns,
    )

    # Standardize the column names of the loaded data
    df_original.columns = columns

    user_id_list = df_original["userId"].values
    item_id_list = df_original["itemId"].values
    interaction_list = df_original["interaction"].values

    # Check if duplicates exist
    num_unique_user_item_ids = df_original.drop_duplicates(
        ["userId", "itemId"], keep="first", inplace=False).shape[0]
    contains_duplicates_flag = num_unique_user_item_ids != len(user_id_list)

    if contains_duplicates_flag:
        if remove_duplicates:
            # Remove duplicates.

            # This way of removing the duplicates, keeping the last timestamp without removing other columns,
            # would be the simplest, but it is so slow as to be unusable on any dataset but ML100k
            # idxs = df_original.groupby(by=['userId', 'itemId'], as_index=False)["timestamp"].idxmax()
            # df_original = df_original.loc[idxs]

            # Alternative, faster way:
            # 1 - Sort in ascending order so that the last (biggest) timestamp is in the last position. Set NaN to be in the first position, to remove them if possible
            # 2 - Then remove duplicates for user-item keeping the last row, which will be the last timestamp.

            if timestamp:
                sort_by = ["userId", "itemId", "timestamp"]
            else:
                sort_by = ["userId", "itemId", "interaction"]

            df_original.sort_values(
                by=sort_by,
                ascending=True,
                inplace=True,
                kind="quicksort",
                na_position="first",
            )
            df_original.drop_duplicates(["userId", "itemId"],
                                        keep="last",
                                        inplace=True)

            user_id_list = df_original["userId"].values
            item_id_list = df_original["itemId"].values
            interaction_list = df_original["interaction"].values

            assert num_unique_user_item_ids == len(
                user_id_list
            ), "load_CSV_into_SparseBuilder: duplicate (user, item) values found"

        else:
            # Duplicates are present but removal was not requested: fail loudly
            assert num_unique_user_item_ids == len(
                user_id_list
            ), "load_CSV_into_SparseBuilder: duplicate (user, item) values found"

    URM_all_builder.add_data_lists(user_id_list, item_id_list,
                                   interaction_list)

    if timestamp:
        timestamp_list = df_original["timestamp"].values
        URM_timestamp_builder.add_data_lists(user_id_list, item_id_list,
                                             timestamp_list)

        return (
            URM_all_builder.get_SparseMatrix(),
            URM_timestamp_builder.get_SparseMatrix(),
            URM_all_builder.get_column_token_to_id_mapper(),
            URM_all_builder.get_row_token_to_id_mapper(),
        )

    return (
        URM_all_builder.get_SparseMatrix(),
        URM_all_builder.get_column_token_to_id_mapper(),
        URM_all_builder.get_row_token_to_id_mapper(),
    )
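
A hypothetical invocation on a MovieLens-style ratings file, returning both the interaction URM and a parallel timestamp URM (file path assumed):

URM_all, URM_timestamp, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
    "ml-1m/ratings.dat",
    header=False,
    separator="::",
    timestamp=True,
    remove_duplicates=True)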
Example No. 9
    def _loadMetadata(self, file_path, if_new_item="ignore"):

        from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix_FilterIDs

        ICM_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=None,
            on_new_col="add",
            preinitialized_row_mapper=self.item_original_ID_to_index,
            on_new_row=if_new_item)

        from Data_manager.TagPreprocessing import tagFilterAndStemming
        import itertools

        # parse_json is assumed to be available in the enclosing module
        parser_metadata = parse_json(file_path)

        numMetadataParsed = 0

        for newMetadata in parser_metadata:

            numMetadataParsed += 1
            if numMetadataParsed % 20000 == 0:
                print("Processed {}".format(numMetadataParsed))

            item_ID = newMetadata["asin"]

            # The file might contain other items; restrict to
            # those present in the URM

            tokenList = []

            #item_price = newMetadata["price"]

            if "title" in newMetadata:
                item_name = newMetadata["title"]
                tokenList.append(item_name)

            # Sometimes brand is not present
            if "brand" in newMetadata:
                item_brand = newMetadata["brand"]
                tokenList.append(item_brand)

            # Categories are a list of lists. Unclear whether only the first element contains data or not
            if "categories" in newMetadata:
                item_categories = newMetadata["categories"]
                item_categories = list(
                    itertools.chain.from_iterable(item_categories))
                tokenList.extend(item_categories)

            if "description" in newMetadata:
                item_description = newMetadata["description"]
                tokenList.append(item_description)

            tokenList = ' '.join(tokenList)

            # Remove non-alphabetical characters and split on spaces
            tokenList = tagFilterAndStemming(tokenList)

            # Remove duplicates
            tokenList = list(set(tokenList))

            ICM_builder.add_single_row(item_ID, tokenList, data=1.0)

        return ICM_builder.get_SparseMatrix(), \
               ICM_builder.get_column_token_to_id_mapper(), \
               ICM_builder.get_row_token_to_id_mapper()
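
The metadata parser assumes Amazon product records like the following (a representative record following the layout of the public Amazon metadata dumps; only "asin" is strictly required, the other fields are optional):

example_metadata = {
    "asin": "0000031852",
    "title": "Girls Ballet Tutu Zebra Hot Pink",
    "brand": "Coxlures",
    "categories": [["Sports & Outdoors", "Other Sports", "Dance"]],
    "description": "A pink tutu for dance classes.",
}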