예제 #1
0
    def get_urm_warm_users(self, threshold=1):
        """Return a copy of ``self.urm`` that keeps only "warm" user rows.

        A user row is kept when it holds strictly more than ``threshold``
        stored interactions; every other row is left empty. The shape of the
        result equals the shape of ``self.urm``.

        :param threshold: minimum profile length (exclusive) for a user to
            be kept.
        :return: CSR matrix with the interactions of the warm users only.
        """
        source = self.urm  # row slicing via indptr below assumes CSR layout
        total_users, total_items = source.shape

        builder = IncrementalSparseMatrix(
            n_rows=total_users,
            n_cols=total_items,
            auto_create_row_mapper=False,
            auto_create_col_mapper=False)

        for row in range(total_users):
            lo = source.indptr[row]
            hi = source.indptr[row + 1]
            profile_items = source.indices[lo:hi]

            # Skip users whose profile is not longer than the threshold.
            if not len(profile_items) > threshold:
                continue

            profile_values = source.data[lo:hi]
            builder.add_data_lists(
                [row] * len(profile_items), profile_items, profile_values)

        warm_urm = sps.csr_matrix(builder.get_SparseMatrix())

        # Rows whose indptr difference is zero hold no interaction at all.
        empty_rows = np.sum(np.ediff1d(warm_urm.indptr) == 0)
        if empty_rows != 0:
            message = "Warning: {} ({:.2f} %) of {} users have no Train items"
            print(message.format(
                empty_rows, empty_rows / total_users * 100, total_users))

        return warm_urm
예제 #2
0
    def get_urm_warm_users_items(self, threshold_user=10, threshold_item=10):
        """Build a URM restricted to warm items and then to warm users.

        Step 1 keeps only the raw interactions whose item has strictly more
        than ``threshold_item`` stored interactions in ``self.urm``. Each kept
        (user, item) pair contributes a value of 1.0 to the intermediate
        matrix.
        Step 2 keeps only the rows of that matrix holding strictly more than
        ``threshold_user`` interactions.

        The result always has the shape of ``self.urm``. Without an explicit
        shape, scipy would infer the dimensions from the largest surviving
        index (silently shrinking the matrix) and would raise when no
        interaction survives the item filter.

        :param threshold_user: minimum profile length (exclusive) for a user.
        :param threshold_item: minimum popularity (exclusive) for an item.
        :return: CSR matrix with the interactions of warm users on warm items.
        """
        # Presumably parallel arrays holding the row / column index of every
        # interaction of self.urm -- TODO confirm against get_raw_users/items.
        users = np.asarray(self.get_raw_users())
        items = np.asarray(self.get_raw_items())

        # ---- Remove cold items
        item_interactions = np.ediff1d(self.urm.tocsc().indptr)
        warm_items = item_interactions > threshold_item

        # Boolean mask over the interactions: True when the item is warm.
        keep = warm_items[items]
        new_users = users[keep]
        new_items = items[keep]

        urm = sps.coo_matrix(
            (np.ones(len(new_items)), (new_users, new_items)),
            shape=self.urm.shape)
        urm = urm.tocsr()

        # ---- Remove cold users
        n_users, n_items = urm.shape

        urm_train_builder = IncrementalSparseMatrix(
            auto_create_row_mapper=False,
            n_rows=n_users,
            auto_create_col_mapper=False,
            n_cols=n_items)

        for user_id in range(n_users):
            start_user_position = urm.indptr[user_id]
            end_user_position = urm.indptr[user_id + 1]

            user_profile = urm.indices[start_user_position:end_user_position]

            if len(user_profile) > threshold_user:
                user_data = urm.data[start_user_position:end_user_position]
                urm_train_builder.add_data_lists(
                    [user_id] * len(user_profile), user_profile, user_data)

        warm_urm = urm_train_builder.get_SparseMatrix()
        warm_urm = sps.csr_matrix(warm_urm)

        # Rows whose indptr difference is zero hold no interaction at all.
        user_no_item_train = np.sum(np.ediff1d(warm_urm.indptr) == 0)

        if user_no_item_train != 0:
            print("Warning: {} ({:.2f} %) of {} users have no Train items".
                  format(user_no_item_train,
                         user_no_item_train / n_users * 100, n_users))

        return warm_urm
예제 #3
0
def split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.1):
    """
    Split a URM into two matrices by sampling interactions globally.

    The stored interactions of ``URM_all`` are shuffled; the first
    ``round(nnz * train_percentage)`` of them go to the train matrix and the
    remaining ones to the validation matrix. Both outputs have the shape of
    ``URM_all``. A warning is printed for every split in which some users end
    up with no interaction.

    :param URM_all: sparse user-rating matrix to split (must expose ``shape``
        and ``nnz``)
    :param train_percentage: fraction of the interactions assigned to the
        train matrix, in [0.0, 1.0]
    :return: tuple ``(URM_train, URM_validation)`` of CSR matrices
    """

    assert train_percentage >= 0.0 and train_percentage<=1.0, "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)


    from DataManager.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_all.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)


    # COO layout exposes parallel row / col / data arrays, one entry per interaction.
    URM_coo = sps.coo_matrix(URM_all)

    # The builtin int replaces the np.int alias, which was removed in NumPy 1.24.
    indices_for_sampling = np.arange(0, URM_all.nnz, dtype=int)
    np.random.shuffle(indices_for_sampling)

    n_train_interactions = round(URM_all.nnz * train_percentage)

    # Slice the shuffled permutation directly. Indexing the permutation with
    # itself (perm[perm[:n]]) is not a uniform sample: with 2 interactions the
    # train split would always be interaction 0.
    indices_for_train = indices_for_sampling[:n_train_interactions]
    indices_for_validation = indices_for_sampling[n_train_interactions:]


    URM_train_builder.add_data_lists(URM_coo.row[indices_for_train],
                                     URM_coo.col[indices_for_train],
                                     URM_coo.data[indices_for_train])

    URM_validation_builder.add_data_lists(URM_coo.row[indices_for_validation],
                                          URM_coo.col[indices_for_validation],
                                          URM_coo.data[indices_for_validation])


    URM_train = sps.csr_matrix(URM_train_builder.get_SparseMatrix())
    URM_validation = sps.csr_matrix(URM_validation_builder.get_SparseMatrix())

    # Rows whose indptr difference is zero hold no interaction at all.
    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)
    user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(user_no_item_train, user_no_item_train/num_users*100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(user_no_item_validation, user_no_item_validation/num_users*100, num_users))


    return URM_train, URM_validation
예제 #4
0
    def create_weight_age_matrix(self):
        """Build a user-user weight matrix from the age feature file.

        Reads 'Data/data_UCM_age.csv' (columns ``row`` = user id, ``col`` =
        age value) and, for every pair of users listed in the file, stores
        the weight ``exp(-|age_a - age_b| / 10)``. The matrix is square with
        side ``self.urm.shape[0]``.

        The row of every listed user is added inside the loop. Adding after
        the loop would store only the last user's row and would fail on an
        empty file.

        :return: CSR matrix of pairwise age weights.
        """
        age_df = pd.read_csv('Data/data_UCM_age.csv')
        list_user = np.array(age_df['row'])
        list_age = np.array(age_df['col'])

        n_user = len(list_user)

        shape = self.urm.shape[0]

        weight_matrix_builder = IncrementalSparseMatrix(
            auto_create_row_mapper=False,
            n_rows=shape,
            auto_create_col_mapper=False,
            n_cols=shape)

        for index_1 in tqdm(np.arange(n_user)):

            user_1 = list_user[index_1]

            # Vectorised over all users: absolute age gap to user_1, then
            # an exponential decay with a scale of 10.
            age_gap = np.abs(list_age[index_1] - list_age)
            list_weight = np.exp(-age_gap / 10)

            weight_matrix_builder.add_data_lists([user_1] * n_user,
                                                 list_user, list_weight)

        weight_matrix = weight_matrix_builder.get_SparseMatrix()

        weight_matrix = sps.csr_matrix(weight_matrix)

        return weight_matrix
예제 #5
0
def split_train_in_two_percentage_user_wise(URM_train, train_percentage = 0.1, verbose = False):
    """
    Split a URM into two matrices by sampling interactions one user at a time.

    For every user, ``round(profile_length * train_percentage)`` randomly
    chosen interactions go to the train matrix and the rest to the validation
    matrix. When that count would cover a whole profile of more than one
    interaction, one interaction is moved to validation so it is not empty.

    :param URM_train: sparse user-rating matrix to split (converted to CSR)
    :param train_percentage: fraction of each user profile assigned to the
        train matrix, in [0.0, 1.0]
    :param verbose: when True, print one line per user left without train or
        validation items
    :return: tuple ``(URM_train, URM_validation)`` as returned by the
        builders' ``get_SparseMatrix``
    """

    assert train_percentage >= 0.0 and train_percentage<=1.0, "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)

    from DataManager.IncrementalSparseMatrix import IncrementalSparseMatrix

    # The indptr-based row slicing below is only valid on a CSR matrix.
    URM_train = URM_train.tocsr()


    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)

    user_no_item_train = 0
    user_no_item_validation = 0

    for user_id in range(URM_train.shape[0]):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id+1]


        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = round(user_profile_length*train_percentage)

        # Never put a whole multi-item profile in train: keep one for validation.
        if n_train_items == len(user_profile_items) and n_train_items > 1:
            n_train_items -= 1

        # The builtin int replaces the np.int alias, which was removed in NumPy 1.24.
        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_positions = indices_for_sampling[0:n_train_items]
        validation_positions = indices_for_sampling[n_train_items:]

        train_items = user_profile_items[train_positions]
        train_ratings = user_profile_ratings[train_positions]

        validation_items = user_profile_items[validation_positions]
        validation_ratings = user_profile_ratings[validation_positions]

        if len(train_items) == 0:
            if verbose: print("User {} has 0 train items".format(user_id))
            user_no_item_train += 1

        if len(validation_items) == 0:
            if verbose: print("User {} has 0 validation items".format(user_id))
            user_no_item_validation += 1


        URM_train_builder.add_data_lists([user_id]*len(train_items), train_items, train_ratings)
        URM_validation_builder.add_data_lists([user_id]*len(validation_items), validation_items, validation_ratings)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(user_no_item_train, user_no_item_train/num_users*100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(user_no_item_validation, user_no_item_validation/num_users*100, num_users))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()


    return URM_train, URM_validation
def split_train_leave_k_out_user_wise(URM, k_out = 1, use_validation_set = False, leave_random_out = True, threshold=10, temperature="normal"):
    """
    Split a URM into train and test matrices, holding out interactions one user at a time.

    ``temperature`` selects which users have interactions held out; every
    other user keeps the whole profile in train:

    * "cold":   users with at most ``threshold`` interactions
    * "warm":   users with more than ``threshold`` interactions
    * "zero":   users with 1 or 2 interactions, each with probability 1/2;
                the interactions that are not held out are dropped, so these
                users have no train items
    * "zero2":  as "zero" but for every user with 1 or 2 interactions
    * "normal" and "valid2": every user
    * "valid":  every user, with ``k_out`` replaced per user by 1 when the
                profile has fewer than 3 interactions and by
                ``int(0.2 * profile_length)`` otherwise
                (NOTE: this is 0 for profiles of length 3 or 4)

    :param URM: user-rating matrix, converted to CSR
    :param k_out: number of interactions held out per selected user, > 0
    :param use_validation_set: when True a validation matrix is also
        returned. NOTE(review): no interaction is ever added to it, so it is
        always empty.
    :param leave_random_out: when True the held-out interactions are chosen
        at random, otherwise the ones with the largest data values are chosen
    :param threshold: profile length separating "cold" from "warm" users
    :param temperature: one of the modes listed above
    :return: ``(URM_train, URM_test)``, or
        ``(URM_train, URM_validation, URM_test)`` when ``use_validation_set``
    """

    temperature_values = ["cold", "warm", "zero", "normal", 'valid', 'zero2', 'valid2']

    assert temperature in temperature_values, 'temperature must be "cold", "warm", "zero2", "valid", "valid2, "zero" or "normal"'
    assert k_out > 0, "k_out must be a value greater than 0, provided was '{}'".format(k_out)

    URM = sps.csr_matrix(URM)
    n_users, n_items = URM.shape


    URM_train_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows = n_users,
                                        auto_create_col_mapper=False, n_cols = n_items)

    URM_test_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows = n_users,
                                        auto_create_col_mapper=False, n_cols = n_items)

    if use_validation_set:
         URM_validation_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows = n_users,
                                                          auto_create_col_mapper=False, n_cols = n_items)

    # In the "zero" modes the interactions that are not held out are discarded.
    keep_train_remainder = temperature not in ("zero", "zero2")

    for user_id in range(n_users):

        start_user_position = URM.indptr[user_id]
        end_user_position = URM.indptr[user_id+1]

        user_profile = URM.indices[start_user_position:end_user_position]
        user_data = URM.data[start_user_position:end_user_position]
        profile_length = len(user_profile)

        # Decide whether this user takes part in the hold-out.
        if temperature == "cold":
            hold_out_user = profile_length <= threshold
        elif temperature == "warm":
            hold_out_user = profile_length > threshold
        elif temperature == "zero":
            # Drawn for every user, even when the profile length rules it out.
            choice = random.choice([True, False])
            hold_out_user = 0 < profile_length < 3 and choice
        elif temperature == "zero2":
            hold_out_user = 0 < profile_length < 3
        else:
            hold_out_user = True

        if not hold_out_user:
            URM_train_builder.add_data_lists([user_id] * profile_length,
                                             user_profile, user_data)
            continue

        user_interaction_items, user_interaction_data = _order_user_interactions(
            user_profile, user_data, leave_random_out)

        if temperature == "valid":
            user_k_out = 1 if profile_length < 3 else int(0.2*profile_length)
        else:
            user_k_out = k_out

        # Test interactions
        user_interaction_items_test = user_interaction_items[0:user_k_out]
        user_interaction_data_test = user_interaction_data[0:user_k_out]

        URM_test_builder.add_data_lists([user_id] * len(user_interaction_items_test),
                                        user_interaction_items_test,
                                        user_interaction_data_test)

        # Train interactions, added exactly once per user.
        if keep_train_remainder:
            user_interaction_items_train = user_interaction_items[user_k_out:]
            user_interaction_data_train = user_interaction_data[user_k_out:]

            URM_train_builder.add_data_lists([user_id] * len(user_interaction_items_train),
                                             user_interaction_items_train, user_interaction_data_train)


    URM_train = URM_train_builder.get_SparseMatrix()
    URM_test = URM_test_builder.get_SparseMatrix()

    URM_train = sps.csr_matrix(URM_train)
    # Rows whose indptr difference is zero hold no interaction at all.
    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no Train items".format(user_no_item_train, user_no_item_train/n_users*100, n_users))



    if use_validation_set:
        URM_validation = URM_validation_builder.get_SparseMatrix()

        URM_validation = sps.csr_matrix(URM_validation)
        user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

        if user_no_item_validation != 0:
            print("Warning: {} ({:.2f} %) of {} users have no Validation items".format(user_no_item_validation, user_no_item_validation/n_users*100, n_users))


        return URM_train, URM_validation, URM_test


    return URM_train, URM_test


def _order_user_interactions(user_profile, user_data, leave_random_out):
    """Order one user's items and data so the held-out entries come first."""
    if leave_random_out:
        # The builtin int replaces the np.int alias, which was removed in NumPy 1.24.
        order = np.arange(len(user_profile), dtype=int)
        np.random.shuffle(order)
    else:
        # Descending data value; presumably the data encodes the interaction
        # position/time so the latest one is sampled first -- TODO confirm.
        order = np.argsort(-user_data)

    return user_profile[order], user_data[order]