def get_urm_warm_users(self, threshold=1):
    """
    Build a copy of the URM that keeps only the rows of users whose profile
    length is strictly greater than ``threshold``; every other row is left empty.

    :param threshold: minimum (exclusive) number of interactions a user must have
    :return: csr matrix with the same shape as ``self.urm``
    """
    n_rows, n_cols = self.urm.shape

    builder = IncrementalSparseMatrix(
        auto_create_row_mapper=False, n_rows=n_rows,
        auto_create_col_mapper=False, n_cols=n_cols)

    indptr = self.urm.indptr
    for user in range(n_rows):
        lo, hi = indptr[user], indptr[user + 1]
        profile = self.urm.indices[lo:hi]

        # only warm users' interactions are copied over
        if len(profile) > threshold:
            builder.add_data_lists([user] * len(profile),
                                   profile,
                                   self.urm.data[lo:hi])

    warm_urm = sps.csr_matrix(builder.get_SparseMatrix())

    # report how many users ended up with an empty row
    empty_rows = np.sum(np.ediff1d(warm_urm.indptr) == 0)
    if empty_rows != 0:
        print("Warning: {} ({:.2f} %) of {} users have no Train items".
              format(empty_rows, empty_rows / n_rows * 100, n_rows))

    return warm_urm
def get_urm_warm_users_items(self, threshold_user=10, threshold_item=10):
    """
    Return a copy of the URM where interactions on cold items (item profile
    length <= threshold_item) are removed first, and then the rows of users
    whose remaining profile length is <= threshold_user are emptied as well.

    :param threshold_user: minimum (exclusive) interactions a user must keep
    :param threshold_item: minimum (exclusive) interactions an item must have
    :return: csr matrix with the same shape as ``self.urm``
    """
    # ---- remove interactions on cold items ----
    users = np.asarray(self.get_raw_users())
    items = np.asarray(self.get_raw_items())

    urm_csc = self.urm.tocsc()
    item_interactions = np.ediff1d(urm_csc.indptr)
    warm_items = item_interactions > threshold_item

    # Vectorized filtering (replaces the original per-interaction Python loop)
    keep_mask = warm_items[items]
    new_users = users[keep_mask]
    new_items = items[keep_mask]

    # BUG FIX: pass the shape explicitly — without it coo_matrix infers the
    # shape from the largest surviving index, silently dropping trailing
    # rows/columns when the last users or items are cold.
    urm = sps.coo_matrix((np.ones(len(new_items)), (new_users, new_items)),
                         shape=self.urm.shape)
    urm = urm.tocsr()

    # ---- remove cold users ----
    n_users, n_items = urm.shape
    urm_train_builder = IncrementalSparseMatrix(
        auto_create_row_mapper=False, n_rows=n_users,
        auto_create_col_mapper=False, n_cols=n_items)

    for user_id in range(n_users):
        start_user_position = urm.indptr[user_id]
        end_user_position = urm.indptr[user_id + 1]
        user_profile = urm.indices[start_user_position:end_user_position]

        if len(user_profile) > threshold_user:
            user_interaction_data = urm.data[start_user_position:end_user_position]
            urm_train_builder.add_data_lists([user_id] * len(user_profile),
                                             user_profile,
                                             user_interaction_data)

    warm_urm = sps.csr_matrix(urm_train_builder.get_SparseMatrix())

    # report how many users ended up with an empty row
    user_no_item_train = np.sum(np.ediff1d(warm_urm.indptr) == 0)
    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no Train items".
              format(user_no_item_train,
                     user_no_item_train / n_users * 100, n_users))

    return warm_urm
def split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.1):
    """
    The function splits an URM in two matrices selecting the number of interactions globally

    :param URM_all: interaction matrix to split
    :param train_percentage: fraction of all interactions assigned to the train split
    :return: (URM_train, URM_validation), both csr matrices
    """

    assert train_percentage >= 0.0 and train_percentage <= 1.0, \
        "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)

    from DataManager.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_all.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items,
                                                auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items,
                                                     auto_create_col_mapper=False, auto_create_row_mapper=False)

    # COO exposes parallel row/col/data arrays, one entry per interaction
    URM_train = sps.coo_matrix(URM_all)

    # np.int was removed in modern numpy; the builtin int is equivalent here
    indices_for_sampling = np.arange(0, URM_all.nnz, dtype=int)
    np.random.shuffle(indices_for_sampling)

    n_train_interactions = round(URM_all.nnz * train_percentage)

    # BUG FIX: the original double-indexed the shuffled array
    # (indices_for_sampling[indices_for_sampling[0:n]]), which neither
    # partitions the interactions nor respects train_percentage.
    indices_for_train = indices_for_sampling[0:n_train_interactions]
    indices_for_validation = indices_for_sampling[n_train_interactions:]

    URM_train_builder.add_data_lists(URM_train.row[indices_for_train],
                                     URM_train.col[indices_for_train],
                                     URM_train.data[indices_for_train])

    URM_validation_builder.add_data_lists(URM_train.row[indices_for_validation],
                                          URM_train.col[indices_for_validation],
                                          URM_train.data[indices_for_validation])

    URM_train = sps.csr_matrix(URM_train_builder.get_SparseMatrix())
    URM_validation = sps.csr_matrix(URM_validation_builder.get_SparseMatrix())

    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)
    user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(
            user_no_item_train, user_no_item_train/num_users*100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(
            user_no_item_validation, user_no_item_validation/num_users*100, num_users))

    return URM_train, URM_validation
def create_weight_age_matrix(self):
    """
    Build a user-user weight matrix from age data: the weight between two
    users is exp(-|age_1 - age_2| / 10), so identical ages give weight 1 and
    the weight decays exponentially with the age gap.

    :return: csr matrix of shape (n_urm_users, n_urm_users)
    """
    age_df = pd.read_csv('Data/data_UCM_age.csv')
    list_user = np.array(age_df['row'])
    list_age = np.array(age_df['col'])
    n_user = len(list_user)
    shape = self.urm.shape[0]

    weight_matrix_builder = IncrementalSparseMatrix(
        auto_create_row_mapper=False, n_rows=shape,
        auto_create_col_mapper=False, n_cols=shape)

    for index_1 in tqdm(np.arange(n_user)):
        user_1 = list_user[index_1]

        # Vectorized over all users at once — replaces the original
        # O(n_user) Python inner loop while producing identical values.
        list_weight = np.exp(-np.abs(list_age[index_1] - list_age) / 10.0)

        weight_matrix_builder.add_data_lists([user_1] * n_user,
                                             list_user, list_weight)

    weight_matrix = weight_matrix_builder.get_SparseMatrix()
    return sps.csr_matrix(weight_matrix)
def split_train_in_two_percentage_user_wise(URM_train, train_percentage = 0.1, verbose = False):
    """
    The function splits an URM in two matrices selecting the number of interactions one user at a time

    :param URM_train: interaction matrix to split
    :param train_percentage: fraction of each user's interactions assigned to train
    :param verbose: if True, print a line for every user left with no train/validation items
    :return: (URM_train, URM_validation)
    """

    assert train_percentage >= 0.0 and train_percentage <= 1.0, \
        "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)

    from DataManager.IncrementalSparseMatrix import IncrementalSparseMatrix

    # ensure csr format so the indptr/indices slicing below is valid
    URM_train = URM_train.tocsr()
    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items,
                                                auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items,
                                                     auto_create_col_mapper=False, auto_create_row_mapper=False)

    user_no_item_train = 0
    user_no_item_validation = 0

    for user_id in range(URM_train.shape[0]):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = round(user_profile_length * train_percentage)

        # keep at least one validation interaction when the user has more than one item
        if n_train_items == len(user_profile_items) and n_train_items > 1:
            n_train_items -= 1

        # np.int was removed in modern numpy; the builtin int is equivalent here
        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose:
                print("User {} has 0 train items".format(user_id))
            user_no_item_train += 1

        if len(validation_items) == 0:
            if verbose:
                print("User {} has 0 validation items".format(user_id))
            user_no_item_validation += 1

        URM_train_builder.add_data_lists([user_id]*len(train_items), train_items, train_ratings)
        URM_validation_builder.add_data_lists([user_id]*len(validation_items), validation_items, validation_ratings)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(
            user_no_item_train, user_no_item_train/num_users*100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(
            user_no_item_validation, user_no_item_validation/num_users*100, num_users))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
def _order_user_interactions(user_profile, user_data, leave_random_out):
    """Return (items, data) reordered so the interactions to hold out come first.

    With leave_random_out the order is a uniform random permutation; otherwise
    interactions are sorted by decreasing data value (highest values held out).
    """
    if leave_random_out:
        # np.int was removed in modern numpy; the builtin int is equivalent
        indices_to_shuffle = np.arange(len(user_profile), dtype=int)
        np.random.shuffle(indices_to_shuffle)
        return user_profile[indices_to_shuffle], user_data[indices_to_shuffle]

    sort_interaction_index = np.argsort(-user_data)
    return user_profile[sort_interaction_index], user_data[sort_interaction_index]


def split_train_leave_k_out_user_wise(URM, k_out = 1, use_validation_set = False, leave_random_out = True, threshold=10, temperature="normal"):
    """
    The function splits an URM in two matrices selecting the k_out interactions one user at a time.

    :param URM: interaction matrix to split
    :param k_out: number of interactions held out per sampled user
    :param use_validation_set: if True, also return a validation matrix
    :param leave_random_out: if True sample held-out interactions at random,
        otherwise hold out the ones with the highest data value
    :param threshold: profile-length threshold used by "cold"/"warm"
    :param temperature: selects which users are split:
        - "cold":  only users with profile length <= threshold
        - "warm":  only users with profile length > threshold
        - "zero":  users with 1-2 interactions, each with probability 1/2;
                   their remaining (train) interactions are dropped
        - "zero2": like "zero" but deterministic (every 1-2 interaction user)
        - "normal"/"valid2": every user, holding out k_out interactions
        - "valid": every user, with a per-user hold-out of 20% (min 1)
    :return: (URM_train, URM_test) or (URM_train, URM_validation, URM_test)
    """

    temperature_values = ["cold", "warm", "zero", "normal", 'valid', 'zero2', 'valid2']
    assert temperature in temperature_values, \
        'temperature must be "cold", "warm", "zero2", "valid", "valid2", "zero" or "normal"'
    assert k_out > 0, "k_out must be a value greater than 0, provided was '{}'".format(k_out)

    URM = sps.csr_matrix(URM)
    n_users, n_items = URM.shape

    URM_train_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                auto_create_col_mapper=False, n_cols=n_items)
    URM_test_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                               auto_create_col_mapper=False, n_cols=n_items)

    if use_validation_set:
        # NOTE(review): this builder is never populated anywhere in this
        # function, so the returned validation matrix is always empty —
        # confirm the intended behavior against callers.
        URM_validation_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                         auto_create_col_mapper=False, n_cols=n_items)

    for user_id in range(n_users):

        start_user_position = URM.indptr[user_id]
        end_user_position = URM.indptr[user_id + 1]

        user_profile = URM.indices[start_user_position:end_user_position]
        user_data = URM.data[start_user_position:end_user_position]

        # Decide whether this user's profile is split into test/train
        if temperature == "cold":
            split_user = len(user_profile) <= threshold
        elif temperature == "warm":
            split_user = len(user_profile) > threshold
        elif temperature == "zero":
            # the coin flip is drawn for EVERY user so the random stream
            # matches the original implementation exactly
            choice = random.choice([True, False])
            split_user = 0 < len(user_profile) < 3 and choice
        elif temperature == "zero2":
            split_user = 0 < len(user_profile) < 3
        else:  # "normal", "valid", "valid2": every user is split
            split_user = True

        if not split_user:
            # user not sampled: the whole profile goes to train
            URM_train_builder.add_data_lists([user_id] * len(user_profile), user_profile, user_data)
            continue

        items, data = _order_user_interactions(user_profile, user_data, leave_random_out)

        if temperature == 'valid':
            # per-user hold-out size: 1 for tiny profiles, otherwise 20%
            user_k_out = 1 if len(user_profile) < 3 else int(0.2 * len(user_profile))
        else:
            user_k_out = k_out

        # Test interactions
        user_interaction_items_test = items[0:user_k_out]
        user_interaction_data_test = data[0:user_k_out]
        URM_test_builder.add_data_lists([user_id] * len(user_interaction_items_test),
                                        user_interaction_items_test, user_interaction_data_test)

        # Train interactions. "zero"/"zero2" deliberately drop the remainder
        # (their train part was commented out in the original).
        # BUG FIX: the "cold" branch previously called add_data_lists TWICE
        # for the train interactions, doubling those users' values once the
        # duplicate entries were summed during CSR conversion.
        if temperature not in ('zero', 'zero2'):
            user_interaction_items_train = items[user_k_out:]
            user_interaction_data_train = data[user_k_out:]
            URM_train_builder.add_data_lists([user_id] * len(user_interaction_items_train),
                                             user_interaction_items_train, user_interaction_data_train)

    URM_train = sps.csr_matrix(URM_train_builder.get_SparseMatrix())
    URM_test = URM_test_builder.get_SparseMatrix()

    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)
    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no Train items".format(
            user_no_item_train, user_no_item_train/n_users*100, n_users))

    if use_validation_set:
        URM_validation = sps.csr_matrix(URM_validation_builder.get_SparseMatrix())

        user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)
        if user_no_item_validation != 0:
            print("Warning: {} ({:.2f} %) of {} users have no Validation items".format(
                user_no_item_validation, user_no_item_validation/n_users*100, n_users))

        return URM_train, URM_validation, URM_test

    return URM_train, URM_test