def build_similarity_matrix(self, df_responses, agg_strategy, filter_sample_method, mapping_matrix):
    """
    It builds the similarity matrix by using a dataframe with all the samples collected from the solver in the
    fit function. The samples obtained from the solver are post-processed with a filtering operation
    (i.e. filter_sample_method) and an aggregation operation (i.e. agg_strategy). At the end of this pipeline,
    each item yields a single list that becomes a column of the similarity matrix.

    :param df_responses: a dataframe containing the samples collected from the solver
    :param agg_strategy: the post-processing aggregation to be used on the samples
    :param filter_sample_method: the filter technique used before the post-processing aggregation
    :param mapping_matrix: list of np.ndarray containing the mapping of the samples variables into the original
                           variables for each item problem
    :return: the similarity matrix built from the dataframe given
    """
    n_items = self.URM_train.shape[1]
    if mapping_matrix is None:
        mapping_matrix = np.repeat(np.reshape(np.arange(0, n_items), newshape=(1, n_items)), repeats=n_items,
                                   axis=0)

    matrix_builder = IncrementalSparseMatrix(n_rows=n_items, n_cols=n_items)
    for currentItem in range(n_items):
        response_df = df_responses[df_responses.item_id == currentItem].copy()
        self.add_sample_responses_to_matrix_builder(matrix_builder, agg_strategy, filter_sample_method,
                                                    response_df, currentItem, mapping_matrix[currentItem])
    return sps.csr_matrix(matrix_builder.get_SparseMatrix())
def apply_feature_engineering_ICM(ICM_dict: dict, URM, UCM_dict: dict, ICM_names_to_count: list,
                                  UCM_names_to_list: list):
    if ~np.all(np.in1d(list(ICM_names_to_count), list(ICM_dict.keys()))):
        raise KeyError("Mapper contains wrong ICM names")

    if ~np.all(np.in1d(UCM_names_to_list, list(UCM_dict.keys()))):
        raise KeyError("Mapper contains wrong UCM names")

    for ICM_name in ICM_names_to_count:
        ICM_object: sps.csr_matrix = ICM_dict[ICM_name]
        column = ICM_object.tocoo().col

        # For each nonzero entry, use the number of items sharing its feature value as the new feature data
        uniques, inverse, counts = np.unique(column, return_inverse=True, return_counts=True)

        new_ICM_name = "{}_count".format(ICM_name)
        new_row = np.array(ICM_object.tocoo().row, dtype=int)
        new_col = np.array([0] * len(new_row), dtype=int)
        new_data = np.array(counts[inverse], dtype=np.float32)

        ICM_builder = IncrementalSparseMatrix()
        ICM_builder.add_data_lists(new_row, new_col, new_data)

        ICM_dict[new_ICM_name] = ICM_builder.get_SparseMatrix()

    for UCM_name in UCM_names_to_list:
        UCM_object = UCM_dict[UCM_name]
        UCM_suffix_name = UCM_name.replace("UCM", "")

        # Propagate the user feature onto items through the interactions: ICM = URM^T * UCM
        new_ICM = URM.T.dot(UCM_object)
        new_ICM_name = "ICM{}".format(UCM_suffix_name)

        ICM_dict[new_ICM_name] = new_ICM.tocsr()

    return ICM_dict
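# Illustrative usage sketch for apply_feature_engineering_ICM (the toy matrices and the
# "ICM_genre"/"UCM_age" dictionary keys below are assumptions, not taken from the repo):
#
#   URM = sps.csr_matrix(np.random.binomial(1, 0.1, size=(50, 30)), dtype=np.float32)
#   ICM_dict = {"ICM_genre": sps.random(30, 8, density=0.2, format="csr")}
#   UCM_dict = {"UCM_age": sps.random(50, 5, density=0.3, format="csr")}
#   ICM_dict = apply_feature_engineering_ICM(ICM_dict, URM, UCM_dict,
#                                            ICM_names_to_count=["ICM_genre"],
#                                            UCM_names_to_list=["UCM_age"])
#   # ICM_dict now also contains "ICM_genre_count" (per-item feature frequency) and
#   # "ICM_age" (user features projected onto items through URM.T.dot(UCM)).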
def test_IncrementalSparseMatrix_add_rows(self):
    import numpy as np

    n_rows = 100
    n_cols = 200

    randomMatrix = sps.random(n_rows, n_cols, density=0.01, format='csr')

    incrementalMatrix = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)

    for row in range(n_rows):
        row_data = randomMatrix.indices[randomMatrix.indptr[row]:randomMatrix.indptr[row + 1]]
        incrementalMatrix.add_single_row(row, row_data, 5.0)

    randomMatrix.data = np.ones_like(randomMatrix.data) * 5.0

    randomMatrix_incremental = incrementalMatrix.get_SparseMatrix()

    assert sparse_are_equals(randomMatrix, randomMatrix_incremental)
def apply_feature_entropy_UCM(UCM_dict: dict, UCM_names_to_entropy: list):
    if ~np.all(np.in1d(UCM_names_to_entropy, list(UCM_dict.keys()))):
        raise KeyError("Mapper contains wrong UCM names")

    for UCM_name in UCM_names_to_entropy:
        UCM_object: sps.csr_matrix = UCM_dict[UCM_name]

        # Normalize each row so that its values form a probability distribution
        total_interactions_each_row = np.array(UCM_object.sum(axis=1)).squeeze()
        interactions_each_row = UCM_object.indptr[1:] - UCM_object.indptr[:-1]
        total_interactions = np.repeat(total_interactions_each_row, interactions_each_row)
        UCM_object.data = UCM_object.data / total_interactions

        # Shannon entropy of each row: -sum(p * log2(p))
        log_UCM_object = UCM_object.copy()
        log_UCM_object.data = np.log2(log_UCM_object.data)

        entropy_matrix = UCM_object.multiply(log_UCM_object)
        entropy = -np.array(entropy_matrix.sum(axis=1)).squeeze()

        new_UCM_name = "{}_entropy".format(UCM_name)
        new_row = np.arange(UCM_object.shape[0])
        new_col = np.array([0] * len(new_row), dtype=int)
        new_data = entropy

        UCM_builder = IncrementalSparseMatrix()
        UCM_builder.add_data_lists(new_row, new_col, new_data)
        UCM_dict[new_UCM_name] = UCM_builder.get_SparseMatrix()
    return UCM_dict
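# Illustrative usage sketch for apply_feature_entropy_UCM (the "UCM_region" key and matrix
# shape are assumptions): each row is normalized into a distribution and its Shannon
# entropy becomes a single-column "*_entropy" UCM.
#
#   UCM_dict = {"UCM_region": sps.random(100, 20, density=0.1, format="csr")}
#   UCM_dict = apply_feature_entropy_UCM(UCM_dict, UCM_names_to_entropy=["UCM_region"])
#   UCM_dict["UCM_region_entropy"]  # single-column matrix, one entropy value per user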
def transform_numerical_to_discretized_sparse_matrix(row: np.ndarray, data: np.ndarray, bins=20):
    """
    Transform a numerical data array into a discretized sparse matrix with a certain amount of bins

    :param row: array of row indices, one for each value in data
    :param data: array of numerical values
    :param bins: number of labels in the output
    :return: sparse matrix containing the discretized values
    """
    if row.size != data.size:
        raise ValueError("Row size has to be the same of data size")

    eps = 10e-6
    norm_x = (data - data.min()) / (data.max() - data.min() + eps) * 100
    step = norm_x.max() / bins
    bins_list = [i * step for i in range(bins)]

    sparse_matrix_builder = IncrementalSparseMatrix(n_rows=np.max(row) + 1, n_cols=bins + 1)
    for i, x in enumerate(norm_x):
        # Smear each value over neighbouring bins by sampling a Gaussian centered on it
        x_dist = np.random.normal(loc=x, scale=step, size=1000)
        label_x_dist = np.digitize(x_dist, bins_list, right=True)
        unique_label_x_dist, unique_counts = np.unique(label_x_dist, return_counts=True)
        unique_counts = unique_counts / np.max(unique_counts)  # Normalize unique counts

        size = unique_label_x_dist.size
        sparse_matrix_builder.add_data_lists([row[i]] * size, unique_label_x_dist, unique_counts)
    sparse_matrix = sparse_matrix_builder.get_SparseMatrix()
    return sparse_matrix
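# Illustrative call of transform_numerical_to_discretized_sparse_matrix (values are made up):
# each numerical value is smeared over neighbouring bins, so every output row is a soft
# one-hot encoding of the corresponding bin label.
#
#   row = np.array([0, 1, 2])
#   data = np.array([0.5, 10.0, 3.2])
#   soft_binned = transform_numerical_to_discretized_sparse_matrix(row, data, bins=20)
#   soft_binned.shape  # (3, 21): one row per input row index, one column per bin label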
def sample_negative_interactions_uniformly(negative_sample_size, URM, batch_size=10000):
    n_users = URM.shape[0]
    n_items = URM.shape[1]

    invalid_users = np.array(URM.tocoo().row, dtype=np.uint64)
    invalid_items = np.array(URM.tocoo().col, dtype=np.uint64)

    # Convert each (user, item) pair into a unique integer
    shifted_invalid_items = np.left_shift(invalid_items, np.uint64(np.log2(n_users) + 1))
    invalid_tuples = np.bitwise_or(invalid_users, shifted_invalid_items)

    negative_URM_builder = IncrementalSparseMatrix(n_rows=n_users, n_cols=n_items)

    with tqdm(desc="Sampling negative interactions", total=negative_sample_size) as p_bar:
        sampled = 0
        while sampled < negative_sample_size:
            # Sample a batch of users and items
            users = np.random.randint(low=0, high=n_users, size=batch_size, dtype=np.uint64)
            items = np.random.randint(low=0, high=n_items, size=batch_size, dtype=np.uint64)

            # Convert into unique integers
            shifted_items = np.left_shift(items, np.uint64(np.log2(n_users) + 1))
            tuples = np.bitwise_or(users, shifted_items)
            unique_tuples, indices = np.unique(tuples, return_index=True)

            # Remove (user, item) couples which are already among the chosen ones
            invalid_tuples_mask = np.in1d(unique_tuples, invalid_tuples, assume_unique=True)
            valid_indices = indices[~invalid_tuples_mask]
            valid_users = users[valid_indices]
            valid_items = items[valid_indices]

            # Cap the batch size if it is the last batch
            if sampled + len(valid_users) > negative_sample_size:
                remaining_sample_size = negative_sample_size - sampled
                valid_users = valid_users[:remaining_sample_size]
                valid_items = valid_items[:remaining_sample_size]

            # Update builder, sampled elements and progress bar
            negative_URM_builder.add_data_lists(valid_users, valid_items, np.ones(len(valid_users)))
            sampled += len(valid_users)
            p_bar.update(len(valid_users))

            # Update invalid users and items
            invalid_tuples = np.concatenate([invalid_tuples, tuples[valid_indices]])

    return negative_URM_builder.get_SparseMatrix().tocsr()
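# Illustrative usage sketch for sample_negative_interactions_uniformly (toy URM is an
# assumption): the returned matrix has exactly `negative_sample_size` ones placed on
# (user, item) pairs that do not appear in the input URM.
#
#   URM = sps.random(1000, 2000, density=0.01, format="csr")
#   negative_URM = sample_negative_interactions_uniformly(negative_sample_size=50000,
#                                                         URM=URM, batch_size=10000)
#   negative_URM.nnz  # 50000, none of which overlap with URM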
def format_URM_slice_uncompressed(users, items_per_users, max_user_id, n_cols):
    fm_matrix_builder = IncrementalSparseMatrix(n_cols=n_cols)

    row_list = np.repeat(np.arange(items_per_users.shape[0] * items_per_users.shape[1]), repeats=2)
    col_list = np.zeros(shape=items_per_users.shape[0] * items_per_users.shape[1] * 2)

    user_col_list = np.repeat(users, repeats=items_per_users.shape[1])
    items_col_list = np.array(items_per_users).flatten() + max_user_id

    col_list[np.arange(items_per_users.shape[0] * items_per_users.shape[1]) * 2] = user_col_list
    col_list[np.arange(items_per_users.shape[0] * items_per_users.shape[1]) * 2 + 1] = items_col_list

    fm_matrix_builder.add_data_lists(row_list_to_add=row_list, col_list_to_add=col_list,
                                     data_list_to_add=np.ones(len(row_list)))
    return fm_matrix_builder.get_SparseMatrix()
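# Illustrative call of format_URM_slice_uncompressed (toy sizes are assumptions): each output
# row encodes one (user, item) pair as two one-hot entries, with item columns shifted by
# max_user_id so user and item ids live in disjoint column ranges.
#
#   users = np.array([0, 1])
#   items_per_users = np.array([[10, 11], [12, 13]])
#   fm_slice = format_URM_slice_uncompressed(users, items_per_users,
#                                            max_user_id=100, n_cols=100 + 2000)
#   fm_slice.shape  # (4, 2100): one row per (user, item) pair, two nonzeros per row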
def test_IncrementalSparseMatrix_add_lists(self):
    n_rows = 100
    n_cols = 200

    randomMatrix = sps.random(n_rows, n_cols, density=0.01, format='coo')

    incrementalMatrix = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)
    incrementalMatrix.add_data_lists(randomMatrix.row.copy(),
                                     randomMatrix.col.copy(),
                                     randomMatrix.data.copy())

    randomMatrix_incremental = incrementalMatrix.get_SparseMatrix()

    assert sparse_are_equals(randomMatrix, randomMatrix_incremental)
def apply_discretization_UCM(UCM_dict, UCM_name_to_bins_mapper: dict):
    if ~np.all(np.in1d(list(UCM_name_to_bins_mapper.keys()), list(UCM_dict.keys()))):
        raise KeyError("Mapper contains wrong UCM names")

    for UCM_name, bins in UCM_name_to_bins_mapper.items():
        UCM_object: sps.csr_matrix = UCM_dict[UCM_name]
        if UCM_object.shape[1] != 1:
            raise KeyError("Given UCM name does not refer to a single-column feature, thus, it cannot be discretized")

        x = np.array(UCM_object.data)
        labelled_x = transform_numerical_to_label(x, bins)

        UCM_builder = IncrementalSparseMatrix(n_rows=UCM_object.shape[0])
        UCM_builder.add_data_lists(UCM_object.tocoo().row, labelled_x, np.ones(len(labelled_x), dtype=np.float32))

        UCM_dict[UCM_name] = UCM_builder.get_SparseMatrix()
    return UCM_dict
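# Illustrative usage sketch for apply_discretization_UCM (the "UCM_age" key and toy values
# are assumptions): a continuous one-column UCM is turned into a sparse one-hot encoding
# over the labels produced by transform_numerical_to_label.
#
#   ages = np.random.uniform(18, 80, size=100)
#   UCM_age = sps.coo_matrix((ages, (np.arange(100), np.zeros(100, dtype=int)))).tocsr()
#   UCM_dict = apply_discretization_UCM({"UCM_age": UCM_age},
#                                       UCM_name_to_bins_mapper={"UCM_age": 10})
#   UCM_dict["UCM_age"]  # now a (100, n_labels) one-hot matrix instead of a (100, 1) column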
def load_CSV_into_SparseBuilder(filePath, header=False, separator="::"):
    matrixBuilder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)

    fileHandle = open(filePath, "r")
    numCells = 0

    if header:
        fileHandle.readline()

    for line in fileHandle:
        numCells += 1
        if (numCells % 1000000 == 0):
            print("Processed {} cells".format(numCells))

        if (len(line)) > 1:
            line = line.split(separator)
            line[-1] = line[-1].replace("\n", "")

            try:
                user_id = line[0]
                item_id = line[1]

                try:
                    value = float(line[2])
                    if value != 0.0:
                        matrixBuilder.add_data_lists([user_id], [item_id], [value])
                except ValueError:
                    print("load_CSV_into_SparseBuilder: Cannot parse as float value '{}'".format(line[2]))

            except IndexError:
                print("load_CSV_into_SparseBuilder: Index out of bound in line '{}'".format(line))

    fileHandle.close()

    return matrixBuilder.get_SparseMatrix(), matrixBuilder.get_column_token_to_id_mapper(), matrixBuilder.get_row_token_to_id_mapper()
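# Illustrative usage sketch for load_CSV_into_SparseBuilder (the file path and the
# "user::item::rating" layout are assumptions matching the default separator):
#
#   URM_all, item_original_ID_to_index, user_original_ID_to_index = \
#       load_CSV_into_SparseBuilder("ratings.dat", header=False, separator="::")
#
# The two returned mappers translate the original string ids found in the file into the
# consecutive integer column (item) and row (user) indices used by the sparse matrix.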
def fit(self, agg_strategy="FIRST", filter_sample_method="NONE", topK=5, alpha_multiplier=0, constraint_multiplier=1,
        chain_multiplier=1, filter_items_method="NONE", filter_items_n=100, num_reads=100,
        **filter_items_parameters):
    """
    It fits the data (i.e. URM_train) by solving an optimization problem for each item. Each optimization problem
    is generated from the URM_train without the target column and the target column itself, by means of a
    transformation to a QUBO based on "transform_fn" with some regulators; then it is solved by the solver given
    at the initialization of the class. Finally, by using the samples collected from the solver, it builds the
    item-similarity matrix.

    :param agg_strategy: the post-processing aggregation to be used on the samples
    :param filter_sample_method: the filter technique used before the post-processing aggregation
    :param topK: a regulator number that indicates the number of selected variables forced during the optimization
    :param alpha_multiplier: a multiplier number applied on the constraint of the sparsity regulator term
    :param constraint_multiplier: a multiplier number applied on the constraint strength of the variable selection
                                  regulator
    :param chain_multiplier: a multiplier number applied on the chain strength of the embedding
    :param filter_items_method: name of the filtering method to select a set of items for the resolution of the
                                optimization problem
    :param filter_items_n: number of items to be selected by the filtering method
    :param num_reads: number of samples to compute from the solver
    :param filter_items_parameters: other parameters regarding the filter items method
    """
    self._check_fit_parameters(agg_strategy, filter_items_method, filter_sample_method)
    if filter_items_method == "COSINE":
        self.FILTER_ITEMS_METHODS["COSINE"] = ItemSelectorByCosineSimilarity(**filter_items_parameters)

    URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = URM_train.shape[1]
    item_pop = np.array((URM_train > 0).sum(axis=0)).flatten()

    # Need a labeling of variables to order the variables from 0 to n_items, with leading zeros based on
    # the highest number of digits
    leading_zeros = len(str(n_items - 1))
    variables = ["a{:0{}d}".format(i, leading_zeros) for i in range(n_items)]

    if self.to_resume:
        start_item = self.df_responses[self.ITEM_ID_COLUMN_NAME].max()
    else:
        self.df_responses = pd.DataFrame()
        start_item = 0

    self.FILTER_ITEMS_METHODS[filter_items_method].precompute_best_item_indices(URM_train)
    matrix_builder = IncrementalSparseMatrix(n_rows=n_items, n_cols=n_items)

    for curr_item in tqdm(range(start_item, n_items), desc="%s: Computing W_sparse matrix" % self.RECOMMENDER_NAME):
        # get the target column
        target_column = URM_train[:, curr_item].toarray()

        # set the "curr_item"-th column of URM_train to zero
        start_pos = URM_train.indptr[curr_item]
        end_pos = URM_train.indptr[curr_item + 1]
        current_item_data_backup = URM_train.data[start_pos: end_pos].copy()
        URM_train.data[start_pos: end_pos] = 0.0

        # select items to be used in the QUBO optimization problem
        URM = URM_train.copy()
        URM, mapping_array = self.FILTER_ITEMS_METHODS[filter_items_method].filter_items(URM, target_column,
                                                                                         curr_item, filter_items_n)
        n_variables = len(mapping_array)

        # get BQM/QUBO problem for the current item
        qubo = self.LOSSES[self.obj_function].get_qubo_problem(URM, target_column)
        qubo = qubo + (np.log1p(item_pop[curr_item]) ** 2 + 1) * alpha_multiplier * (np.max(qubo) - np.min(qubo)) \
               * np.identity(n_variables)
        if topK > -1:
            constraint_strength = max(self.MIN_CONSTRAINT_STRENGTH,
                                      constraint_multiplier * (np.max(qubo) - np.min(qubo)))
            # avoid using the "combinations" function of dimod in order to speed up the computation
            qubo += -2 * constraint_strength * topK * np.identity(n_variables) + constraint_strength * np.ones(
                (n_variables, n_variables))

        # Generation of the BQM from the qubo matrix in a quicker way, checked with some performance measuring.
        # On a test of 2000 n_items, this method is quicker w.r.t. the from_numpy_matrix function of dimod
        bqm = dimod.BinaryQuadraticModel.empty(dimod.BINARY)
        bqm.add_variables_from(dict(zip(variables, np.diag(qubo))))
        for i in range(n_variables):
            values = np.array(qubo[i, i + 1:]).flatten() + np.array(qubo[i + 1:, i]).flatten()
            keys = [(variables[i], variables[j]) for j in range(i + 1, n_variables)]
            bqm.add_interactions_from(dict(zip(keys, values)))

        self._print("The BQM for item {} is {}".format(curr_item, bqm))

        # solve the problem with the solver
        try:
            if ("child_properties" in self.solver.properties
                and self.solver.properties["child_properties"]["category"] == "qpu") \
                    or "qpu_properties" in self.solver.properties:
                chain_strength = max(self.MIN_CONSTRAINT_STRENGTH,
                                     chain_multiplier * (np.max(qubo) - np.min(qubo)))
                response = self.solver.sample(bqm, chain_strength=chain_strength, num_reads=num_reads)
                self._print("Break chain percentage of item {} is {}"
                            .format(curr_item, list(response.data(fields=["chain_break_fraction"]))))
                self._print("Timing of QPU is %s" % response.info["timing"])
            else:
                response = self.solver.sample(bqm, num_reads=num_reads)

            self._print("The response for item {} is {}".format(curr_item, response.aggregate()))
        except OSError as err:
            traceback.print_exc()
            raise err

        # save response in self.df_responses if self.do_save_responses is True; otherwise apply post-processing
        # and put the results in the matrix builder
        response_df = response.to_pandas_dataframe()
        response_df[self.ITEM_ID_COLUMN_NAME] = curr_item
        if self.do_save_responses:
            self.df_responses = pd.concat([self.df_responses, response_df], ignore_index=True)
            self.mapping_matrix.append(mapping_array)
        else:
            self.df_responses = self.df_responses.reindex(sorted(self.df_responses.columns), axis=1)
            self.add_sample_responses_to_matrix_builder(matrix_builder, agg_strategy, filter_sample_method,
                                                        response_df, curr_item, mapping_array)

        # restore URM_train
        URM_train.data[start_pos:end_pos] = current_item_data_backup

    if self.do_save_responses:
        self.df_responses = self.df_responses.reindex(sorted(self.df_responses.columns), axis=1)
        self.W_sparse = self.build_similarity_matrix(self.df_responses, agg_strategy, filter_sample_method,
                                                     self.mapping_matrix)
    else:
        self.W_sparse = matrix_builder.get_SparseMatrix()
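# Hedged usage sketch for fit. The class name "QUBOSLIMRecommender", its constructor
# signature and the neal.SimulatedAnnealingSampler solver are illustrative assumptions;
# any dimod-compatible sampler exposing .sample(bqm, num_reads=...) and .properties
# should satisfy the calls made above.
#
#   import neal
#   recommender = QUBOSLIMRecommender(URM_train, solver=neal.SimulatedAnnealingSampler())
#   recommender.fit(agg_strategy="FIRST", filter_sample_method="NONE", topK=5,
#                   constraint_multiplier=1, num_reads=100)
#   W_sparse = recommender.W_sparse  # item-item similarity built from the solver samples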
def split_train_in_two_percentage_user_wise(URM_train, train_percentage=0.1, verbose=False):
    """
    The function splits an URM in two matrices selecting the number of interactions one user at a time

    :param URM_train:
    :param train_percentage:
    :param verbose:
    :return:
    """
    assert train_percentage >= 0.0 and train_percentage <= 1.0, \
        "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)

    from course_lib.Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    # Ensure to use a csr matrix, otherwise we get big problems
    URM_train = URM_train.tocsr()

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items,
                                                auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items,
                                                     auto_create_col_mapper=False, auto_create_row_mapper=False)

    user_no_item_train = 0
    user_no_item_validation = 0

    for user_id in range(URM_train.shape[0]):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = round(user_profile_length * train_percentage)

        if n_train_items == len(user_profile_items) and n_train_items > 1:
            n_train_items -= 1

        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose:
                print("User {} has 0 train items".format(user_id))
            user_no_item_train += 1

        if len(validation_items) == 0:
            if verbose:
                print("User {} has 0 validation items".format(user_id))
            user_no_item_validation += 1

        URM_train_builder.add_data_lists([user_id] * len(train_items), train_items, train_ratings)
        URM_validation_builder.add_data_lists([user_id] * len(validation_items), validation_items,
                                              validation_ratings)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(
            user_no_item_train, user_no_item_train / num_users * 100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(
            user_no_item_validation, user_no_item_validation / num_users * 100, num_users))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
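# Illustrative usage sketch for split_train_in_two_percentage_user_wise (toy URM is an
# assumption): roughly train_percentage of each user's interactions go to the first matrix,
# the rest to the second, and every interaction is assigned exactly once.
#
#   URM_all = sps.random(1000, 2000, density=0.01, format="csr")
#   URM_train, URM_validation = split_train_in_two_percentage_user_wise(URM_all,
#                                                                       train_percentage=0.8)
#   URM_train.nnz + URM_validation.nnz == URM_all.nnz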
def split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.1):
    """
    The function splits an URM in two matrices selecting the number of interactions globally

    :param URM_all:
    :param train_percentage:
    :return:
    """
    assert train_percentage >= 0.0 and train_percentage <= 1.0, \
        "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)

    from course_lib.Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_all.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items,
                                                auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items,
                                                     auto_create_col_mapper=False, auto_create_row_mapper=False)

    URM_train = sps.coo_matrix(URM_all)

    indices_for_sampling = np.arange(0, URM_all.nnz, dtype=int)
    np.random.shuffle(indices_for_sampling)

    n_train_interactions = round(URM_all.nnz * train_percentage)

    indices_for_train = indices_for_sampling[0:n_train_interactions]
    indices_for_validation = indices_for_sampling[n_train_interactions:]

    URM_train_builder.add_data_lists(URM_train.row[indices_for_train],
                                     URM_train.col[indices_for_train],
                                     URM_train.data[indices_for_train])

    URM_validation_builder.add_data_lists(URM_train.row[indices_for_validation],
                                          URM_train.col[indices_for_validation],
                                          URM_train.data[indices_for_validation])

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    URM_train = sps.csr_matrix(URM_train)
    URM_validation = sps.csr_matrix(URM_validation)

    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)
    user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(
            user_no_item_train, user_no_item_train / num_users * 100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(
            user_no_item_validation, user_no_item_validation / num_users * 100, num_users))

    return URM_train, URM_validation
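# Illustrative usage sketch for split_train_in_two_percentage_global_sample (toy URM is an
# assumption): interactions are split globally, so individual users may lose all of their
# interactions; the function prints a warning when that happens.
#
#   URM_all = sps.random(1000, 2000, density=0.01, format="csr")
#   URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all,
#                                                                           train_percentage=0.8)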
def split_train_leave_k_out_user_wise(URM, k_out=1, use_validation_set=True, leave_random_out=True):
    """
    The function splits an URM in two matrices selecting the k_out interactions one user at a time

    :param URM:
    :param k_out:
    :param use_validation_set:
    :param leave_random_out:
    :return:
    """
    assert k_out > 0, "k_out must be a value greater than 0, provided was '{}'".format(k_out)

    URM = sps.csr_matrix(URM)
    n_users, n_items = URM.shape

    URM_train_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                auto_create_col_mapper=False, n_cols=n_items)
    URM_test_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                               auto_create_col_mapper=False, n_cols=n_items)

    if use_validation_set:
        URM_validation_builder = IncrementalSparseMatrix(auto_create_row_mapper=False, n_rows=n_users,
                                                         auto_create_col_mapper=False, n_cols=n_items)

    for user_id in range(n_users):

        start_user_position = URM.indptr[user_id]
        end_user_position = URM.indptr[user_id + 1]

        user_profile = URM.indices[start_user_position:end_user_position]

        if leave_random_out:
            indices_to_shuffle = np.arange(len(user_profile), dtype=int)
            np.random.shuffle(indices_to_shuffle)

            user_interaction_items = user_profile[indices_to_shuffle]
            user_interaction_data = URM.data[start_user_position:end_user_position][indices_to_shuffle]
        else:
            # The first interactions will be sampled, so the last interaction must be placed first
            interaction_position = URM.data[start_user_position:end_user_position]

            sort_interaction_index = np.argsort(-interaction_position)

            user_interaction_items = user_profile[sort_interaction_index]
            user_interaction_data = URM.data[start_user_position:end_user_position][sort_interaction_index]

        # Test interactions
        user_interaction_items_test = user_interaction_items[0:k_out]
        user_interaction_data_test = user_interaction_data[0:k_out]

        URM_test_builder.add_data_lists([user_id] * len(user_interaction_items_test),
                                        user_interaction_items_test,
                                        user_interaction_data_test)

        # Validation interactions
        if use_validation_set:
            user_interaction_items_validation = user_interaction_items[k_out:k_out * 2]
            user_interaction_data_validation = user_interaction_data[k_out:k_out * 2]

            URM_validation_builder.add_data_lists([user_id] * len(user_interaction_items_validation),
                                                  user_interaction_items_validation,
                                                  user_interaction_data_validation)

        # Train interactions
        train_limit = k_out * 2 if use_validation_set else k_out
        user_interaction_items_train = user_interaction_items[train_limit:]
        user_interaction_data_train = user_interaction_data[train_limit:]

        URM_train_builder.add_data_lists([user_id] * len(user_interaction_items_train),
                                         user_interaction_items_train,
                                         user_interaction_data_train)

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_test = URM_test_builder.get_SparseMatrix()

    URM_train = sps.csr_matrix(URM_train)
    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no Train items".format(
            user_no_item_train, user_no_item_train / n_users * 100, n_users))

    if use_validation_set:
        URM_validation = URM_validation_builder.get_SparseMatrix()

        URM_validation = sps.csr_matrix(URM_validation)
        user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

        if user_no_item_validation != 0:
            print("Warning: {} ({:.2f} %) of {} users have no Validation items".format(
                user_no_item_validation, user_no_item_validation / n_users * 100, n_users))

        return URM_train, URM_validation, URM_test

    return URM_train, URM_test
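# Illustrative usage sketch for split_train_leave_k_out_user_wise (toy URM is an assumption):
# with k_out=1 and a validation set, each user contributes one test interaction, one
# validation interaction, and keeps the rest for training.
#
#   URM_all = sps.random(1000, 2000, density=0.02, format="csr")
#   URM_train, URM_validation, URM_test = split_train_leave_k_out_user_wise(
#       URM_all, k_out=1, use_validation_set=True, leave_random_out=True)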