def split(self, dataset):
    """Randomly assign every interaction of each user to train/test
    (and validation when ``self.with_validation``) according to
    ``self.train_perc`` / ``self.validation_perc`` / ``self.test_perc``.

    Interactions at or below ``self.test_rating_threshold`` are dropped
    from test/validation (train keeps them). When ``allow_cold_users``
    is False, users with no train interactions are removed everywhere.

    Returns (train, test) or (train, test, validation) Dataset objects.
    """
    super(Holdout, self).split(dataset)

    URM = sps.csr_matrix(dataset.get_URM())
    # Sort indices so the positional random assignments drawn below line
    # up with the row slices of every URM processed later (each of which
    # is sorted the same way). Same precaution as LeaveKOut.
    URM.sort_indices()
    n_users, n_items = dataset.n_users, dataset.n_items

    user_indices = []
    URM_train, URM_test, URM_validation = {}, {}, {}

    # Select a priori how to randomly split every user's interactions
    users_to_remove = []
    for user_id in range(n_users):
        assignment = np.random.choice(
            3,
            URM.indptr[user_id + 1] - URM.indptr[user_id],
            replace=True,
            p=[self.train_perc, self.validation_perc, self.test_perc])

        # Boolean masks over the user's row: 0 = train, 1 = validation, 2 = test
        assignments = [assignment == i for i in range(3)]

        if not self.allow_cold_users and assignments[0].sum() <= 0:
            # No interactions in train
            users_to_remove.append(user_id)

        user_indices.append(assignments)

    # Set lookup instead of walking a sorted list with a cursor
    users_to_remove_set = set(users_to_remove)

    for URM_name in dataset.get_URM_names():
        URM = sps.csr_matrix(dataset.get_URM(URM_name))
        # Keep the per-row interaction order aligned with the URM used to
        # draw the assignments above.
        URM.sort_indices()

        URM_train_builder = IncrementalSparseMatrix(
            auto_create_row_mapper=False,
            n_rows=n_users,
            auto_create_col_mapper=False,
            n_cols=n_items)
        URM_test_builder = IncrementalSparseMatrix(
            auto_create_row_mapper=False,
            n_rows=n_users,
            auto_create_col_mapper=False,
            n_cols=n_items)
        if self.with_validation:
            URM_validation_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

        for user_id in range(n_users):
            if user_id in users_to_remove_set:
                continue

            indices = user_indices[user_id]
            start_user_position = URM.indptr[user_id]
            end_user_position = URM.indptr[user_id + 1]

            user_interaction_items = URM.indices[
                start_user_position:end_user_position]
            user_interaction_data = URM.data[
                start_user_position:end_user_position]

            # Test interactions
            user_interaction_items_test = user_interaction_items[indices[2]]
            user_interaction_data_test = user_interaction_data[indices[2]]
            # Remove from test the interactions at or below the rating threshold
            mask = user_interaction_data_test > self.test_rating_threshold
            user_interaction_items_test = user_interaction_items_test[mask]
            user_interaction_data_test = user_interaction_data_test[mask]

            URM_test_builder.add_data_lists(
                [user_id] * len(user_interaction_data_test),
                user_interaction_items_test, user_interaction_data_test)

            # Validation interactions
            if self.with_validation:
                user_interaction_items_validation = user_interaction_items[
                    indices[1]]
                user_interaction_data_validation = user_interaction_data[
                    indices[1]]
                # Remove from validation interactions below a given threshold
                mask = user_interaction_data_validation > self.test_rating_threshold
                user_interaction_items_validation = \
                    user_interaction_items_validation[mask]
                user_interaction_data_validation = \
                    user_interaction_data_validation[mask]

                URM_validation_builder.add_data_lists(
                    [user_id] * len(user_interaction_data_validation),
                    user_interaction_items_validation,
                    user_interaction_data_validation)

            # Train interactions (kept regardless of the rating threshold)
            user_interaction_items_train = user_interaction_items[indices[0]]
            user_interaction_data_train = user_interaction_data[indices[0]]

            URM_train_builder.add_data_lists(
                [user_id] * len(user_interaction_items_train),
                user_interaction_items_train, user_interaction_data_train)

        URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
        URM_test[URM_name] = URM_test_builder.get_SparseMatrix()
        if self.with_validation:
            URM_validation[URM_name] = \
                URM_validation_builder.get_SparseMatrix()

    train = Dataset(dataset.get_name(),
                    base_folder=dataset.get_base_folder(),
                    postprocessings=dataset.get_postprocessings(),
                    URM_dict=URM_train,
                    URM_mappers_dict=dataset.get_URM_mappers_dict(),
                    ICM_dict=dataset.get_ICM_dict(),
                    ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                    UCM_dict=dataset.get_UCM_dict(),
                    UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    train.remove_users(users_to_remove)

    test = Dataset(dataset.get_name(),
                   base_folder=dataset.get_base_folder(),
                   postprocessings=dataset.get_postprocessings(),
                   URM_dict=URM_test,
                   URM_mappers_dict=dataset.get_URM_mappers_dict(),
                   ICM_dict=dataset.get_ICM_dict(),
                   ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                   UCM_dict=dataset.get_UCM_dict(),
                   UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    test.remove_users(users_to_remove)

    if self.with_validation:
        validation = Dataset(
            dataset.get_name(),
            base_folder=dataset.get_base_folder(),
            postprocessings=dataset.get_postprocessings(),
            URM_dict=URM_validation,
            URM_mappers_dict=dataset.get_URM_mappers_dict(),
            ICM_dict=dataset.get_ICM_dict(),
            ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
            UCM_dict=dataset.get_UCM_dict(),
            UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        validation.remove_users(users_to_remove)
        return train, test, validation
    else:
        return train, test
def split(self, dataset):
    """Item-wise holdout: every *item* is randomly assigned to train/
    validation/test according to the configured percentages, so test and
    validation contain items never seen in train (cold items).

    Interactions at or below ``self.test_rating_threshold`` are dropped
    from test/validation. When ``allow_cold_users`` is False, users left
    with no train interactions are removed from all resulting datasets.

    Returns (train, test) or (train, test, validation) Dataset objects.
    """
    super(ColdItemsHoldout, self).split(dataset)

    n_users, n_items = dataset.n_users, dataset.n_items
    URM_train, URM_test, URM_validation = {}, {}, {}

    # Assign every item to one of the three splits a priori
    items_split = np.random.choice(
        3,
        n_items,
        replace=True,
        p=[self.train_perc, self.validation_perc, self.test_perc])
    train_items = np.arange(n_items)[items_split == 0]
    validation_items = np.arange(n_items)[items_split == 1]
    test_items = np.arange(n_items)[items_split == 2]

    # Use a set: with several URMs the same user would otherwise be
    # appended once per URM, producing duplicate ids for remove_users()
    users_to_remove = set()

    for URM_name in dataset.get_URM_names():
        URM = sps.csr_matrix(dataset.get_URM(URM_name))

        URM_train_builder = IncrementalSparseMatrix(
            auto_create_row_mapper=False,
            n_rows=n_users,
            auto_create_col_mapper=False,
            n_cols=n_items)
        URM_test_builder = IncrementalSparseMatrix(
            auto_create_row_mapper=False,
            n_rows=n_users,
            auto_create_col_mapper=False,
            n_cols=n_items)
        if self.with_validation:
            URM_validation_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

        for user_id in range(n_users):
            start_user_position = URM.indptr[user_id]
            end_user_position = URM.indptr[user_id + 1]

            user_interaction_items = URM.indices[
                start_user_position:end_user_position]
            user_interaction_data = URM.data[
                start_user_position:end_user_position]

            # Test interactions: those whose item belongs to the test split
            indices = np.in1d(user_interaction_items,
                              test_items,
                              assume_unique=True)
            user_interaction_items_test = user_interaction_items[indices]
            user_interaction_data_test = user_interaction_data[indices]
            # Remove from test interactions below a given threshold
            mask = user_interaction_data_test > self.test_rating_threshold
            user_interaction_items_test = user_interaction_items_test[mask]
            user_interaction_data_test = user_interaction_data_test[mask]

            URM_test_builder.add_data_lists(
                [user_id] * len(user_interaction_data_test),
                user_interaction_items_test, user_interaction_data_test)

            # Validation interactions
            if self.with_validation:
                indices = np.in1d(user_interaction_items,
                                  validation_items,
                                  assume_unique=True)
                user_interaction_items_validation = user_interaction_items[
                    indices]
                user_interaction_data_validation = user_interaction_data[
                    indices]
                # Remove from validation interactions below a given threshold
                mask = user_interaction_data_validation > self.test_rating_threshold
                user_interaction_items_validation = \
                    user_interaction_items_validation[mask]
                user_interaction_data_validation = \
                    user_interaction_data_validation[mask]

                URM_validation_builder.add_data_lists(
                    [user_id] * len(user_interaction_data_validation),
                    user_interaction_items_validation,
                    user_interaction_data_validation)

            # Train interactions (no rating threshold applied)
            indices = np.in1d(user_interaction_items,
                              train_items,
                              assume_unique=True)
            user_interaction_items_train = user_interaction_items[indices]
            user_interaction_data_train = user_interaction_data[indices]

            URM_train_builder.add_data_lists(
                [user_id] * len(user_interaction_items_train),
                user_interaction_items_train, user_interaction_data_train)

            if not self.allow_cold_users and len(
                    user_interaction_items_train) <= 0:
                users_to_remove.add(user_id)

        URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
        URM_test[URM_name] = URM_test_builder.get_SparseMatrix()
        if self.with_validation:
            URM_validation[URM_name] = \
                URM_validation_builder.get_SparseMatrix()

    users_to_remove = sorted(users_to_remove)

    train = Dataset(dataset.get_name(),
                    base_folder=dataset.get_base_folder(),
                    postprocessings=dataset.get_postprocessings(),
                    URM_dict=URM_train,
                    URM_mappers_dict=dataset.get_URM_mappers_dict(),
                    ICM_dict=dataset.get_ICM_dict(),
                    ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                    UCM_dict=dataset.get_UCM_dict(),
                    UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    train.remove_users(users_to_remove)

    test = Dataset(dataset.get_name(),
                   base_folder=dataset.get_base_folder(),
                   postprocessings=dataset.get_postprocessings(),
                   URM_dict=URM_test,
                   URM_mappers_dict=dataset.get_URM_mappers_dict(),
                   ICM_dict=dataset.get_ICM_dict(),
                   ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                   UCM_dict=dataset.get_UCM_dict(),
                   UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    test.remove_users(users_to_remove)

    if self.with_validation:
        validation = Dataset(
            dataset.get_name(),
            base_folder=dataset.get_base_folder(),
            postprocessings=dataset.get_postprocessings(),
            URM_dict=URM_validation,
            URM_mappers_dict=dataset.get_URM_mappers_dict(),
            ICM_dict=dataset.get_ICM_dict(),
            ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
            UCM_dict=dataset.get_UCM_dict(),
            UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        validation.remove_users(users_to_remove)
        return train, test, validation
    else:
        return train, test
def split(self, dataset):
    """K-fold split over interactions (warm items): every nnz entry is
    randomly assigned to one of ``self.n_folds`` folds; fold i is the
    test set and the union of the other folds is the train set.

    Test interactions at or below ``self.test_rating_threshold`` are
    removed. Returns a list of ``n_folds`` (train, test) Dataset pairs.
    """
    super(WarmItemsKFold, self).split(dataset)

    # Optionally k-fold only a random slice of the initial URM
    if self.percentage_initial_data_to_split < 1.0:
        h = Holdout(train_perc=self.percentage_initial_data_to_split,
                    test_perc=1 - self.percentage_initial_data_to_split)
        dataset = h.split(dataset)[0]

    folds = []
    URM = dataset.get_URM().tocoo()
    # Fold membership of every nnz entry, drawn once and reused for all URMs
    split_belonging = np.random.choice(self.n_folds, URM.data.size,
                                       replace=True)

    for i in range(self.n_folds):
        urm = {}
        urm_mappers = {}
        mask = split_belonging == i
        for URM_name in dataset.get_URM_names():
            URM = dataset.get_URM(URM_name).tocoo()
            # Sort nnz values by (row, col) so the same positions of
            # split_belonging refer to the same interaction in every URM.
            # np.lexsort: last key is primary -> sort by row, then col.
            order = np.lexsort((URM.col, URM.row))
            row = URM.row[order]
            col = URM.col[order]
            data = URM.data[order]
            urm[URM_name] = sps.csr_matrix(
                (data[mask], (row[mask], col[mask])), shape=URM.shape)
            urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)
        folds.append(
            Dataset(dataset.get_name(),
                    base_folder=dataset.get_base_folder(),
                    postprocessings=dataset.get_postprocessings(),
                    URM_dict=urm,
                    URM_mappers_dict=urm_mappers,
                    ICM_dict=dataset.get_ICM_dict(),
                    ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                    UCM_dict=dataset.get_UCM_dict(),
                    UCM_mappers_dict=dataset.get_UCM_mappers_dict()))

    r = []
    for i in range(self.n_folds):
        urm = {}
        urm_mappers = {}
        for URM_name in folds[i].get_URM_names():
            # Keep i-th fold as test and merge the others as train
            urm[URM_name] = folds[(i + 1) % self.n_folds].get_URM(URM_name)
            urm_mappers[URM_name] = folds[
                (i + 1) % self.n_folds].get_URM_mapper(URM_name)
            for j in range(2, self.n_folds):
                urm[URM_name] += folds[(i + j) % self.n_folds].get_URM(
                    URM_name)
        train = Dataset(folds[i].get_name(),
                        base_folder=folds[i].get_base_folder(),
                        postprocessings=folds[i].get_postprocessings(),
                        URM_dict=urm,
                        URM_mappers_dict=urm_mappers,
                        ICM_dict=folds[i].get_ICM_dict(),
                        ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                        UCM_dict=folds[i].get_UCM_dict(),
                        UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

        urm = {}
        # Work on copies: thresholding must not mutate the fold matrices
        # in place, since they are reused as train data of the other folds
        test_urm = folds[i].get_URM().copy()
        test_urm.sort_indices()
        mask = test_urm.data <= self.test_rating_threshold
        for URM_name in folds[i].get_URM_names():
            urm[URM_name] = folds[i].get_URM(URM_name).copy()
            urm[URM_name].sort_indices()
            urm[URM_name].data[mask] = 0.0
            urm[URM_name].eliminate_zeros()
        test = Dataset(folds[i].get_name(),
                       base_folder=folds[i].get_base_folder(),
                       postprocessings=folds[i].get_postprocessings(),
                       URM_dict=urm,
                       URM_mappers_dict=folds[i].get_URM_mappers_dict(),
                       ICM_dict=folds[i].get_ICM_dict(),
                       ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                       UCM_dict=folds[i].get_UCM_dict(),
                       UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

        if not self.allow_cold_users:
            # Drop users with an empty train profile from both datasets
            users_to_remove = np.arange(train.n_users)[
                np.ediff1d(train.get_URM().tocsr().indptr) <= 0]
            train.remove_users(users_to_remove)
            test.remove_users(users_to_remove)

        r.append((train, test))
    return r
def split(self, dataset):
    """K-fold split over *items* (cold items): every item is randomly
    assigned to one of ``self.n_folds`` folds; fold i keeps only the
    interactions of its own items and acts as test, while the union of
    the other folds is the train set.

    Test interactions at or below ``self.test_rating_threshold`` are
    removed. Returns a list of ``n_folds`` (train, test) Dataset pairs.
    """
    super(ColdItemsKFold, self).split(dataset)

    folds = []
    # Fold membership of every item, drawn once and reused for all URMs
    split_belonging = np.random.choice(self.n_folds, dataset.n_items,
                                       replace=True)

    for i in range(self.n_folds):
        urm = {}
        urm_mappers = {}
        mask = split_belonging != i
        for URM_name in dataset.get_URM_names():
            # copy=True: the column-zeroing below must not touch the
            # original dataset matrices
            URM = dataset.get_URM(URM_name).tocsc(copy=True)
            # Zero out all columns (items) not belonging to fold i
            for j in np.arange(URM.shape[1])[mask].tolist():
                URM.data[URM.indptr[j]:URM.indptr[j + 1]] = 0.0
            URM.eliminate_zeros()
            urm[URM_name] = URM.tocsr()
            urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)
        folds.append(
            Dataset(dataset.get_name(),
                    base_folder=dataset.get_base_folder(),
                    postprocessings=dataset.get_postprocessings(),
                    URM_dict=urm,
                    URM_mappers_dict=urm_mappers,
                    ICM_dict=dataset.get_ICM_dict(),
                    ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                    UCM_dict=dataset.get_UCM_dict(),
                    UCM_mappers_dict=dataset.get_UCM_mappers_dict()))

    r = []
    for i in range(self.n_folds):
        urm = {}
        urm_mappers = {}
        for URM_name in folds[i].get_URM_names():
            # Keep i-th fold as test and merge the others as train
            urm[URM_name] = folds[(i + 1) % self.n_folds].get_URM(URM_name)
            urm_mappers[URM_name] = folds[
                (i + 1) % self.n_folds].get_URM_mapper(URM_name)
            for j in range(2, self.n_folds):
                urm[URM_name] += folds[(i + j) % self.n_folds].get_URM(
                    URM_name)
        train = Dataset(folds[i].get_name(),
                        base_folder=folds[i].get_base_folder(),
                        postprocessings=folds[i].get_postprocessings(),
                        URM_dict=urm,
                        URM_mappers_dict=urm_mappers,
                        ICM_dict=folds[i].get_ICM_dict(),
                        ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                        UCM_dict=folds[i].get_UCM_dict(),
                        UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

        urm = {}
        # Work on copies: thresholding must not mutate the fold matrices
        # in place, since they are reused as train data of the other folds
        test_urm = folds[i].get_URM().copy()
        test_urm.sort_indices()
        mask = test_urm.data <= self.test_rating_threshold
        for URM_name in folds[i].get_URM_names():
            urm[URM_name] = folds[i].get_URM(URM_name).copy()
            urm[URM_name].sort_indices()
            urm[URM_name].data[mask] = 0.0
            urm[URM_name].eliminate_zeros()
        test = Dataset(folds[i].get_name(),
                       base_folder=folds[i].get_base_folder(),
                       postprocessings=folds[i].get_postprocessings(),
                       URM_dict=urm,
                       URM_mappers_dict=folds[i].get_URM_mappers_dict(),
                       ICM_dict=folds[i].get_ICM_dict(),
                       ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                       UCM_dict=folds[i].get_UCM_dict(),
                       UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

        if not self.allow_cold_users:
            # Drop users with an empty train profile from both datasets
            users_to_remove = np.arange(train.n_users)[
                np.ediff1d(train.get_URM().tocsr().indptr) <= 0]
            train.remove_users(users_to_remove)
            test.remove_users(users_to_remove)

        r.append((train, test))
    return r
def split(self, dataset):
    """Leave-k-out split: for every user, ``self.k_value`` interactions
    above ``self.test_rating_threshold`` are held out for test (and k
    more for validation when ``self.with_validation``); everything else
    is train.

    When ``allow_cold_users`` is False, users without enough
    above-threshold interactions to fill every held-out split plus at
    least one train interaction are removed from all resulting datasets.

    Returns (train, test) or (train, test, validation) Dataset objects.
    """
    super(LeaveKOut, self).split(dataset)

    URM = sps.csr_matrix(dataset.get_URM())
    URM.sort_indices()

    split_number = 2
    if self.with_validation:
        split_number += 1

    # Each held-out split (test and, optionally, validation) needs
    # k_value interactions above the threshold, plus at least 1 left
    # for train. NOTE(review): the previous formula
    # split_number * (k_value - 1) + 1 did not match this intent.
    min_user_interactions = (split_number - 1) * self.k_value + 1

    users_to_preserve = np.arange(URM.shape[0])
    if not self.allow_cold_users:
        # Count only interactions above the rating threshold
        urm_threshold = URM.copy()
        urm_threshold.data[
            urm_threshold.data <= self.test_rating_threshold] = 0
        urm_threshold.eliminate_zeros()

        user_interactions = np.ediff1d(urm_threshold.tocsr().indptr)
        users_to_preserve = users_to_preserve[
            user_interactions >= min_user_interactions]
        print(
            "DataSplitterLeaveKOut: Removing {} of {} users because they have less than the {} interactions required for {} splits"
            .format(URM.shape[0] - len(users_to_preserve), URM.shape[0],
                    min_user_interactions, split_number))

    users_to_remove = np.setdiff1d(np.arange(URM.shape[0]),
                                   users_to_preserve)

    n_users, n_items = URM.shape

    user_indices = []
    URM_train, URM_test, URM_validation = {}, {}, {}

    # Select a priori which positions of each user's row are held out
    for user_id in users_to_preserve.tolist():
        # Positions of the user's above-threshold interactions, shuffled
        user_profile = URM.data[URM.indptr[user_id]:URM.indptr[
            user_id + 1]] > self.test_rating_threshold
        test_and_val = np.random.permutation(
            np.arange(URM.indptr[user_id + 1] -
                      URM.indptr[user_id])[user_profile])

        limit = self.k_value
        if self.with_validation:
            limit = self.k_value * 2

        # (train, test, validation) positional indices inside the row
        user_indices.append((np.setdiff1d(np.arange(len(user_profile)),
                                          test_and_val[:limit]),
                             test_and_val[:self.k_value],
                             test_and_val[self.k_value:limit]))

    for URM_name in dataset.get_URM_names():
        URM = dataset.get_URM(URM_name).tocsr()
        # Align row ordering with the URM used to draw user_indices
        URM.sort_indices()

        URM_train_builder = IncrementalSparseMatrix(
            auto_create_row_mapper=False,
            n_rows=n_users,
            auto_create_col_mapper=False,
            n_cols=n_items)
        URM_test_builder = IncrementalSparseMatrix(
            auto_create_row_mapper=False,
            n_rows=n_users,
            auto_create_col_mapper=False,
            n_cols=n_items)
        if self.with_validation:
            URM_validation_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

        for i, user_id in enumerate(users_to_preserve.tolist()):
            start_user_position = URM.indptr[user_id]
            end_user_position = URM.indptr[user_id + 1]

            indices = user_indices[i]

            user_interaction_items = URM.indices[
                start_user_position:end_user_position]
            user_interaction_data = URM.data[
                start_user_position:end_user_position]

            # Test interactions. Use len(...) as the row-list length:
            # when allow_cold_users is True a user may have fewer than
            # k_value above-threshold interactions.
            user_interaction_items_test = user_interaction_items[indices[1]]
            user_interaction_data_test = user_interaction_data[indices[1]]

            URM_test_builder.add_data_lists(
                [user_id] * len(user_interaction_items_test),
                user_interaction_items_test, user_interaction_data_test)

            # Validation interactions
            if self.with_validation:
                user_interaction_items_validation = user_interaction_items[
                    indices[2]]
                user_interaction_data_validation = user_interaction_data[
                    indices[2]]

                URM_validation_builder.add_data_lists(
                    [user_id] * len(user_interaction_items_validation),
                    user_interaction_items_validation,
                    user_interaction_data_validation)

            # Train interactions: everything not held out
            user_interaction_items_train = user_interaction_items[indices[0]]
            user_interaction_data_train = user_interaction_data[indices[0]]

            URM_train_builder.add_data_lists(
                [user_id] * len(user_interaction_items_train),
                user_interaction_items_train, user_interaction_data_train)

        URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
        URM_test[URM_name] = URM_test_builder.get_SparseMatrix()
        if self.with_validation:
            URM_validation[URM_name] = \
                URM_validation_builder.get_SparseMatrix()

    train = Dataset(dataset.get_name(),
                    base_folder=dataset.get_base_folder(),
                    postprocessings=dataset.get_postprocessings(),
                    URM_dict=URM_train,
                    URM_mappers_dict=dataset.get_URM_mappers_dict(),
                    ICM_dict=dataset.get_ICM_dict(),
                    ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                    UCM_dict=dataset.get_UCM_dict(),
                    UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    train.remove_users(users_to_remove)

    test = Dataset(dataset.get_name(),
                   base_folder=dataset.get_base_folder(),
                   postprocessings=dataset.get_postprocessings(),
                   URM_dict=URM_test,
                   URM_mappers_dict=dataset.get_URM_mappers_dict(),
                   ICM_dict=dataset.get_ICM_dict(),
                   ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                   UCM_dict=dataset.get_UCM_dict(),
                   UCM_mappers_dict=dataset.get_UCM_mappers_dict())
    test.remove_users(users_to_remove)

    if self.with_validation:
        validation = Dataset(
            dataset.get_name(),
            base_folder=dataset.get_base_folder(),
            postprocessings=dataset.get_postprocessings(),
            URM_dict=URM_validation,
            URM_mappers_dict=dataset.get_URM_mappers_dict(),
            ICM_dict=dataset.get_ICM_dict(),
            ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
            UCM_dict=dataset.get_UCM_dict(),
            UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        validation.remove_users(users_to_remove)
        return train, test, validation
    else:
        return train, test