def test_exclude_unknowns_empty_error(self):
    """Check that ``Dataset.build`` rejects this input with ``ValueError``.

    Calls ``Dataset.build(self.triplet_data, exclude_unknowns=True)`` and
    requires a ``ValueError``.
    NOTE(review): going by the test name, the error is presumably raised
    because nothing is left once unknowns are excluded -- confirm against
    ``Dataset.build``.
    """
    try:
        Dataset.build(self.triplet_data, exclude_unknowns=True)
    except ValueError:
        # Expected outcome.
        return
    # Reaching this point means no exception was raised; fail explicitly
    # instead of passing vacuously. A raise is used (not ``assert``) so the
    # check survives ``python -O``.
    raise AssertionError(
        "Dataset.build(..., exclude_unknowns=True) did not raise ValueError"
    )
def _build_stratified_datasets(self, train_data, test_data, val_data): if train_data is None or len(train_data) == 0: raise ValueError("train_data is required but None or empty!") if test_data is None or len(test_data) == 0: raise ValueError("test_data is required but None or empty!") self.global_uid_map.clear() self.global_iid_map.clear() # build training set self.train_set = Dataset.build( data=train_data, fmt=self.fmt, global_uid_map=self.global_uid_map, global_iid_map=self.global_iid_map, seed=self.seed, exclude_unknowns=False, ) if self.verbose: print("---") print("Training data:") print("Number of users = {}".format(self.train_set.num_users)) print("Number of items = {}".format(self.train_set.num_items)) print("Number of ratings = {}".format(self.train_set.num_ratings)) print("Max rating = {:.1f}".format(self.train_set.max_rating)) print("Min rating = {:.1f}".format(self.train_set.min_rating)) print("Global mean = {:.1f}".format(self.train_set.global_mean)) # build test set self.test_set = Dataset.build( data=test_data, fmt=self.fmt, global_uid_map=self.global_uid_map, global_iid_map=self.global_iid_map, seed=self.seed, exclude_unknowns=self.exclude_unknowns, ) if self.verbose: print("---") print("Test data (Q0):") print("Number of users = {}".format(len(self.test_set.uid_map))) print("Number of items = {}".format(len(self.test_set.iid_map))) print("Number of ratings = {}".format(self.test_set.num_ratings)) print("Max rating = {:.1f}".format(self.test_set.max_rating)) print("Min rating = {:.1f}".format(self.test_set.min_rating)) print("Global mean = {:.1f}".format(self.test_set.global_mean)) print( "Number of unknown users = {}".format( self.test_set.num_users - self.train_set.num_users ) ) print( "Number of unknown items = {}".format( self.test_set.num_items - self.train_set.num_items ) ) # build stratified datasets self.stratified_sets = {} # match the corresponding propensity score for each feedback test_props = np.array([self.props[i] for u, i, r in 
test_data], dtype=np.float64) # stratify strata, bins = pd.cut(x=test_props, bins=self.n_strata, labels=['Q%d' % i for i in range(1, self.n_strata+1)], retbins=True) for stratum in sorted(np.unique(strata)): # sample the corresponding sub-population qtest_data = [] for (u, i, r), q in zip(test_data, strata): if q == stratum: qtest_data.append((u, i, r)) # build a dataset qtest_set = Dataset.build( data=qtest_data, fmt=self.fmt, global_uid_map=self.global_uid_map, global_iid_map=self.global_iid_map, seed=self.seed, exclude_unknowns=self.exclude_unknowns, ) if self.verbose: print("---") print("Test data ({}):".format(stratum)) print("Number of users = {}".format( len(qtest_set.uid_map))) print("Number of items = {}".format( len(qtest_set.iid_map))) print("Number of ratings = {}".format( qtest_set.num_ratings)) print("Max rating = {:.1f}".format(qtest_set.max_rating)) print("Min rating = {:.1f}".format(qtest_set.min_rating)) print("Global mean = {:.1f}".format(qtest_set.global_mean)) print( "Number of unknown users = {}".format( qtest_set.num_users - self.train_set.num_users ) ) print( "Number of unknown items = {}".format( self.test_set.num_items - self.train_set.num_items ) ) self.stratified_sets[stratum] = qtest_set if val_data is not None and len(val_data) > 0: self.val_set = Dataset.build( data=val_data, fmt=self.fmt, global_uid_map=self.global_uid_map, global_iid_map=self.global_iid_map, seed=self.seed, exclude_unknowns=self.exclude_unknowns, ) if self.verbose: print("---") print("Validation data:") print("Number of users = {}".format(len(self.val_set.uid_map))) print("Number of items = {}".format(len(self.val_set.iid_map))) print("Number of ratings = {}".format(self.val_set.num_ratings)) if self.verbose: print("---") print("Total users = {}".format(self.total_users)) print("Total items = {}".format(self.total_items)) self.train_set.total_users = self.total_users self.train_set.total_items = self.total_items self._build_modalities() return self