def user_based_split(self, folds_num=5):
    """
    Splits the rating matrix following the user-based method: the ratings of
    every user are randomly divided into folds_num parts, and part k is the
    test set of fold k while the remaining parts form its training set.

    Users having fewer ratings than folds_num keep all of their ratings in
    the training set of every fold and get an empty test set.

    For each fold k (1-based) the following files are written under
    <out_folder>/fold-k/:
        train-fold_k-users.dat, train-fold_k-items.dat,
        test-fold_k-users.dat,  test-fold_k-items.dat
    In the users files, line i has the delimiter-separated list of item ids
    rated by user i. The split statistics are saved to <out_folder>/stats.txt.

    :param folds_num: the number of folds, default 5
    :return: None
    :raises Exception: "Split_Error!" when a user has no training items in some fold
    """
    train = [[[] for _ in range(self.num_users)] for _ in range(folds_num)]
    test = [[[] for _ in range(self.num_users)] for _ in range(folds_num)]

    # 1- Distribute the ratings of each user over the folds:
    for user in range(self.num_users):
        # Reporting progress:
        if user % 1000 == 0:
            print("user_{}".format(user))
        items_ids = np.array(self.users_ratings[user])
        n = len(items_ids)

        if n < folds_num:
            # Too few ratings to split: they can appear in the training sets only.
            for fold in range(folds_num):
                train[fold][user].extend(items_ids.tolist())
            continue

        idx = list(range(n))
        item_ids_folds = random_divide(idx, folds_num)
        for fold in range(folds_num):
            test_idx = item_ids_folds[fold]
            # Set membership keeps this linear in n; the training positions
            # stay in ascending order exactly as before.
            held_out = set(test_idx)
            train_idx = [i for i in idx if i not in held_out]
            train[fold][user].extend(items_ids[train_idx].tolist())
            test[fold][user].extend(items_ids[test_idx].tolist())

    # 2- Write the folds and collect their statistics:
    stats = Stats(self.generate_validation)
    for fold in range(folds_num):
        fold_id = fold + 1
        fold_folder = os.path.join(self.out_folder, "fold-{}".format(fold_id))

        users_train = train[fold]
        items_train = self.items_mat_from_users_ratings(users_train)
        if any(len(u) == 0 for u in users_train):
            print("some users contains 0 training items, split again again!")
            raise Exception("Split_Error!")
        write_ratings(users_train,
                      filename=os.path.join(fold_folder, "train-fold_{}-users.dat".format(fold_id)),
                      delimiter=self.delimiter)
        write_ratings(items_train,
                      filename=os.path.join(fold_folder, "train-fold_{}-items.dat".format(fold_id)),
                      delimiter=self.delimiter)

        # Storing the fold test items for all users
        users_test = test[fold]
        items_test = self.items_mat_from_users_ratings(users_test)
        write_ratings(users_test,
                      filename=os.path.join(fold_folder, "test-fold_{}-users.dat".format(fold_id)),
                      delimiter=self.delimiter)
        write_ratings(items_test,
                      filename=os.path.join(fold_folder, "test-fold_{}-items.dat".format(fold_id)),
                      delimiter=self.delimiter)

        # Calculate statistics:
        if self.generate_validation:
            # TODO: Calculate Validation sets. This method does not build them yet,
            # so empty validation sets are reported.
            users_validation = []
            items_validation = []
            stats.add_fold_statistics(fold_id, users_train, users_test, items_train, items_test,
                                      users_validation, items_validation)
        else:
            stats.add_fold_statistics(fold_id, users_train, users_test, items_train, items_test)

    # Write split statistics:
    stats.save_stats_to_file(os.path.join(self.out_folder, 'stats.txt'))
def split(self):
    """
    Generates the time-based folds of the ratings.

    Starting from the first split date (2005-03-31), every fold uses all the
    ratings dated before the split date as training set and the ratings of the
    following self.split_duration months as test set; the split date is then
    moved forward by self.split_duration months for the next fold. Test ratings
    are always restricted to users that appear in the training set and, unless
    self.out_of_matrix is set, to papers that appear in the training set too.

    For every fold the ratings files, the documents-terms file and the
    users/papers id mappings are written to
    <base_dir>/<time-based_split_out-of-matrix|time-based_split_in-matrix>/fold-<k>/.
    After the last fold the split is plotted and the statistics are saved to
    stats.txt in the split folder. The statistics header row is also appended
    to self.stat_list.

    :return: None
    """
    # Get the mapping as a list of user_hash where the key is the corresponding index:
    # (presumably the ids are the contiguous range 0..n-1 - verify against the loader)
    userhash_userid_map_list = sorted(self.users_dict.items(), key=lambda x: x[1])
    user_id_userhash_map_list = np.array([i for (i, _) in userhash_userid_map_list])

    # Get the mapping as a list of doc_ids where the key is the corresponding index:
    docid_paperid_map_list = sorted(self.papers_dict.items(), key=lambda x: x[1])
    paper_id_docid_map_list = np.array([i for (i, _) in docid_paperid_map_list])

    # Get the ratings list integrated with time stamps:
    ratings_list = self.integrate_raings_timestamp(self.users_dict, self.papers_dict)
    fr = pd.DataFrame(data=ratings_list, columns=['user', 'paper', 'date'])
    print("Ratings: {}, users: {}, papers: {}.".format(
        len(fr), fr.user.nunique(), fr.paper.nunique()))

    # First split date:
    d1 = datetime.strptime('2005-03-31', "%Y-%m-%d").date()

    # Last date:
    last_date = fr.date.max()
    # NOTE(review): d1.month is not subtracted here, so the period is longer than
    # the number of months between d1 and last_date - confirm this is intended
    # before changing it, since it determines the number of folds.
    ratings_period = (last_date.year - d1.year) * 12 + last_date.month

    # These lists are used for plotting:
    tr_rs, tr_us, tr_ps, ts_rs, ts_us, ts_ps, rat, dates = [], [], [], [], [], [], [], []

    folds_num = ratings_period // self.split_duration

    # For split stats:
    stats_header = ['{:4}'.format('Fold'), '{:20}'.format('#Usrs(Tot,R,S)'),
                    '{:23}'.format('#Itms(Tot,R,S)'), '{:23}'.format('#Rtng(Tot,R,S)'),
                    '{:23}'.format('PRU(min/max/avg/std)'), '{:22}'.format('PSU(min/max/avg/std)'),
                    '{:20}'.format('PRI(min/max/avg/std)'), '{:20}'.format('PSI(min/max/avg/std)')]
    self.stat_list.append(stats_header)

    # Root folder of this split; it holds the folds folders and the statistics file:
    split_folder = os.path.join(
        self.base_dir,
        'time-based_split_out-of-matrix' if self.out_of_matrix else 'time-based_split_in-matrix')

    stats = Stats()
    for fold in range(folds_num):
        d2 = d1 + relativedelta(months=self.split_duration)

        # Training ratings:
        f1 = fr[fr['date'] < d1]

        # Test ratings:
        in_period = (fr['date'] >= d1) & (fr['date'] < d2)
        if self.out_of_matrix:
            f2 = fr[in_period & fr['user'].isin(f1['user'])]
        else:
            f2 = fr[in_period & fr['user'].isin(f1['user']) & (fr['paper'].isin(f1['paper']))]

        # Fold sizes, used for both the progress line and the plot:
        tr_ratings, tr_users, tr_papers = len(f1), f1.user.nunique(), f1.paper.nunique()
        ts_ratings, ts_users, ts_papers = len(f2), f2.user.nunique(), f2.paper.nunique()
        ratio = ts_ratings / tr_ratings * 100
        print("{}->{}, Tr:[Rs: {:6}, Us: {:5}, Ps: {:6}], Te:[Rs: {:5}, Us: {:5}, Ps: {:6}], Ratio: {:04.2f}%"
              .format(d1, d2, tr_ratings, tr_users, tr_papers, ts_ratings, ts_users, ts_papers, ratio))

        # Generate data for the folds:
        (train_l_users, train_l_users_age, train_l_items, test_l_users, test_l_items,
         useridx_user_id_map_list, paperidx_paper_id_map_list,
         _n_users, _n_papers) = self.generate_fold(d1, f1, f2)
        stats.add_fold_statistics(fold + 1, train_l_users, test_l_users, train_l_items, test_l_items)

        # Write to file:
        fold_folder = os.path.join(split_folder, 'fold-{}'.format(fold + 1))
        os.makedirs(fold_folder, exist_ok=True)

        write_ratings(train_l_users, os.path.join(fold_folder, 'train-users.dat'))
        write_ratings(train_l_users_age, os.path.join(fold_folder, 'train-users-ages.dat'))
        write_ratings(test_l_users, os.path.join(fold_folder, 'test-users.dat'))
        write_ratings(train_l_items, os.path.join(fold_folder, 'train-items.dat'))
        write_ratings(test_l_items, os.path.join(fold_folder, 'test-items.dat'))

        print("Generating the new mult file...")
        self.generate_docs_terms(self.docs_vocabs, paperidx_paper_id_map_list, self.terms, fold_folder)

        # Write users and papers mappings to files:
        useridx_userhash = user_id_userhash_map_list[useridx_user_id_map_list]
        write_list_to_file(
            [(j, i) for (i, j) in enumerate(useridx_userhash)],
            os.path.join(fold_folder, 'citeulikeUserHash_userId_map.dat'),
            header=['citeulikeUserHash', 'user_id'])

        paperidx_docid = paper_id_docid_map_list[paperidx_paper_id_map_list]
        write_list_to_file(
            [(j, i) for (i, j) in enumerate(paperidx_docid)],
            os.path.join(fold_folder, 'citeulikeId_docId_map.dat'),
            header=['citeulikeId', 'paper_id'])

        # For plotting:
        dates.append(d2)
        tr_rs.append(tr_ratings)
        tr_us.append(tr_users)
        tr_ps.append(tr_papers)
        ts_rs.append(ts_ratings)
        ts_us.append(ts_users)
        ts_ps.append(ts_papers)
        rat.append(ratio)

        # The end of this test period is the split date of the next fold:
        d1 = d2

    self.plot_split_lines(tr_rs, tr_us, tr_ps, ts_rs, ts_us, ts_ps, rat, dates)

    # Write split statistics to file:
    stats.save_stats_to_file(os.path.join(split_folder, 'stats.txt'))
def out_of_matrix_split(self, folds_num=5):
    """
    Splits the rating matrix following the out-of-matrix method defined in CTR.

    The item ids are randomly divided into folds_num parts. For fold k, part k
    holds the test items and, when self.generate_validation is set, the next
    part (cyclically) holds the validation items; every other item is a
    training item. The ratings of each user are then distributed over the
    training, test and validation sets according to the set of the rated item.

    For each fold k (1-based) the train, test and validation ratings are
    written under <out_folder>/fold-k/ as both users files and items files,
    together with a heldout-set file. In the users files, line i has the
    delimiter-separated list of item ids rated by user i. The split statistics
    are saved to <out_folder>/stats.txt.

    :param folds_num: the number of folds, default = 5
    :return: None
    :raises Exception: "Split_Error!" when a user has no training items in some fold
    """
    # 1- Split items ids in folds:
    all_items = list(range(self.num_items))
    parts = random_divide(all_items, folds_num)

    # 2- Generate the training and test sets for each fold:
    stats = Stats(self.generate_validation)
    for test_fold in range(folds_num):
        fold_no = test_fold + 1
        fold_dir = os.path.join(self.out_folder, "fold-{}".format(fold_no))

        def dump(rows, file_name, **extra):
            # Every file of this fold goes through write_ratings with the same
            # folder and delimiter.
            write_ratings(rows, filename=os.path.join(fold_dir, file_name),
                          delimiter=self.delimiter, **extra)

        # Test items are the current part; validation items (if requested) are
        # the following part; the rest are training items:
        test_items = set(parts[test_fold])
        if self.generate_validation:
            validation_items = set(parts[(test_fold + 1) % folds_num])
        else:
            validation_items = set()
        train_items = set(all_items) - test_items - validation_items

        # Generate users ratings for training, test and validation:
        users_train, users_test, users_validation = [], [], []
        for rated in self.users_ratings:
            in_train = list(train_items.intersection(rated))
            if not in_train:
                print("some users contains 0 training items, split again again!")
                raise Exception("Split_Error!")
            users_train.append(sorted(in_train))
            users_test.append(sorted(test_items.intersection(rated)))
            users_validation.append(sorted(validation_items.intersection(rated)))

        dump(users_train, "train-fold_{}-users.dat".format(fold_no))
        dump(users_test, "test-fold_{}-users.dat".format(fold_no))
        dump(users_validation, "validation-fold_{}-users.dat".format(fold_no))

        items_train = self.items_mat_from_users_ratings(users_train)
        dump(items_train, "train-fold_{}-items.dat".format(fold_no))

        items_test = self.items_mat_from_users_ratings(users_test)
        dump(items_test, "test-fold_{}-items.dat".format(fold_no))

        items_validation = self.items_mat_from_users_ratings(users_validation)
        dump(items_validation, "validation-fold_{}-items.dat".format(fold_no))

        # Saving left out items ids:
        # NOTE(review): this sorts and writes the rows of items_test (the items
        # matrix built from users_test), not the held-out item ids of the
        # fold - confirm whether the ids set was intended here.
        heldout_rows = sorted(items_test)
        dump(heldout_rows, "heldout-set-fold_{}-items.dat".format(fold_no), print_line_length=False)

        # Calculate statistics:
        fold_sets = [users_train, users_test, items_train, items_test]
        if self.generate_validation:
            fold_sets += [users_validation, items_validation]
        stats.add_fold_statistics(fold_no, *fold_sets)

    # Write split statistics:
    stats.save_stats_to_file(os.path.join(self.out_folder, 'stats.txt'))
def cf_split(self, folds_num=5):
    """
    Splits the rating matrix following the in-matrix method defined in CTR.

    The ratings of every item are randomly divided into folds_num parts. For
    fold k, part k is the test set and, when self.generate_validation is set,
    the next part (cyclically) is the validation set; the remaining parts form
    the training set. Items having fewer ratings than folds_num keep all of
    their ratings in the training set of every fold.

    For each fold k (1-based) the following files are written under
    <out_folder>/fold-k/:
        train-fold_k-users.dat, train-fold_k-items.dat,
        test-fold_k-users.dat,  test-fold_k-items.dat
    and, when self.generate_validation is set:
        validation-fold_k-users.dat, validation-fold_k-items.dat
    In the users files, line i has the delimiter-separated list of item ids
    rated by user i. The split statistics are saved to <out_folder>/stats.txt.

    :param folds_num: the number of folds, default 5
    :return: None
    :raises Exception: "Split_Error!" when a user has no training items in some fold
    """
    with_validation = self.generate_validation
    items_mat = self.items_mat_from_users_ratings(self.users_ratings)
    train = [[[] for _ in range(self.num_items)] for _ in range(folds_num)]
    test = [[[] for _ in range(self.num_items)] for _ in range(folds_num)]
    # The validation buffers are needed only when a validation set is requested:
    validation = [[[] for _ in range(self.num_items)] for _ in range(folds_num)] if with_validation else None
    print("Number of items: {}".format(self.num_items))
    folds_list = list(range(folds_num))
    print("Splitting items ratings, progress:")

    # 1- Split items ratings into the folds. Every item having at least folds_num
    # ratings contributes ratings to the test set of each fold
    # (presumably random_divide returns folds_num non-empty parts then - verify).
    for item in range(self.num_items):
        # Reporting progress:
        if item % 5000 == 0:
            print("doc_{}".format(item))
        user_ids = np.array(items_mat[item])
        n = len(user_ids)

        # If the number of ratings associated to this item is less than the number
        # of folds, this item's ratings can appear in the training sets only.
        if n < folds_num:
            for fold in folds_list:
                train[fold][item].extend(user_ids.tolist())
            continue

        idx = list(range(n))
        user_ids_folds = random_divide(idx, folds_num)
        for test_fold in folds_list:
            # Add users of the current fold as test
            test_idx = user_ids_folds[test_fold]
            if with_validation:
                # Add users of the next fold as validation
                validation_fold = (test_fold + 1) % folds_num
                validation_idx = user_ids_folds[validation_fold]
                # Add the rest as training (kept in parts order):
                train_idx = []
                for i in folds_list:
                    if i != test_fold and i != validation_fold:
                        train_idx.extend(user_ids_folds[i])
                validation[test_fold][item].extend(user_ids[validation_idx].tolist())
            else:
                # Add the rest as training. Set membership keeps this linear in n;
                # the positions stay in ascending order exactly as before.
                held_out = set(test_idx)
                train_idx = [i for i in idx if i not in held_out]
            train[test_fold][item].extend(user_ids[train_idx].tolist())
            test[test_fold][item].extend(user_ids[test_idx].tolist())

    # 2- Generate the user ratings from the splits generated on step 1.
    stats = Stats(with_validation)
    for fold in folds_list:
        fold_id = fold + 1
        fold_folder = os.path.join(self.out_folder, "fold-{}".format(fold_id))

        items_train = train[fold]
        users_train = self.users_mat_from_items(items_train)
        for u_id, u in enumerate(users_train):
            if len(u) == 0:
                print("User {} contains 0 training items, split again!".format(u_id))
                raise Exception("Split_Error!")
        write_ratings(users_train,
                      filename=os.path.join(fold_folder, "train-fold_{}-users.dat".format(fold_id)),
                      delimiter=self.delimiter)
        write_ratings(items_train,
                      filename=os.path.join(fold_folder, "train-fold_{}-items.dat".format(fold_id)),
                      delimiter=self.delimiter)

        items_test = test[fold]
        users_test = self.users_mat_from_items(items_test)
        write_ratings(users_test,
                      filename=os.path.join(fold_folder, "test-fold_{}-users.dat".format(fold_id)),
                      delimiter=self.delimiter)
        write_ratings(items_test,
                      filename=os.path.join(fold_folder, "test-fold_{}-items.dat".format(fold_id)),
                      delimiter=self.delimiter)

        if with_validation:
            items_validation = validation[fold]
            users_validation = self.users_mat_from_items(items_validation)
            # Storing the fold validation items for all users
            write_ratings(users_validation,
                          filename=os.path.join(fold_folder, "validation-fold_{}-users.dat".format(fold_id)),
                          delimiter=self.delimiter)
            write_ratings(items_validation,
                          filename=os.path.join(fold_folder, "validation-fold_{}-items.dat".format(fold_id)),
                          delimiter=self.delimiter)
            # Calculate statistics:
            stats.add_fold_statistics(fold_id, users_train, users_test, items_train, items_test,
                                      users_validation, items_validation)
        else:
            # Calculate statistics:
            stats.add_fold_statistics(fold_id, users_train, users_test, items_train, items_test)

    # Write split statistics:
    stats.save_stats_to_file(os.path.join(self.out_folder, 'stats.txt'))