def split(self):
    """Split the ratings chronologically into consecutive train/test folds.

    Starting from 2005-03-31, each window of ``self.split_duration`` months
    becomes one fold: ratings strictly before the window are training data,
    ratings inside the window are test data (restricted to users -- and, for
    the in-matrix variant, also papers -- already present in training).  For
    every fold the ratings files, the "mult" vocabulary file and the id
    mappings are written under ``self.base_dir``; per-fold sizes are plotted
    and summary statistics saved to ``stats.txt``.

    Side effects only; returns ``None``.
    """
    stats = Stats()

    # Invert self.users_dict (user_hash -> user_id) into an array indexed by
    # user_id: sorting the items by id and keeping the keys yields
    # hash-at-index-id.
    userhash_userid_map_list = list(self.users_dict.items())
    userhash_userid_map_list.sort(key=lambda x: x[1])
    user_id_userhash_map_list = np.array([h for (h, _) in userhash_userid_map_list])

    # Same inversion for self.papers_dict (doc_id -> paper_id).
    docid_paperid_map_list = list(self.papers_dict.items())
    docid_paperid_map_list.sort(key=lambda x: x[1])
    paper_id_docid_map_list = np.array([d for (d, _) in docid_paperid_map_list])

    # Ratings joined with their timestamps: rows of (user, paper, date).
    ratings_list = self.integrate_raings_timestamp(self.users_dict, self.papers_dict)
    fr = pd.DataFrame(data=ratings_list, columns=['user', 'paper', 'date'])
    print("Ratings: {}, users: {}, papers: {}.".format(len(fr), fr.user.nunique(), fr.paper.nunique()))
    stats.add_to_text_list('Data statistics:')  # fixed typo ("statisitcs")
    stats.add_to_text_list("#Users: {}, #Papers: {}, #Ratings: {}.".format(
        fr.user.nunique(), fr.paper.nunique(), len(fr)))

    # First split date:
    d1 = datetime.strptime('2005-03-31', "%Y-%m-%d").date()
    # Last rating date (assumes the 'date' column holds date-like objects
    # exposing .year/.month -- TODO confirm against integrate_raings_timestamp).
    last_date = fr.date.max()
    # Whole months between the first split date and the last rating.
    # BUG FIX: the original used `... * 12 + last_date.month`, which counts
    # months from the start of d1's year rather than from d1 itself and
    # therefore overstates the period by d1.month months.
    ratings_period = (last_date.year - d1.year) * 12 + (last_date.month - d1.month)
    folds_num = ratings_period // self.split_duration

    # Per-fold series, consumed by plot_split_lines at the end.
    tr_rs, tr_us, tr_ps, ts_rs, ts_us, ts_ps, rat, dates = [], [], [], [], [], [], [], []

    # Header row for the per-fold statistics table.
    stats_header = ['{:4}'.format('Fold'), '{:20}'.format('#Usrs(Tot,R,S)'),
                    '{:23}'.format('#Itms(Tot,R,S)'), '{:23}'.format('#Rtng(Tot,R,S)'),
                    '{:23}'.format('PRU(min/max/avg/std)'), '{:22}'.format('PSU(min/max/avg/std)'),
                    '{:20}'.format('PRI(min/max/avg/std)'), '{:20}'.format('PSI(min/max/avg/std)')]
    self.stat_list.append(stats_header)

    # The output folder name is loop-invariant; compute it once (hoisted out
    # of the fold loop).
    split_name = 'time-based_split_out-of-matrix' if self.out_of_matrix else 'time-based_split_in-matrix'
    fname = split_name
    if self.limit_candidates > 0:
        fname += "_CanLim_" + str(self.limit_candidates)

    for fold in range(folds_num):
        d2 = d1 + relativedelta(months=self.split_duration)
        # Training ratings: everything strictly before the fold start.
        f1 = fr[fr['date'] < d1]
        # Test ratings: the [d1, d2) window, limited to users already seen in
        # training; the in-matrix variant also requires a known paper.
        if self.out_of_matrix:
            f2 = fr[(fr['date'] >= d1) & (fr['date'] < d2) & fr['user'].isin(f1['user'])]
        else:
            f2 = fr[(fr['date'] >= d1) & (fr['date'] < d2) & fr['user'].isin(f1['user'])
                    & (fr['paper'].isin(f1['paper']))]
        # Guard against an empty training set (no ratings before d1), which
        # previously raised ZeroDivisionError.
        ratio = len(f2) / len(f1) * 100 if len(f1) else float('nan')
        print("{}->{}, Tr:[Rs: {:6}, Us: {:5}, Ps: {:6}], Te:[Rs: {:5}, Us: {:5}, Ps: {:6}], Ratio: {:04.2f}%"
              .format(d1, d2, len(f1), f1.user.nunique(), f1.paper.nunique(),
                      len(f2), f2.user.nunique(), f2.paper.nunique(), ratio))
        stats.add_to_text_list("Fold-{}: {}->{}, Tr:[Rs: {:6}, Us: {:5}, Ps: {:6}], Te:[Rs: {:5}, Us: {:5}, Ps: {:6}], Ratio: {:04.2f}%"
                               .format(fold + 1, d1, d2, len(f1), f1.user.nunique(), f1.paper.nunique(),
                                       len(f2), f2.user.nunique(), f2.paper.nunique(), ratio))

        # Build the fold's train/test structures (n_users / n_papers are not
        # used in this method).
        (train_l_users, train_l_users_age, train_l_items, train_l_items_age,
         test_l_users, test_l_items, useridx_user_id_map_list,
         paperidx_paper_id_map_list, candidate_items,
         n_users, n_papers) = self.generate_fold(d1, f1, f2)
        stats.add_fold_statistics(fold + 1, train_l_users, test_l_users, train_l_items, test_l_items)

        # Write the fold's data files:
        fold_folder = os.path.join(self.base_dir, fname, 'fold-{}'.format(fold + 1))
        os.makedirs(fold_folder, exist_ok=True)
        write_ratings(train_l_users, os.path.join(fold_folder, 'train-users.dat'))
        write_ratings(train_l_users_age, os.path.join(fold_folder, 'train-users-ages.dat'))
        write_ratings(test_l_users, os.path.join(fold_folder, 'test-users.dat'))
        write_ratings(train_l_items, os.path.join(fold_folder, 'train-items.dat'))
        write_ratings(train_l_items_age, os.path.join(fold_folder, 'train-items-ages.dat'))
        write_ratings(test_l_items, os.path.join(fold_folder, 'test-items.dat'))
        write_ratings(candidate_items, os.path.join(fold_folder, 'candidate-items.dat'),
                      print_line_length=False)
        print("Generating the new mult file...")
        self.generate_docs_terms(self.docs_vocabs, paperidx_paper_id_map_list, self.terms, fold_folder)

        # Persist the fold-local id mappings (original id, fold-local index).
        useridx_userhash = user_id_userhash_map_list[useridx_user_id_map_list]
        write_list_to_file([(h, i) for (i, h) in enumerate(useridx_userhash)],
                           os.path.join(fold_folder, 'userhash_user_id_map.csv'),
                           header=['citeulikeUserHash', 'user_id'], delimiter=",")
        paperidx_docid = paper_id_docid_map_list[paperidx_paper_id_map_list]
        write_list_to_file([(d, i) for (i, d) in enumerate(paperidx_docid)],
                           os.path.join(fold_folder, 'citeulike_id_doc_id_map.csv'),
                           header=['citeulikeId', 'paper_id'], delimiter=",")

        # Record this fold's sizes for plotting:
        dates.append(d2)
        tr_rs.append(len(f1))
        tr_us.append(f1.user.nunique())
        tr_ps.append(f1.paper.nunique())
        ts_rs.append(len(f2))
        ts_us.append(f2.user.nunique())
        ts_ps.append(f2.paper.nunique())
        rat.append(ratio)
        # The next fold starts where this one ended.
        d1 = d2

    self.plot_split_lines(tr_rs, tr_us, tr_ps, ts_rs, ts_us, ts_ps, rat, dates)
    # Write split statistics to file (deliberately without the _CanLim_
    # suffix, matching the original behavior).
    stats.save_stats_to_file(os.path.join(self.base_dir, split_name, 'stats.txt'))