Example #1
0
    def split(self):
        """Split the ratings into consecutive time-based folds and write them to disk.

        The ratings (with timestamps) are obtained from
        ``self.integrate_raings_timestamp``.  Starting at 2005-03-31, each fold
        advances the split date by ``self.split_duration`` months:

        * training set: every rating dated strictly before the fold's split date;
        * test set: ratings inside ``[split date, split date + duration)`` made by
          users that appear in the training set and, when ``self.out_of_matrix``
          is false, on papers that appear in the training set as well.

        For every fold the rating files, the candidate items, the documents/terms
        file and the user / paper id mappings are written under
        ``<base_dir>/<split name>/fold-<k>``.  After the last fold the per-fold
        counts are passed to ``self.plot_split_lines`` and the collected
        statistics are saved to ``<base_dir>/<split name>/stats.txt``.

        The split name carries the ``_CanLim_<n>`` suffix whenever
        ``self.limit_candidates`` is positive, so the statistics file is stored
        next to the fold directories it describes.

        Returns:
            None.  All results are written to files under ``self.base_dir``.
        """
        stats = Stats()

        # Lookup arrays: the dict items are sorted by their value (the numeric id)
        # and only the keys are kept, so position i holds the key with the i-th id.
        # NOTE(review): indexing these arrays by id assumes ids are contiguous and
        # start at 0 -- confirm against where the dicts are built.
        user_id_userhash_map_list = np.array(
            [key for key, _ in sorted(self.users_dict.items(), key=lambda x: x[1])])
        paper_id_docid_map_list = np.array(
            [key for key, _ in sorted(self.papers_dict.items(), key=lambda x: x[1])])

        # Ratings joined with their time stamps, one row per (user, paper, date).
        ratings_list = self.integrate_raings_timestamp(self.users_dict, self.papers_dict)
        fr = pd.DataFrame(data=ratings_list, columns=['user', 'paper', 'date'])

        total_users = fr.user.nunique()
        total_papers = fr.paper.nunique()
        print("Ratings: {}, users: {}, papers: {}.".format(len(fr), total_users, total_papers))
        stats.add_to_text_list('Data statisitcs:')
        stats.add_to_text_list("#Users: {}, #Papers: {}, #Ratings: {}.".format(total_users, total_papers, len(fr)))

        # First split date:
        d1 = datetime.strptime('2005-03-31', "%Y-%m-%d").date()

        # Number of months used to derive the fold count.
        # NOTE(review): d1.month is not subtracted, so the period is measured from
        # the start of d1's year rather than from d1 itself -- confirm that the
        # resulting (possibly partial) trailing fold is intended.
        last_date = fr.date.max()
        ratings_period = (last_date.year - d1.year) * 12 + last_date.month
        folds_num = ratings_period // self.split_duration

        # Per-fold series, used only for plotting:
        tr_rs, tr_us, tr_ps, ts_rs, ts_us, ts_ps, rat, dates = [], [], [], [], [], [], [], []

        # For split stats:
        stats_header = ['{:4}'.format('Fold'), '{:20}'.format('#Usrs(Tot,R,S)'), '{:23}'.format('#Itms(Tot,R,S)'), '{:23}'.format('#Rtng(Tot,R,S)'),
                        '{:23}'.format('PRU(min/max/avg/std)'), '{:22}'.format('PSU(min/max/avg/std)'), '{:20}'.format('PRI(min/max/avg/std)'), '{:20}'.format('PSI(min/max/avg/std)')]
        self.stat_list.append(stats_header)

        # The output directory does not depend on the fold, so it is built once and
        # shared by the fold folders and the final statistics file.
        split_name = 'time-based_split_out-of-matrix' if self.out_of_matrix else 'time-based_split_in-matrix'
        if self.limit_candidates > 0:
            split_name += "_CanLim_" + str(self.limit_candidates)
        split_dir = os.path.join(self.base_dir, split_name)

        for fold in range(folds_num):
            d2 = d1 + relativedelta(months=self.split_duration)

            # Training ratings: everything before the split date.
            f1 = fr[fr['date'] < d1]

            # Test ratings: inside the fold window, restricted to training users and,
            # for the in-matrix setting, to training papers.
            test_mask = (fr['date'] >= d1) & (fr['date'] < d2) & fr['user'].isin(f1['user'])
            if not self.out_of_matrix:
                test_mask = test_mask & (fr['paper'].isin(f1['paper']))
            f2 = fr[test_mask]

            # Compute each count once; they feed the log line, the stats and the plot.
            train_ratings, train_users, train_papers = len(f1), f1.user.nunique(), f1.paper.nunique()
            test_ratings, test_users, test_papers = len(f2), f2.user.nunique(), f2.paper.nunique()
            # An empty training set would otherwise raise ZeroDivisionError.
            ratio = test_ratings / train_ratings * 100 if train_ratings else 0.0

            summary = "{}->{}, Tr:[Rs: {:6}, Us: {:5}, Ps: {:6}], Te:[Rs: {:5}, Us: {:5}, Ps: {:6}], Ratio: {:04.2f}%"\
                .format(d1, d2, train_ratings, train_users, train_papers, test_ratings, test_users, test_papers, ratio)
            print(summary)
            stats.add_to_text_list("Fold-{}: ".format(fold + 1) + summary)

            # Generate data for the fold (the two trailing counts are not needed here):
            (train_l_users, train_l_users_age, train_l_items, train_l_items_age,
             test_l_users, test_l_items, useridx_user_id_map_list, paperidx_paper_id_map_list,
             candidate_items, _n_users, _n_papers) = self.generate_fold(d1, f1, f2)
            stats.add_fold_statistics(fold + 1, train_l_users, test_l_users, train_l_items, test_l_items)

            # Write to file:
            fold_folder = os.path.join(split_dir, 'fold-{}'.format(fold + 1))
            os.makedirs(fold_folder, exist_ok=True)

            write_ratings(train_l_users, os.path.join(fold_folder, 'train-users.dat'))
            write_ratings(train_l_users_age, os.path.join(fold_folder, 'train-users-ages.dat'))
            write_ratings(test_l_users, os.path.join(fold_folder, 'test-users.dat'))
            write_ratings(train_l_items, os.path.join(fold_folder, 'train-items.dat'))
            write_ratings(train_l_items_age, os.path.join(fold_folder, 'train-items-ages.dat'))
            write_ratings(test_l_items, os.path.join(fold_folder, 'test-items.dat'))
            write_ratings(candidate_items, os.path.join(fold_folder, 'candidate-items.dat'), print_line_length=False)

            print("Generating the new mult file...")
            self.generate_docs_terms(self.docs_vocabs, paperidx_paper_id_map_list, self.terms, fold_folder)

            # Write users and papers mappings to files; each row pairs the original
            # key with its position in the fold-local index.
            useridx_userhash = user_id_userhash_map_list[useridx_user_id_map_list]
            write_list_to_file([(j, i) for (i, j) in enumerate(useridx_userhash)], os.path.join(fold_folder, 'userhash_user_id_map.csv'), header=['citeulikeUserHash', 'user_id'], delimiter=",")

            paperidx_docid = paper_id_docid_map_list[paperidx_paper_id_map_list]
            write_list_to_file([(j, i) for (i, j) in enumerate(paperidx_docid)], os.path.join(fold_folder, 'citeulike_id_doc_id_map.csv'), header=['citeulikeId', 'paper_id'], delimiter=",")

            # For plotting:
            dates.append(d2)
            tr_rs.append(train_ratings)
            tr_us.append(train_users)
            tr_ps.append(train_papers)
            ts_rs.append(test_ratings)
            ts_us.append(test_users)
            ts_ps.append(test_papers)
            rat.append(ratio)

            # The next fold starts where this one ended.
            d1 = d2
        self.plot_split_lines(tr_rs, tr_us, tr_ps, ts_rs, ts_us, ts_ps, rat, dates)

        # Write split statistics next to the folds; make sure the directory exists
        # even when no fold was generated.
        os.makedirs(split_dir, exist_ok=True)
        stats.save_stats_to_file(os.path.join(split_dir, 'stats.txt'))