Exemple #1
0
    def test__set_dataframe(self):
        """Check that set_dataframe raises PartitionError for an empty DataFrame."""
        no_rows = pd.DataFrame()
        partitioner = KFoldPartitioning(n_splits=2)

        # Only the set_dataframe call sits inside the context manager, so the
        # expected error cannot be satisfied by the setup above.
        with self.assertRaises(PartitionError):
            partitioner.set_dataframe(no_rows)
Exemple #2
0
    def test_iter(self):
        """Check that every fold yielded by KFoldPartitioning partitions the frame.

        For each (train, test) pair produced by iterating the partitioner, no
        row may appear in both parts, and the two parts together must contain
        exactly the rows of ``original_frame`` (order is ignored).
        """
        kf = KFoldPartitioning()

        kf.set_dataframe(original_frame)

        # The reference rows do not depend on the fold, so build them once
        # instead of rebuilding them on every iteration.
        original_list = [
            list(row) for row in original_frame.itertuples(index=False)
        ]

        for train, test in kf:
            train_list = [list(row) for row in train.itertuples(index=False)]
            test_list = [list(row) for row in test.itertuples(index=False)]

            # Check that train and test are disjoint: dropping the rows that
            # also occur in the other part must leave each part unchanged.
            train_not_in_test = [
                row for row in train_list if row not in test_list
            ]
            self.assertCountEqual(
                train_list, train_not_in_test)  # Count so regardless of order
            test_not_in_train = [
                row for row in test_list if row not in train_list
            ]
            self.assertCountEqual(
                test_list, test_not_in_train)  # Count so regardless of order

            # Check that the union of the two gives the original data
            union_list = train_list + test_list
            self.assertCountEqual(original_list,
                                  union_list)  # Count so regardless of order
    def test_fit_graph_w_testrating_methodology(self):
        """Check that EvalModel.fit returns two DataFrames for a graph-based recommender.

        No methodology is passed to EvalModel, so whatever default it uses
        applies here.
        """
        full_graph = NXFullGraph(ratings)
        recsys = GraphBasedRS(NXPageRank(), full_graph)

        evaluator = EvalModel(recsys,
                              KFoldPartitioning(),
                              metric_list=[Precision()])

        # fit() is expected to return exactly two values: system-wide results
        # first, per-user results second.
        system_frame, per_user_frame = evaluator.fit()

        for frame in (system_frame, per_user_frame):
            self.assertIsInstance(frame, pd.DataFrame)
    def test_split__single_kfold(self):
        """Check that _split_single returns as many splits as requested folds.

        The input holds the ratings of a single user ("001").
        """
        ratings_by_column = {
            'from_id': ["001", "001", "001", "001"],
            'to_id': ["iphone", "ps4", "ps5", "xbox"],
            'rating': [0.8, 0.7, -0.4, 1.0],
        }
        single_user_frame = pd.DataFrame.from_dict(ratings_by_column)

        fold_count = 2
        module = PartitionModule(KFoldPartitioning(fold_count))
        produced_splits = module._split_single(single_user_frame)

        # Only the number of splits is verified: the partitioning technique
        # itself has its own dedicated tests.
        self.assertEqual(len(produced_splits), fold_count)
    def test_split_all_kfold(self):
        """Check that split_all returns as many splits as requested folds.

        The input holds ratings of three users ("001", "002", "003"), and all
        of their ids are passed as the set of users to split.
        """
        ratings_by_column = {
            'from_id': ["001", "001", "001", "001", "002", "002", "002", "003", "003"],
            'to_id': ["iphone", "ps4", "ps5", "xbox", "realme", "airpods", "ps4", "beats", "dvd"],
            'rating': [0.8, 0.7, -0.4, 1.0, 0.8, 0.7, -0.4, 1.0, 0.65],
        }
        multi_user_frame = pd.DataFrame(ratings_by_column)

        fold_count = 2
        module = PartitionModule(KFoldPartitioning(fold_count))
        user_ids = set(multi_user_frame['from_id'])
        produced_splits = module.split_all(multi_user_frame, user_ids)

        # Only the number of splits is verified: the partitioning technique
        # itself has its own dedicated tests.
        self.assertEqual(len(produced_splits), fold_count)
    def test_fit_cb_w_testrating_methodology(self):
        """Check that EvalModel.fit returns two DataFrames for a content-based recommender.

        No methodology is passed to EvalModel, so whatever default it uses
        applies here.
        """
        algorithm = CentroidVector({"Plot": "tfidf"}, CosineSimilarity())
        recsys = ContentBasedRS(algorithm, ratings, items_dir)

        evaluator = EvalModel(recsys,
                              KFoldPartitioning(),
                              metric_list=[Precision()])

        # fit() is expected to return exactly two values: system-wide results
        # first, per-user results second.
        system_frame, per_user_frame = evaluator.fit()

        for frame in (system_frame, per_user_frame):
            self.assertIsInstance(frame, pd.DataFrame)
    def test_all(self):
        """Smoke-test EvalModel.fit with a content-based recommender and many metrics.

        Ratings are imported from the ``new_ratings.csv`` example dataset and
        fed to a ContentBasedRS using a LinearPredictor. The evaluation runs
        with KFoldPartitioning and TestItemsMethodology over the metric list
        below. No assertion is made on the output: the test only requires
        that ``fit`` completes without raising.
        """
        ratings_filename = os.path.join(contents_path, '..', 'datasets',
                                        'examples', 'new_ratings.csv')

        ratings_frame = RatingsImporter(
            CSVFile(ratings_filename)).import_ratings()

        rs = ContentBasedRS(
            LinearPredictor(
                {"Plot": ['tfidf', 'embedding']},
                SkLinearRegression(),
            ), ratings_frame, items_dir)

        # Catalog: extension-less names of the regular files in items_dir
        # whose file name ends with 'xz'.
        catalog = {
            os.path.splitext(f)[0] for f in os.listdir(items_dir)
            if os.path.isfile(os.path.join(items_dir, f)) and f.endswith('xz')
        }

        em = EvalModel(rs,
                       KFoldPartitioning(),
                       metric_list=[
                           Precision(sys_average='micro'),
                           PrecisionAtK(1, sys_average='micro'),
                           RPrecision(),
                           Recall(),
                           RecallAtK(3),
                           FMeasure(1, sys_average='macro'),
                           FMeasureAtK(2, beta=1, sys_average='micro'),
                           NDCG(),
                           NDCGAtK(3),
                           MRR(),
                           MRRAtK(5),
                           Correlation('pearson', top_n=5),
                           Correlation('kendall', top_n=3),
                           Correlation('spearman', top_n=4),
                           MAE(),
                           MSE(),
                           RMSE(),
                           CatalogCoverage(catalog),
                           CatalogCoverage(catalog, k=2),
                           CatalogCoverage(catalog, top_n=3),
                           GiniIndex(),
                           GiniIndex(top_n=3),
                           DeltaGap({
                               'primo': 0.5,
                               'secondo': 0.5
                           })
                       ],
                       methodology=TestItemsMethodology())

        # The return value is intentionally not bound: nothing is checked on it.
        em.fit()
Exemple #8
0
    def test_graph(self):
        """Smoke-test EvalModel.fit on a graph-based recommender with exogenous properties.

        An NXFullGraph is built from ``ratings`` plus user and item content
        directories, wrapped in a GraphBasedRS using NXPageRank, and evaluated
        with KFoldPartitioning and TestItemsMethodology. No assertion is made
        on the result: the test only requires that ``fit`` completes without
        raising.
        """
        catalog = set(ratings.to_id)

        user_contents_path = os.path.join(dir_test_files, 'complex_contents',
                                          'users_codified/')

        # '1' is the column in the users .DAT which identifies the gender
        gender_column = ['1']

        full_graph = NXFullGraph(
            ratings,
            user_contents_dir=user_contents_path,
            item_contents_dir=items_dir,
            item_exo_representation="dbpedia",
            user_exo_representation='local',
            item_exo_properties=['starring'],
            user_exo_properties=gender_column,
        )

        recsys = GraphBasedRS(NXPageRank(), full_graph)

        # Collaborators are created in the same order in which the original
        # single call expression evaluated them.
        partitioning = KFoldPartitioning()
        metrics = [
            Precision(relevant_threshold=3),
            Recall(),
            FMeasure(beta=1),
            FMeasure(beta=2, sys_average='micro'),
            MRR(),
            Correlation('pearson'),
            GiniIndex(),
            DeltaGap({'popular': 0.5, 'niche': 0.5}),
            PredictionCoverage(catalog),
            PopProfileVsRecs(user_groups={'popular': 0.5, 'niche': 0.5},
                             out_dir='plots/'),
            LongTailDistr('plots/', format='svg'),
            PopRecsCorrelation('plots/'),
        ]
        methodology = TestItemsMethodology()

        evaluator = EvalModel(recsys,
                              partitioning,
                              metric_list=metrics,
                              verbose_predictions=True,
                              methodology=methodology)

        evaluator.fit()
    def test_fit_graph_w_allitems_methodology(self):
        """Check that EvalModel.fit returns two DataFrames for a graph-based
        recommender evaluated with AllItemsMethodology."""
        full_graph = NXFullGraph(ratings)
        recsys = GraphBasedRS(NXPageRank(), full_graph)

        # Item ids: extension-less names of the regular files in items_dir
        # whose file name ends with 'xz'.
        item_ids = {
            os.path.splitext(entry)[0]
            for entry in os.listdir(items_dir)
            if os.path.isfile(os.path.join(items_dir, entry))
            and entry.endswith('xz')
        }

        evaluator = EvalModel(recsys,
                              KFoldPartitioning(),
                              metric_list=[Precision()],
                              methodology=AllItemsMethodology(item_ids))

        # fit() is expected to return exactly two values: system-wide results
        # first, per-user results second.
        system_frame, per_user_frame = evaluator.fit()

        for frame in (system_frame, per_user_frame):
            self.assertIsInstance(frame, pd.DataFrame)
    def test_fit_cb_w_allitems_methodology(self):
        """Check that EvalModel.fit returns two DataFrames for a content-based
        recommender evaluated with AllItemsMethodology."""
        algorithm = CentroidVector({"Plot": "tfidf"}, CosineSimilarity())
        recsys = ContentBasedRS(algorithm, ratings, items_dir)

        # Item ids: extension-less names of the regular files in items_dir
        # whose file name ends with 'xz'.
        item_ids = {
            os.path.splitext(entry)[0]
            for entry in os.listdir(items_dir)
            if os.path.isfile(os.path.join(items_dir, entry))
            and entry.endswith('xz')
        }

        evaluator = EvalModel(recsys,
                              KFoldPartitioning(),
                              metric_list=[Precision()],
                              methodology=AllItemsMethodology(item_ids))

        # fit() is expected to return exactly two values: system-wide results
        # first, per-user results second.
        system_frame, per_user_frame = evaluator.fit()

        for frame in (system_frame, per_user_frame):
            self.assertIsInstance(frame, pd.DataFrame)
    def test_all_skipping_user_exception(self):
        """Check that split_all leaves out a user that has a single rating.

        User "003" has only one rating, so it must be absent from both the
        train and the test part of every split, while users "001", "002" and
        "004" must be present in both.
        """
        ratings_by_column = {
            'from_id': ["001", "001", "001", "001", "002", "002", "002", "003", "004", "004"],
            'to_id': ["iphone", "ps4", "ps5", "xbox", "realme", "airpods", "ps4", "beats", "ps4", "ps5"],
            'rating': [0.8, 0.7, -0.4, 1.0, 0.8, 0.7, -0.4, 1.0, 0.3, 0.6],
        }
        multi_user_frame = pd.DataFrame(ratings_by_column)

        fold_count = 2
        module = PartitionModule(KFoldPartitioning(fold_count))
        produced_splits = module.split_all(multi_user_frame,
                                           set(multi_user_frame.from_id))

        # Only the number of splits is verified here: the partitioning
        # technique itself has its own dedicated tests.
        self.assertEqual(len(produced_splits), fold_count)

        # Every user is expected in both parts, except 003 which is skipped
        # since it has only 1 rating.
        membership_checks = (
            ('001', self.assertIn),
            ('002', self.assertIn),
            ('003', self.assertNotIn),
            ('004', self.assertIn),
        )
        for fold in produced_splits:
            for user_id, check in membership_checks:
                check(user_id, fold.train['from_id'].values)
                check(user_id, fold.test['from_id'].values)