Ejemplo n.º 1
0
    def test_index_query(self):
        """Drive the IndexQuery algorithm through ContentBasedRS for 'A000'.

        Ranking calls (plain, restricted to a filter list, and top-n) must
        produce results, while every score-prediction call is expected to
        raise NotPredictionAlg.
        """
        index_path = os.path.join(dir_test_files, 'complex_contents',
                                  'index/')
        wanted_items = ['tt0114319', 'tt0114388']
        top_n = 3

        query_alg = IndexQuery(
            {'Plot': ['index_original', 'index_preprocessed']})
        recsys = ContentBasedRS(query_alg, ratings, index_path)

        # Score prediction is expected to be rejected with this algorithm
        self.assertRaises(NotPredictionAlg, recsys.fit_predict, 'A000')

        # A plain ranking only has to be non-empty
        ranking = recsys.fit_rank('A000')
        self.assertGreater(len(ranking), 0)

        # Same two checks, this time restricted to the chosen items:
        # prediction must still raise, ranking must still return rows
        self.assertRaises(NotPredictionAlg, recsys.fit_predict, 'A000',
                          filter_list=wanted_items)

        ranking_on_items = recsys.fit_rank('A000', filter_list=wanted_items)
        self.assertGreater(len(ranking_on_items), 0)

        # Top-n ranking must return exactly the requested number of rows
        ranking_top_n = recsys.fit_rank('A000', recs_number=top_n)
        self.assertEqual(len(ranking_top_n), top_n)
Ejemplo n.º 2
0
    def test_linear_predictor(self):
        """Check result sizes of LinearPredictor predictions and rankings.

        For user 'A000', filtered prediction and filtered ranking must each
        return one row per item in ``self.filter_list``; a top-n ranking
        must return exactly the requested number of rows.
        """
        top_n = 3

        predictor = LinearPredictor({'Plot': ['tfidf', 'embedding']},
                                    SkLinearRegression())
        recsys = ContentBasedRS(predictor, ratings, self.movies_multiple)

        # Score prediction restricted to the filter list
        predictions = recsys.fit_predict('A000',
                                         filter_list=self.filter_list)
        self.assertEqual(len(predictions), len(self.filter_list))

        # Ranking restricted to the filter list
        ranking_on_items = recsys.fit_rank('A000',
                                           filter_list=self.filter_list)
        self.assertEqual(len(ranking_on_items), len(self.filter_list))

        # Top-n ranking over all candidate items
        ranking_top_n = recsys.fit_rank('A000', recs_number=top_n)
        self.assertEqual(len(ranking_top_n), top_n)
Ejemplo n.º 3
0
    def test_fit_cb_w_testrating_methodology(self):
        """Evaluate a CentroidVector recommender without passing a methodology.

        No ``methodology`` argument is given to EvalModel, so whatever its
        default is gets used. The test only checks that ``fit`` returns two
        pandas DataFrames (system-level and per-user results).
        """
        algorithm = CentroidVector({"Plot": "tfidf"}, CosineSimilarity())
        recsys = ContentBasedRS(algorithm, ratings, items_dir)

        evaluator = EvalModel(recsys,
                              KFoldPartitioning(),
                              metric_list=[Precision()])

        sys_result, users_result = evaluator.fit()

        for frame in (sys_result, users_result):
            self.assertIsInstance(frame, pd.DataFrame)
Ejemplo n.º 4
0
    def test_all(self):
        """Smoke-test EvalModel.fit with a broad list of metrics.

        Ratings are imported from the example CSV, a LinearPredictor-based
        recommender is built on ``items_dir``, and the evaluation is run
        with TestItemsMethodology. Nothing is asserted on the output: the
        test passes as long as ``fit`` does not raise.
        """
        csv_path = os.path.join(contents_path, '..', 'datasets', 'examples',
                                'new_ratings.csv')
        imported_ratings = RatingsImporter(CSVFile(csv_path)).import_ratings()

        predictor = LinearPredictor({"Plot": ['tfidf', 'embedding']},
                                    SkLinearRegression())
        recsys = ContentBasedRS(predictor, imported_ratings, items_dir)

        # Catalog = names (extension stripped) of the regular files in
        # items_dir whose name ends with 'xz'
        catalog = {
            os.path.splitext(entry)[0]
            for entry in os.listdir(items_dir)
            if os.path.isfile(os.path.join(items_dir, entry))
            and entry.endswith('xz')
        }

        metrics = [
            Precision(sys_average='micro'),
            PrecisionAtK(1, sys_average='micro'),
            RPrecision(),
            Recall(),
            RecallAtK(3),
            FMeasure(1, sys_average='macro'),
            FMeasureAtK(2, beta=1, sys_average='micro'),
            NDCG(),
            NDCGAtK(3),
            MRR(),
            MRRAtK(5),
            Correlation('pearson', top_n=5),
            Correlation('kendall', top_n=3),
            Correlation('spearman', top_n=4),
            MAE(),
            MSE(),
            RMSE(),
            CatalogCoverage(catalog),
            CatalogCoverage(catalog, k=2),
            CatalogCoverage(catalog, top_n=3),
            GiniIndex(),
            GiniIndex(top_n=3),
            DeltaGap({'primo': 0.5, 'secondo': 0.5}),
        ]

        evaluator = EvalModel(recsys,
                              KFoldPartitioning(),
                              metric_list=metrics,
                              methodology=TestItemsMethodology())

        # The return value is intentionally ignored (see docstring)
        evaluator.fit()
Ejemplo n.º 5
0
    def test_multiple(self):
        """Check the multi-user prediction and ranking entry points.

        For users 'A000' and 'A001', every returned frame must contain
        exactly those users in its 'from_id' column, and each user must get
        the expected number of rows: one per filtered item, or
        ``recs_number`` rows for the top-n ranking.
        """
        top_n = 3
        user_id_list = ['A000', 'A001']

        def check_rows_per_user(frame, expected_rows):
            # All requested users (and only them) must appear in the output
            self.assertEqual(set(user_id_list), set(frame['from_id']))
            # NOTE: the loop variable must stay named ``user`` because the
            # query string refers to it through pandas' ``@`` syntax
            for user in user_id_list:
                self.assertEqual(len(frame.query('from_id == @user')),
                                 expected_rows)

        predictor = LinearPredictor({'Plot': ['tfidf', 'embedding']},
                                    SkLinearRegression())
        recsys = ContentBasedRS(predictor, ratings, self.movies_multiple)

        # Score prediction restricted to the filter list
        predictions = recsys.multiple_fit_predict(
            user_id_list, filter_list=self.filter_list)
        check_rows_per_user(predictions, len(self.filter_list))

        # Ranking restricted to the filter list
        ranking_on_items = recsys.multiple_fit_rank(
            user_id_list, filter_list=self.filter_list)
        check_rows_per_user(ranking_on_items, len(self.filter_list))

        # Top-n ranking over all candidate items
        ranking_top_n = recsys.multiple_fit_rank(user_id_list,
                                                 recs_number=top_n)
        check_rows_per_user(ranking_top_n, top_n)
    def test_calc_scores_content_based(self):
        """Run PredictionCalculator with MAE as the only requested metric.

        The returned list of valid metrics must equal the requested one,
        ``ScoresNeededMetric.score_truth_list`` must be populated and
        ``RankingNeededMetric.rank_truth_list`` must stay empty.
        """
        predictor = LinearPredictor({'Plot': 'tfidf'}, SkLinearRegression())
        recsys = ContentBasedRS(predictor, self.ratings_original, movies_dir)

        # A single score-based metric is enough for this check
        requested = [MAE()]

        calculator = PredictionCalculator(self.split_list, recsys)
        accepted = calculator.calc_predictions(self.test_items_list,
                                               requested)

        self.assertEqual(accepted, requested)
        self.assertGreater(len(ScoresNeededMetric.score_truth_list), 0)
        # No ranking-based metric was requested, so nothing is expected here
        self.assertEqual(len(RankingNeededMetric.rank_truth_list), 0)
    def test_pop_invalid_metric(self):
        """Check that an unsupported metric is dropped from the valid ones.

        A ClassifierRecommender-based system is asked for MAE. The test
        expects MAE to be excluded: alone, nothing is computed at all;
        together with NDCG, only the ranking truth list gets populated.
        """
        classifier = ClassifierRecommender({'Plot': 'tfidf'},
                                           SkKNN(),
                                           threshold=3)
        recsys = ContentBasedRS(classifier, self.ratings_original, movies_dir)

        # --- Phase 1: only MAE is requested --------------------------------
        requested = [MAE()]
        accepted = PredictionCalculator(self.split_list,
                                        recsys).calc_predictions(
                                            self.test_items_list, requested)

        # MAE is rejected and, with no other metric left, nothing is computed
        self.assertEqual(len(accepted), 0)
        self.assertEqual(len(ScoresNeededMetric.score_truth_list), 0)
        self.assertEqual(len(RankingNeededMetric.rank_truth_list), 0)

        # --- Phase 2: MAE together with a ranking metric -------------------
        ranking_metric = NDCG()
        score_metric = MAE()
        requested = [score_metric, ranking_metric]
        accepted = PredictionCalculator(self.split_list,
                                        recsys).calc_predictions(
                                            self.test_items_list, requested)

        # NDCG survives while MAE is still excluded
        self.assertIn(ranking_metric, accepted)
        self.assertNotIn(score_metric, accepted)

        # Only the ranking truth list is expected to be filled
        self.assertEqual(len(ScoresNeededMetric.score_truth_list), 0)
        self.assertGreater(len(RankingNeededMetric.rank_truth_list), 0)
Ejemplo n.º 8
0
    def test_fit_cb_w_allitems_methodology(self):
        """Evaluate a CentroidVector recommender with AllItemsMethodology.

        The methodology receives the set of item names found in
        ``items_dir``. The test only checks that ``fit`` returns two pandas
        DataFrames (system-level and per-user results).
        """
        algorithm = CentroidVector({"Plot": "tfidf"}, CosineSimilarity())
        recsys = ContentBasedRS(algorithm, ratings, items_dir)

        # Item names = regular files in items_dir ending with 'xz',
        # with the extension stripped
        all_items = {
            os.path.splitext(entry)[0]
            for entry in os.listdir(items_dir)
            if os.path.isfile(os.path.join(items_dir, entry))
            and entry.endswith('xz')
        }

        evaluator = EvalModel(recsys,
                              KFoldPartitioning(),
                              metric_list=[Precision()],
                              methodology=AllItemsMethodology(all_items))

        sys_result, users_result = evaluator.fit()

        for frame in (sys_result, users_result):
            self.assertIsInstance(frame, pd.DataFrame)
    def test_calc_rank_content_based(self):
        """Run PredictionCalculator with NDCG as the only requested metric.

        The returned list of valid metrics must equal the requested one,
        ``RankingNeededMetric.rank_truth_list`` must be populated and
        ``ScoresNeededMetric.score_truth_list`` must stay empty.
        """
        classifier = ClassifierRecommender({'Plot': 'tfidf'},
                                           SkKNN(),
                                           threshold=3)
        recsys = ContentBasedRS(classifier, self.ratings_original, movies_dir)

        # A single ranking-based metric is enough for this check
        requested = [NDCG()]

        calculator = PredictionCalculator(self.split_list, recsys)
        accepted = calculator.calc_predictions(self.test_items_list,
                                               requested)

        self.assertEqual(accepted, requested)
        self.assertGreater(len(RankingNeededMetric.rank_truth_list), 0)
        # No score-based metric was requested, so nothing is expected here
        self.assertEqual(len(ScoresNeededMetric.score_truth_list), 0)
Ejemplo n.º 10
0
    def test_classifier_recommender(self):
        """Exercise ClassifierRecommender through ContentBasedRS for 'A000'.

        Score prediction is expected to raise NotPredictionAlg; a filtered
        ranking must return one row per item in ``self.filter_list`` and a
        top-n ranking exactly the requested number of rows.
        """
        top_n = 3

        classifier = ClassifierRecommender({'Plot': ['tfidf', 'embedding']},
                                           SkSVC())
        recsys = ContentBasedRS(classifier, ratings, self.movies_multiple)

        # Score prediction is expected to be rejected with this algorithm
        self.assertRaises(NotPredictionAlg, recsys.fit_predict, 'A000')

        # Ranking restricted to the filter list
        ranking_on_items = recsys.fit_rank('A000',
                                           filter_list=self.filter_list)
        self.assertEqual(len(ranking_on_items), len(self.filter_list))

        # Top-n ranking over all candidate items
        ranking_top_n = recsys.fit_rank('A000', recs_number=top_n)
        self.assertEqual(len(ranking_top_n), top_n)
Ejemplo n.º 11
0
    def test_empty_frame(self):
        """Check that degenerate rating frames produce an empty ranking.

        Three single-row rating frames are built for user 'A000': one with
        a score of 5, one with a score of 1, and one pointing at an item id
        of 'not exists'. ClassifierRecommender is checked against all three,
        CentroidVector against the last two; ``fit_rank`` must return an
        empty frame every time.
        """

        def single_rating(item_id, score):
            # One-row ratings frame for user 'A000'
            return pd.DataFrame.from_records(
                [("A000", item_id, score, "54654675")],
                columns=["from_id", "to_id", "score", "timestamp"])

        ratings_only_positive = single_rating("tt0114576", 5)
        ratings_only_negative = single_rating("tt0114576", 1)
        ratings_item_inexistent = single_rating("not exists", 1)

        # ClassifierRecommender: a fresh algorithm instance per rating frame
        classifier_cases = (ratings_only_positive, ratings_only_negative,
                            ratings_item_inexistent)
        for rating_frame in classifier_cases:
            algorithm = ClassifierRecommender(
                {'Plot': ['tfidf', 'embedding']}, SkSVC(), threshold=3)
            recsys = ContentBasedRS(algorithm, rating_frame,
                                    self.movies_multiple)
            self.assertTrue(recsys.fit_rank('A000').empty)

        # CentroidVector: a fresh algorithm instance per rating frame
        centroid_cases = (ratings_only_negative, ratings_item_inexistent)
        for rating_frame in centroid_cases:
            algorithm = CentroidVector({'Plot': ['tfidf', 'embedding']},
                                       CosineSimilarity(),
                                       threshold=3)
            recsys = ContentBasedRS(algorithm, rating_frame,
                                    self.movies_multiple)
            self.assertTrue(recsys.fit_rank('A000').empty)