Esempio n. 1
0
    def compute_optimal_ranking_set(self, fold, input_cutoff, output_cutoff):
        """Build a ranking set that lists each user's hits before misses.

        For every user returned by ``_compute_recommended_in_fold(fold,
        input_cutoff)``, the recommended items found in that user's entry of
        ``self.hits_by_fold[fold]`` are placed first, followed by the
        remaining recommended items; the row is cut at ``output_cutoff``.

        Args:
            fold: Key into ``self.hits_by_fold``; also used in the id of the
                returned ranking set.
            input_cutoff: Cutoff forwarded to
                ``_compute_recommended_in_fold``.
            output_cutoff: Number of columns in the resulting matrix.

        Returns:
            A ``dataset_io.RankingSet`` with id
            ``RankingSetId(fold, 'MAPOracle')`` and one matrix row per user.
            Positions that cannot be filled because a user has fewer than
            ``output_cutoff`` candidates hold NaN.
        """
        hits_by_user = self.hits_by_fold[fold]
        recommended_by_user = self._compute_recommended_in_fold(
            fold, input_cutoff)

        # Start from NaN instead of uninitialised memory so that rows with
        # too few candidates are padded deterministically.
        matrix = np.full((len(recommended_by_user), output_cutoff), np.nan)
        user_ids = []

        for row, (user_id, candidates) in enumerate(
                recommended_by_user.items()):
            user_ids.append(user_id)

            # A user without an entry in the mapping is treated as having
            # no hits instead of raising KeyError.
            user_hits = hits_by_user.get(user_id, ())

            # Single pass: each group keeps the iteration order of
            # `candidates`.
            hits, misses = [], []
            for item_id in candidates:
                if item_id in user_hits:
                    hits.append(item_id)
                else:
                    misses.append(item_id)

            ranked = (hits + misses)[:output_cutoff]
            # Slice assignment also copes with output_cutoff == 0 and with
            # rows shorter than the cutoff.
            matrix[row, :len(ranked)] = ranked

        ranking_set_id = dataset_io.RankingSetId(fold, 'MAPOracle')
        return dataset_io.RankingSet(ranking_set_id, matrix, user_ids)
Esempio n. 2
0
    def test_compute_optimal_ranking_set(self):
        # One user (id 1) was recommended items 1, 2 and 3 in fold 'u1';
        # the oracle is asked for the best two of them.
        source_ranking = dataset_io.RankingSet(
            id=dataset_io.RankingSetId('u1', 'Alg'),
            matrix=np.array([[1, 2, 3]]),
            user_ids=[1],
        )

        ratings = dataset_io.RatingSet(fold='u1')
        ratings.base = pd.DataFrame.from_records(
            columns=['user_id', 'item_id', 'rating'],
            data=[],
        )

        oracle = eild_oracle.EILDOracle(
            {source_ranking.id: source_ranking},
            {'u1': ratings},
        )

        # Pairwise distances between items 1, 2 and 3, in that order.
        distances = np.array([
            [0, 0.5, 0.2],
            [0.5, 0, 0.9],
            [0.2, 0.9, 0],
        ])
        oracle.distances_by_fold = {'u1': (distances, [1, 2, 3])}

        result = oracle.compute_optimal_ranking_set(
            'u1', input_cutoff=3, output_cutoff=2)

        self.assertEqual([1], result.user_ids)

        first_row = result.matrix[0, :]
        self.assertEqual([2, 3], list(first_row))
Esempio n. 3
0
  def test_compute_optimal_ranking_set(self):
    # Two users with four recommendations each; the oracle is asked for the
    # three best items per user given the popularity table below.
    source_ranking = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([
            [1, 2, 3, 4],
            [5, 6, 7, 8],
        ]),
        user_ids=[1, 2],
    )

    ratings = dataset_io.RatingSet(fold='u1')
    ratings.base = pd.DataFrame.from_records(
        columns=['user_id', 'item_id', 'rating'],
        data=[],
    )

    oracle = epc_oracle.EPCOracle(
        {source_ranking.id: source_ranking},
        {'u1': ratings},
    )

    popularity = {
        1: 1., 2: 0.25, 3: 0.5, 4: 0.75,
        5: 0.8, 6: 0.3, 7: 0.6, 8: 0.01,
    }
    oracle.popularity_by_fold = {'u1': popularity}

    result = oracle.compute_optimal_ranking_set(
        'u1', input_cutoff=4, output_cutoff=3)

    self.assertEqual([1, 2], result.user_ids)

    rows = result.matrix
    self.assertEqual([2, 3, 4], list(rows[0, :]))
    self.assertEqual([8, 6, 7], list(rows[1, :]))
Esempio n. 4
0
    def test_ranking_with_single_item(self):
        # A ranking that holds a single item is expected to score 0.
        single_item_ranking = dataset_io.RankingSet(
            id=dataset_io.RankingSetId('u1', 'Alg'),
            matrix=np.array([[1]]),
            user_ids=[],
        )

        score = self.eild.compute(single_item_ranking)

        self.assertAlmostEqual(0, score)
Esempio n. 5
0
    def test_ranking_with_item_not_in_ratings(self):
        # The distance table only knows item 1, while the ranking also
        # contains item 2; the expected score is 0.
        self.eild.distances_by_fold = {'u1': (np.array([[1]]), [1])}

        ranking = dataset_io.RankingSet(
            id=dataset_io.RankingSetId('u1', 'Alg'),
            matrix=np.array([[1, 2]]),
            user_ids=[],
        )

        score = self.eild.compute(ranking)

        self.assertAlmostEqual(0, score)
Esempio n. 6
0
    def test_returns_mean_for_many_rankings(self):
        # User 1 has item 1 as a hit, user 2 has no hits at all. Both are
        # recommended item 1, and the expected overall value is 0.5.
        # `set()` is used for the empty case because `{}` would create an
        # empty dict rather than an empty set of hits.
        hits_by_user = {1: {1}, 2: set()}
        self.map.hits_by_fold = {'u1': hits_by_user}

        ranking_matrix = np.array([[1], [1]])
        ranking_set = dataset_io.RankingSet(
            id=dataset_io.RankingSetId('u1', 'Alg'),
            matrix=ranking_matrix,
            user_ids=[1, 2])

        self.assertAlmostEqual(0.5, self.map.compute(ranking_set))
Esempio n. 7
0
    def test_ranking_with_single_miss(self):
        # User 1 is recommended exactly item 1; the expected MAP is 1.
        # NOTE(review): the test name says "miss", yet item 1 is in user 1's
        # hit set and the expected value is 1 -- confirm the intended name.
        self.map.hits_by_fold = {'u1': {1: {1}}}

        ranking = dataset_io.RankingSet(
            id=dataset_io.RankingSetId('u1', 'Alg'),
            matrix=np.array([[1]]),
            user_ids=[1],
        )

        score = self.map.compute(ranking)

        self.assertAlmostEqual(1, score)
Esempio n. 8
0
  def test_single_item(self):
    # A single recommended item with popularity 1 is expected to score 0.
    ranking = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1]]),
        user_ids=[],
    )

    popularity = pd.Series({1: 1})
    self.epc.popularity_by_fold = {'u1': popularity}

    score = self.epc.compute(ranking)

    self.assertAlmostEqual(0, score)
Esempio n. 9
0
  def test_returns_mean_for_many_rankings(self):
    # Two one-item rankings (item 1 and item 2); the expected result is the
    # average of the two per-ranking values.
    ranking = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1], [2]]),
        user_ids=[],
    )

    popularity = pd.Series({1: 0.5, 2: 0.3})
    self.epc.popularity_by_fold = {'u1': popularity}

    actual = self.epc.compute(ranking)

    expected = (0.5 + 0.7) / 2
    self.assertAlmostEqual(expected, actual)
Esempio n. 10
0
  def test_many_items(self):
    # One ranking with three items; the expected value weights each
    # position by successive powers of 0.85.
    ranking = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1, 2, 3]]),
        user_ids=[],
    )

    popularity = pd.Series({1: 0.5, 2: 0.3, 3: 0.1})
    self.epc.popularity_by_fold = {'u1': popularity}

    actual = self.epc.compute(ranking)

    weighted_sum = 0.5 + 0.7 * 0.85 + 0.9 * 0.85**2
    weight_total = 1 + 0.85 + 0.85**2
    self.assertAlmostEqual(weighted_sum / weight_total, actual)
Esempio n. 11
0
    def test_complex_ranking_with_missing_hits(self):
        # User 1 has hits {1, 3, 4}; the ranking [1, 2, 5, 3] contains hits
        # at positions 1 and 4 and never shows item 4.
        self.map.hits_by_fold = {'u1': {1: {1, 3, 4}}}

        ranking = dataset_io.RankingSet(
            id=dataset_io.RankingSetId('u1', 'Alg'),
            matrix=np.array([[1, 2, 5, 3]]),
            user_ids=[1],
        )

        expected = (1 + 2 / 4) / 3
        actual = self.map.compute(ranking)

        self.assertAlmostEqual(expected, actual)
Esempio n. 12
0
  def test_compute_recommended_in_fold(self):
    # Ranking sets exist for folds 'u1' and 'u2'; only the first three
    # recommendations of fold 'u1' are expected in the result.
    fold_u1 = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([
            [1, 2, 3, 4],
            [5, 6, 7, 8],
        ]),
        user_ids=[1, 2],
    )
    fold_u2 = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u2', 'Alg'),
        matrix=np.array([
            [1, 2, 3, 4],
            [5, 6, 7, 8],
        ]),
        user_ids=[2, 3],
    )

    oracle = oracle_utils.Oracle(
        {fold_u1.id: fold_u1, fold_u2.id: fold_u2},
        rating_set_by_fold={},
    )

    actual = oracle._compute_recommended_in_fold('u1', cutoff=3)

    self.assertDictEqual({1: {1, 2, 3}, 2: {5, 6, 7}}, actual)
Esempio n. 13
0
  def compute_optimal_ranking_set(self, fold, input_cutoff, output_cutoff):
    """Build a ranking set that prefers items with the largest distances.

    Every item is scored by the column sum of the fold's distance matrix;
    items absent from that matrix score 0. For each user returned by
    ``_compute_recommended_in_fold(fold, input_cutoff)`` the
    ``output_cutoff`` highest-scoring candidates are kept, best first.

    Args:
      fold: Key into ``self.distances_by_fold``; also used in the id of the
        returned ranking set.
      input_cutoff: Cutoff forwarded to ``_compute_recommended_in_fold``.
      output_cutoff: Number of columns in the resulting matrix.

    Returns:
      A ``dataset_io.RankingSet`` with id
      ``RankingSetId(fold, 'EILDOracle')`` and one matrix row per user.
      Positions that cannot be filled because a user has fewer than
      ``output_cutoff`` candidates hold NaN.
    """
    distance_matrix, item_ids = self.distances_by_fold[fold]

    # Column sums give the same ordering as column means, since every
    # column has the same number of rows.
    total_distance_by_item = pd.Series(
        distance_matrix.sum(axis=0), index=item_ids)

    recommended_by_user = self._compute_recommended_in_fold(fold, input_cutoff)

    # NaN-filled instead of uninitialised, so short rows are padded
    # deterministically.
    matrix = np.full((len(recommended_by_user), output_cutoff), np.nan)
    user_ids = []

    def total_distance(item_id):
      return total_distance_by_item.get(item_id, 0.)

    for row, (user_id, candidates) in enumerate(recommended_by_user.items()):
      user_ids.append(user_id)

      best = heapq.nlargest(output_cutoff, candidates, key=total_distance)
      # Assign only the filled prefix: writing a shorter list to the whole
      # row would either broadcast a single item across every column or
      # raise ValueError.
      matrix[row, :len(best)] = best

    ranking_set_id = dataset_io.RankingSetId(fold, 'EILDOracle')
    return dataset_io.RankingSet(ranking_set_id, matrix, user_ids)
Esempio n. 14
0
    def compute_optimal_ranking_set(self, fold, input_cutoff, output_cutoff):
        """Build a ranking set that prefers the least popular items.

        For each user returned by ``_compute_recommended_in_fold(fold,
        input_cutoff)`` the ``output_cutoff`` candidates with the smallest
        value in ``self.popularity_by_fold[fold]`` are kept, least popular
        first. Items without a popularity entry are given a very large
        value and therefore sort last.

        Args:
            fold: Key into ``self.popularity_by_fold``; also used in the id
                of the returned ranking set.
            input_cutoff: Cutoff forwarded to
                ``_compute_recommended_in_fold``.
            output_cutoff: Number of columns in the resulting matrix.

        Returns:
            A ``dataset_io.RankingSet`` with id
            ``RankingSetId(fold, 'EPCOracle')`` and one matrix row per user.
            Positions that cannot be filled because a user has fewer than
            ``output_cutoff`` candidates hold NaN.
        """
        # Stand-in popularity for items missing from the table.
        unknown_popularity = 99999999.

        popularity_by_item = self.popularity_by_fold[fold]
        recommended_by_user = self._compute_recommended_in_fold(
            fold, input_cutoff)

        # NaN-filled instead of uninitialised, so short rows are padded
        # deterministically.
        matrix = np.full((len(recommended_by_user), output_cutoff), np.nan)
        user_ids = []

        def popularity(item_id):
            return popularity_by_item.get(item_id, unknown_popularity)

        for row, (user_id, candidates) in enumerate(
                recommended_by_user.items()):
            user_ids.append(user_id)

            best = heapq.nsmallest(output_cutoff, candidates, key=popularity)
            # Assign only the filled prefix: writing a shorter list to the
            # whole row would either broadcast a single item across every
            # column or raise ValueError.
            matrix[row, :len(best)] = best

        ranking_set_id = dataset_io.RankingSetId(fold, 'EPCOracle')
        return dataset_io.RankingSet(ranking_set_id, matrix, user_ids)
Esempio n. 15
0
    def test_compute_optimal_ranking_set(self):
        # Two users with four recommendations each. The test ratings cover
        # items 3 and 2 for user 1, and items 1 and 8 for user 2.
        source_ranking = dataset_io.RankingSet(
            id=dataset_io.RankingSetId('u1', 'Alg'),
            matrix=np.array([
                [1, 2, 3, 4],
                [5, 6, 7, 8],
            ]),
            user_ids=[1, 2],
        )

        ratings = dataset_io.RatingSet(fold='u1')
        ratings.base = pd.DataFrame.from_records(
            columns=['user_id', 'item_id', 'rating'],
            data=[(4, 1, 5)],
        )
        ratings.test = pd.DataFrame.from_records(
            columns=['user_id', 'item_id', 'rating'],
            data=[(1, 3, 5), (1, 2, 5), (2, 1, 5), (2, 8, 5)],
        )

        oracle = map_oracle.MAPOracle(
            {source_ranking.id: source_ranking},
            {'u1': ratings},
        )

        result = oracle.compute_optimal_ranking_set(
            'u1', input_cutoff=4, output_cutoff=3)

        self.assertEqual([1, 2], result.user_ids)

        rows = result.matrix

        # User 1: the two rated items lead, in either order.
        self.assertEqual(3, len(rows[0, :]))
        self.assertSetEqual({2, 3}, set(rows[0, 0:2]))
        self.assertTrue(set(rows[0, 2:]).issubset({1, 4}))

        # User 2: only item 8 is both rated and recommended.
        self.assertEqual(3, len(rows[1, :]))
        self.assertSetEqual({8}, set(rows[1, 0:1]))
        self.assertTrue(set(rows[1, 1:]).issubset({5, 6, 7}))
Esempio n. 16
0
    def test_many_items(self):
        # One ranking over three items; the expected value combines the
        # per-position terms with weights 1, 0.85 and 0.85**2.
        d12 = 2 / math.sqrt(6)
        d13 = 1 / math.sqrt(3)
        d23 = 1 / math.sqrt(2)
        distances = np.array([
            [1, d12, d13],
            [d12, 1, d23],
            [d13, d23, 1],
        ])
        self.eild.distances_by_fold = {'u1': (distances, [1, 2, 3])}

        ranking = dataset_io.RankingSet(
            id=dataset_io.RankingSetId('u1', 'Alg'),
            matrix=np.array([[1, 2, 3]]),
            user_ids=[],
        )

        eild_k0 = (d12 + 0.85 / math.sqrt(3)) / 1.85
        eild_k1 = (d12 + d23) / 2
        eild_k2 = (d13 + d23) / 2

        weighted_sum = eild_k0 + eild_k1 * 0.85 + eild_k2 * 0.85**2
        weight_total = 1 + 0.85 + 0.85**2
        expected = weighted_sum / weight_total

        self.assertAlmostEqual(expected, self.eild.compute(ranking))
Esempio n. 17
0
    def test_returns_mean_for_many_rankings(self):
        # Two two-item rankings ([1, 2] and [2, 3]); the expected result is
        # the average of the two per-ranking values.
        d12 = 2 / math.sqrt(6)
        d13 = 1 / math.sqrt(3)
        d23 = 1 / math.sqrt(2)
        distances = np.array([
            [1, d12, d13],
            [d12, 1, d23],
            [d13, d23, 1],
        ])
        self.eild.distances_by_fold = {'u1': (distances, [1, 2, 3])}

        ranking = dataset_io.RankingSet(
            id=dataset_io.RankingSetId('u1', 'Alg'),
            matrix=np.array([[1, 2], [2, 3]]),
            user_ids=[],
        )

        # With two items per ranking, both positions see the same distance.
        first_user = (d12 + d12 * 0.85) / (1 + 0.85)
        second_user = (d23 + d23 * 0.85) / (1 + 0.85)
        expected = (first_user + second_user) / 2

        self.assertAlmostEqual(expected, self.eild.compute(ranking))