def compute_optimal_ranking_set(self, fold, input_cutoff, output_cutoff):
    """Build the ranking set that places each user's hits first.

    For every user returned by
    ``self._compute_recommended_in_fold(fold, input_cutoff)``, the recommended
    items found in ``self.hits_by_fold[fold][user_id]`` are written first,
    followed by the remaining recommended items, and the row is truncated to
    ``output_cutoff`` entries.

    Args:
        fold: Key into ``self.hits_by_fold`` selecting the fold.
        input_cutoff: Cutoff forwarded to ``_compute_recommended_in_fold``.
        output_cutoff: Number of columns of the resulting ranking matrix.

    Returns:
        A ``dataset_io.RankingSet`` with id ``(fold, 'MAPOracle')`` and one
        row per user. Cells that cannot be filled because a user has fewer
        than ``output_cutoff`` recommended items contain ``NaN``.
    """
    hits_by_user = self.hits_by_fold[fold]
    recommended_to_user = self._compute_recommended_in_fold(
        fold, input_cutoff)

    # np.ndarray(shape) leaves the buffer uninitialised; pre-fill with NaN so
    # rows shorter than output_cutoff never expose arbitrary memory contents.
    matrix = np.full((len(recommended_to_user), output_cutoff), np.nan)
    user_ids = []
    for user_index, (user_id, recommended_items) in enumerate(
            recommended_to_user.items()):
        user_ids.append(user_id)

        # Single pass that partitions the candidates while keeping their
        # iteration order inside each group.
        hits, misses = [], []
        for item_id in recommended_items:
            bucket = hits if item_id in hits_by_user[user_id] else misses
            bucket.append(item_id)

        # Hits take the leading positions; misses only fill what is left.
        ordered = (hits + misses)[:output_cutoff]
        if ordered:
            matrix[user_index, :len(ordered)] = ordered

    ranking_set_id = dataset_io.RankingSetId(fold, 'MAPOracle')
    return dataset_io.RankingSet(ranking_set_id, matrix, user_ids)
def test_compute_optimal_ranking_set(self):
    """Check the EILD oracle output for a single user.

    With the distance matrix below, items 2 and 3 have the largest column
    sums, so they are the expected ranking (in that order) at cutoff 2.
    """
    source_ranking = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1, 2, 3]]),
        user_ids=[1])
    ratings = dataset_io.RatingSet(fold='u1')
    ratings.base = pd.DataFrame.from_records(
        columns=['user_id', 'item_id', 'rating'], data=[])
    oracle = eild_oracle.EILDOracle(
        {source_ranking.id: source_ranking}, {'u1': ratings})

    # Column sums: item 1 -> 0.7, item 2 -> 1.4, item 3 -> 1.1.
    pairwise_distances = np.array([
        [0, 0.5, 0.2],
        [0.5, 0, 0.9],
        [0.2, 0.9, 0],
    ])
    oracle.distances_by_fold = {'u1': (pairwise_distances, [1, 2, 3])}

    result = oracle.compute_optimal_ranking_set(
        'u1', input_cutoff=3, output_cutoff=2)

    self.assertEqual([1], result.user_ids)
    self.assertEqual([2, 3], list(result.matrix[0, :]))
def test_compute_optimal_ranking_set(self):
    """Check the EPC oracle output for two users.

    The expected rows list three items of each source ranking ordered by
    ascending popularity according to the table injected below.
    """
    source_ranking = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1, 2, 3, 4], [5, 6, 7, 8]]),
        user_ids=[1, 2])
    ratings = dataset_io.RatingSet(fold='u1')
    ratings.base = pd.DataFrame.from_records(
        columns=['user_id', 'item_id', 'rating'], data=[])
    oracle = epc_oracle.EPCOracle(
        {source_ranking.id: source_ranking}, {'u1': ratings})

    popularity = {
        # Items ranked for user 1.
        1: 1., 2: 0.25, 3: 0.5, 4: 0.75,
        # Items ranked for user 2.
        5: 0.8, 6: 0.3, 7: 0.6, 8: 0.01,
    }
    oracle.popularity_by_fold = {'u1': popularity}

    result = oracle.compute_optimal_ranking_set(
        'u1', input_cutoff=4, output_cutoff=3)

    self.assertEqual([1, 2], result.user_ids)
    rows = result.matrix
    self.assertEqual([2, 3, 4], list(rows[0, :]))
    self.assertEqual([8, 6, 7], list(rows[1, :]))
def test_ranking_with_single_item(self):
    """A ranking holding a single item is expected to yield an EILD of 0."""
    single_item_ranking = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1]]),
        user_ids=[])

    computed = self.eild.compute(single_item_ranking)

    self.assertAlmostEqual(0, computed)
def test_ranking_with_item_not_in_ratings(self):
    """A ranking with an item absent from the distance data should give 0.

    Only item 1 has distance information, while the ranking also contains
    item 2.
    """
    known_items = [1]
    distances = np.array([[1]])
    self.eild.distances_by_fold = {'u1': (distances, known_items)}

    ranking_set = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1, 2]]),
        user_ids=[])

    computed = self.eild.compute(ranking_set)
    self.assertAlmostEqual(0, computed)
def test_returns_mean_for_many_rankings(self):
    """MAP over several users is expected to be the mean across users.

    Both users are ranked item 1; it is a hit for user 1 only and user 2 has
    no hits at all, and the test expects an overall value of 0.5.
    """
    # ``{}`` is an empty dict, not an empty set; use ``set()`` so every
    # user's hits share the same container type.
    hits_by_user = {1: {1}, 2: set()}
    self.map.hits_by_fold = {'u1': hits_by_user}

    ranking_set = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1], [1]]),
        user_ids=[1, 2])

    self.assertAlmostEqual(0.5, self.map.compute(ranking_set))
def test_ranking_with_single_miss(self):
    """A one-item ranking whose item is among the user's hits scores 1.

    NOTE(review): the method name says "miss", but item 1 is in user 1's
    hits and the expected value is 1 -- confirm the intended name/fixture.
    """
    self.map.hits_by_fold = {'u1': {1: {1}}}

    ranking_set = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1]]),
        user_ids=[1])

    computed = self.map.compute(ranking_set)
    self.assertAlmostEqual(1, computed)
def test_single_item(self):
    """A single ranked item with popularity 1 is expected to give EPC 0."""
    ranking_set = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1]]),
        user_ids=[])
    popularity = pd.Series({1: 1})
    self.epc.popularity_by_fold = {'u1': popularity}

    self.assertAlmostEqual(0, self.epc.compute(ranking_set))
def test_returns_mean_for_many_rankings(self):
    """EPC over two one-item rankings is expected to be their mean.

    The expected per-ranking values 0.5 and 0.7 equal one minus the
    popularity of the ranked item.
    """
    ranking_set = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1], [2]]),
        user_ids=[])
    popularity = pd.Series({1: 0.5, 2: 0.3})
    self.epc.popularity_by_fold = {'u1': popularity}

    computed = self.epc.compute(ranking_set)

    self.assertAlmostEqual((0.5 + 0.7) / 2, computed)
def test_many_items(self):
    """EPC of a three-item ranking matches the hand-computed weighted mean.

    The expected value weights 0.5, 0.7 and 0.9 (one minus each item's
    popularity) by 0.85 raised to the item's zero-based rank.
    """
    ranking_set = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1, 2, 3]]),
        user_ids=[])
    popularity = pd.Series({1: 0.5, 2: 0.3, 3: 0.1})
    self.epc.popularity_by_fold = {'u1': popularity}

    computed = self.epc.compute(ranking_set)

    weighted_sum = 0.5 + 0.7 * 0.85 + 0.9 * 0.85**2
    total_weight = 1 + 0.85 + 0.85**2
    self.assertAlmostEqual(weighted_sum / total_weight, computed)
def test_complex_ranking_with_missing_hits(self):
    """MAP for a ranking that contains only some of the user's hits.

    User 1 has hits {1, 3, 4}; the ranking places hits at positions 1 and 4
    and never includes item 4, and the expected value divides by all three
    hits.
    """
    self.map.hits_by_fold = {'u1': {1: {1, 3, 4}}}

    ranking_set = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1, 2, 5, 3]]),
        user_ids=[1])

    expected = (1 + 2 / 4) / 3
    computed = self.map.compute(ranking_set)
    self.assertAlmostEqual(expected, computed)
def test_compute_recommended_in_fold(self):
    """Check the per-user recommended items returned for fold 'u1'.

    Two ranking sets (folds 'u1' and 'u2') are registered; the expected
    result holds, for each user of fold 'u1', the first three items of that
    user's row as a set.
    """
    def build_ranking_set(fold, users):
        # Both folds share the same item matrix and differ only in users.
        return dataset_io.RankingSet(
            id=dataset_io.RankingSetId(fold, 'Alg'),
            matrix=np.array([[1, 2, 3, 4], [5, 6, 7, 8]]),
            user_ids=users)

    fold_u1 = build_ranking_set('u1', [1, 2])
    fold_u2 = build_ranking_set('u2', [2, 3])
    oracle = oracle_utils.Oracle(
        {fold_u1.id: fold_u1, fold_u2.id: fold_u2},
        rating_set_by_fold={})

    actual = oracle._compute_recommended_in_fold('u1', cutoff=3)

    self.assertDictEqual({1: {1, 2, 3}, 2: {5, 6, 7}}, actual)
def compute_optimal_ranking_set(self, fold, input_cutoff, output_cutoff):
    """Build the ranking set favouring items with the largest summed distance.

    Each item is scored by the column sum of the fold's distance matrix
    (0 when the item has no entry). For every user returned by
    ``self._compute_recommended_in_fold(fold, input_cutoff)``, the
    ``output_cutoff`` highest-scoring recommended items form the row, best
    first.

    Args:
        fold: Key into ``self.distances_by_fold``; the value is unpacked as
            ``(distance_matrix, item_ids)``.
        input_cutoff: Cutoff forwarded to ``_compute_recommended_in_fold``.
        output_cutoff: Number of items kept per user.

    Returns:
        A ``dataset_io.RankingSet`` with id ``(fold, 'EILDOracle')``.
    """
    distances, distance_item_ids = self.distances_by_fold[fold]
    summed_distance = pd.Series(distances.sum(axis=0), index=distance_item_ids)

    def score(item_id):
        # Items missing from the distance data score 0.
        return summed_distance.get(item_id, 0.)

    recommended_by_user = self._compute_recommended_in_fold(fold, input_cutoff)
    matrix = np.empty((len(recommended_by_user), output_cutoff))
    user_ids = []
    for row, (user_id, candidates) in enumerate(recommended_by_user.items()):
        user_ids.append(user_id)
        # NOTE(review): the row assignment needs the selection to broadcast
        # to output_cutoff columns, so each user is presumably expected to
        # have at least output_cutoff candidates -- confirm with callers.
        matrix[row, :] = heapq.nlargest(output_cutoff, candidates, key=score)

    return dataset_io.RankingSet(
        dataset_io.RankingSetId(fold, 'EILDOracle'), matrix, user_ids)
def compute_optimal_ranking_set(self, fold, input_cutoff, output_cutoff):
    """Build the ranking set favouring the least popular recommended items.

    For every user returned by
    ``self._compute_recommended_in_fold(fold, input_cutoff)``, the
    ``output_cutoff`` recommended items with the smallest popularity form
    the row, least popular first. Items without a popularity entry are
    treated as having popularity ``99999999.``.

    Args:
        fold: Key into ``self.popularity_by_fold``.
        input_cutoff: Cutoff forwarded to ``_compute_recommended_in_fold``.
        output_cutoff: Number of items kept per user.

    Returns:
        A ``dataset_io.RankingSet`` with id ``(fold, 'EPCOracle')``.
    """
    popularity = self.popularity_by_fold[fold]

    def popularity_of(item_id):
        # Unknown items get a huge popularity so they sort last.
        return popularity.get(item_id, 99999999.)

    recommended_by_user = self._compute_recommended_in_fold(
        fold, input_cutoff)
    matrix = np.empty((len(recommended_by_user), output_cutoff))
    user_ids = []
    for row, (user_id, candidates) in enumerate(recommended_by_user.items()):
        user_ids.append(user_id)
        # NOTE(review): the row assignment needs the selection to broadcast
        # to output_cutoff columns, so each user is presumably expected to
        # have at least output_cutoff candidates -- confirm with callers.
        matrix[row, :] = heapq.nsmallest(
            output_cutoff, candidates, key=popularity_of)

    return dataset_io.RankingSet(
        dataset_io.RankingSetId(fold, 'EPCOracle'), matrix, user_ids)
def test_compute_optimal_ranking_set(self):
    """Check that the MAP oracle ranks each user's test-set items first.

    For each user, the leading positions must be exactly the recommended
    items that appear in that user's test ratings; the remaining positions
    may hold any of the other recommended items.
    """
    source_ranking = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1, 2, 3, 4], [5, 6, 7, 8]]),
        user_ids=[1, 2])
    ratings = dataset_io.RatingSet(fold='u1')
    ratings.base = pd.DataFrame.from_records(
        columns=['user_id', 'item_id', 'rating'],
        data=[(4, 1, 5)])
    ratings.test = pd.DataFrame.from_records(
        columns=['user_id', 'item_id', 'rating'],
        data=[
            (1, 3, 5),
            (1, 2, 5),
            # Item 1 is not among user 2's source ranking.
            (2, 1, 5),
            (2, 8, 5),
        ])
    oracle = map_oracle.MAPOracle(
        {source_ranking.id: source_ranking}, {'u1': ratings})

    result = oracle.compute_optimal_ranking_set(
        'u1', input_cutoff=4, output_cutoff=3)

    self.assertEqual([1, 2], result.user_ids)
    matrix = result.matrix
    # (row, items that must lead the row, items allowed afterwards)
    expectations = [
        (0, {2, 3}, {1, 4}),
        (1, {8}, {5, 6, 7}),
    ]
    for row, leading, fillers in expectations:
        split = len(leading)
        self.assertEqual(3, len(matrix[row, :]))
        self.assertSetEqual(leading, set(matrix[row, 0:split]))
        self.assertTrue(set(matrix[row, split:]).issubset(fillers))
def test_many_items(self):
    """EILD of a three-item ranking matches the hand-computed value.

    The expected value combines one hand-computed term per rank, weighted
    by 0.85 raised to the zero-based rank.
    """
    d12 = 2 / math.sqrt(6)
    d13 = 1 / math.sqrt(3)
    d23 = 1 / math.sqrt(2)
    distances = np.array([
        [1, d12, d13],
        [d12, 1, d23],
        [d13, d23, 1],
    ])
    self.eild.distances_by_fold = {'u1': (distances, [1, 2, 3])}

    ranking_set = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1, 2, 3]]),
        user_ids=[])

    # Hand-computed per-rank terms.
    term_rank0 = (2 / math.sqrt(6) + 0.85 / math.sqrt(3)) / 1.85
    term_rank1 = (2 / math.sqrt(6) + 1 / math.sqrt(2)) / 2
    term_rank2 = (1 / math.sqrt(3) + 1 / math.sqrt(2)) / 2
    weighted_sum = term_rank0 + term_rank1 * 0.85 + term_rank2 * 0.85**2
    expected = weighted_sum / (1 + 0.85 + 0.85**2)

    self.assertAlmostEqual(expected, self.eild.compute(ranking_set))
def test_returns_mean_for_many_rankings(self):
    """EILD over two two-item rankings is expected to be their mean.

    Each ranking's expected value is built from the distance between its
    two items, weighted by 0.85 raised to the zero-based rank.
    """
    d12 = 2 / math.sqrt(6)
    d13 = 1 / math.sqrt(3)
    d23 = 1 / math.sqrt(2)
    distances = np.array([
        [1, d12, d13],
        [d12, 1, d23],
        [d13, d23, 1],
    ])
    self.eild.distances_by_fold = {'u1': (distances, [1, 2, 3])}

    ranking_set = dataset_io.RankingSet(
        id=dataset_io.RankingSetId('u1', 'Alg'),
        matrix=np.array([[1, 2], [2, 3]]),
        user_ids=[])

    # First ranking: items 1 and 2.
    first_term = 2 / math.sqrt(6)
    first_eild = (first_term + first_term * 0.85) / (1 + 0.85)
    # Second ranking: items 2 and 3.
    second_term = 1 / math.sqrt(2)
    second_eild = (second_term + second_term * 0.85) / (1 + 0.85)
    expected = (first_eild + second_eild) / 2

    self.assertAlmostEqual(expected, self.eild.compute(ranking_set))