Beispiel #1
0
    def test_table_add_column(self):
        table = Table("TableA")
        column_1 = Column("ColA")
        table.add_column(column_1)

        self.assertEqual(table.columns, [column_1])
        self.assertEqual(column_1.table, table)

        column_2 = Column("ColB")
        column_3 = Column("ColC")
        table.add_columns([column_2, column_3])

        self.assertEqual(table.columns, [column_1, column_2, column_3])
        self.assertEqual(column_2.table, table)
        self.assertEqual(column_3.table, table)
Beispiel #2
0
    def test_merge(self):
        index_0 = Index([self.column_0])
        index_1 = Index([self.column_1])
        result = index_merge(index_0, index_1)
        expected = Index([self.column_0, self.column_1])
        self.assertEqual(result, expected)

        index_0 = Index([self.column_0, self.column_1])
        index_1 = Index([self.column_1, self.column_2])
        result = index_merge(index_0, index_1)
        expected = Index([self.column_0, self.column_1, self.column_2])
        self.assertEqual(result, expected)

        index_0 = Index([self.column_0, self.column_1])
        index_1 = Index([self.column_1, self.column_0])
        result = index_merge(index_0, index_1)
        expected = Index([self.column_0, self.column_1])
        self.assertEqual(result, expected)

        # Example from Bruno's paper
        column_a = Column("a")
        column_b = Column("b")
        column_c = Column("c")
        column_d = Column("d")
        column_e = Column("e")
        column_f = Column("f")
        column_g = Column("g")

        columns = [
            column_a, column_b, column_c, column_d, column_e, column_f,
            column_g
        ]
        table = Table("TableB")
        table.add_columns(columns)

        index_1 = Index(
            [column_a, column_b, column_c, column_d, column_e, column_f])
        index_2 = Index([column_c, column_d, column_g, column_e])
        result = index_merge(index_1, index_2)
        expected = Index([
            column_a, column_b, column_c, column_d, column_e, column_f,
            column_g
        ])
        self.assertEqual(result, expected)
Beispiel #3
0
class TestIBMAlgorithm(unittest.TestCase):
    def setUp(self):
        self.connector = MockConnector()
        self.algo = IBMAlgorithm(database_connector=self.connector)

        self.column_0 = Column("Col0")
        self.column_1 = Column("Col1")
        self.column_2 = Column("Col2")
        self.column_3 = Column("Col3")
        self.column_4 = Column("Col4")
        self.column_5 = Column("Col5")
        self.column_6 = Column("Col6")
        self.column_7 = Column("Col7")
        self.all_columns = [
            self.column_0,
            self.column_1,
            self.column_2,
            self.column_3,
            self.column_4,
            self.column_5,
            self.column_6,
            self.column_7,
        ]

        self.table = Table("Table0")
        self.table.add_columns(self.all_columns)

        self.query_0 = Query(
            0,
            "SELECT * FROM Table0 WHERE Col0 = 1 AND Col1 = 2 AND Col2 = 3;",
            self.all_columns,
        )
        # query_1 = Query(1, 'SELECT * FROM TableA WHERE ColA = 4;', [self.column_0])

    def test_ibm_algorithm(self):
        # Should use default parameters if none are specified
        budget_in_mb = 500
        self.assertEqual(self.algo.disk_constraint, budget_in_mb * MB_TO_BYTES)
        self.assertEqual(self.algo.cost_evaluation.cost_estimation, "whatif")
        self.assertEqual(self.algo.seconds_limit, 10)
        self.assertEqual(self.algo.maximum_remove, 4)

    def test_possible_indexes(self):
        column_0_table_1 = Column("Col0")
        table_1 = Table("Table1")
        table_1.add_column(column_0_table_1)
        query = Query(
            17,
            """SELECT * FROM Table0 as t0, Table1 as t1 WHERE t0.Col0 = 1"
                AND t0.Col1 = 2 AND t0.Col2 = 3 AND t1.Col0 = 17;""",
            [self.column_0, self.column_1, self.column_2, column_0_table_1],
        )
        indexes = self.algo._possible_indexes(query)
        self.assertIn(Index([column_0_table_1]), indexes)
        self.assertIn(Index([self.column_0]), indexes)
        self.assertIn(Index([self.column_1]), indexes)
        self.assertIn(Index([self.column_2]), indexes)
        self.assertIn(Index([self.column_0, self.column_1]), indexes)
        self.assertIn(Index([self.column_0, self.column_2]), indexes)
        self.assertIn(Index([self.column_1, self.column_0]), indexes)
        self.assertIn(Index([self.column_1, self.column_2]), indexes)
        self.assertIn(Index([self.column_2, self.column_0]), indexes)
        self.assertIn(Index([self.column_2, self.column_1]), indexes)
        self.assertIn(Index([self.column_0, self.column_1, self.column_2]),
                      indexes)
        self.assertIn(Index([self.column_0, self.column_2, self.column_1]),
                      indexes)
        self.assertIn(Index([self.column_1, self.column_0, self.column_2]),
                      indexes)
        self.assertIn(Index([self.column_1, self.column_2, self.column_0]),
                      indexes)
        self.assertIn(Index([self.column_2, self.column_0, self.column_1]),
                      indexes)
        self.assertIn(Index([self.column_2, self.column_1, self.column_0]),
                      indexes)

    def test_recommended_indexes(self):
        def _simulate_index_mock(index, store_size):
            index.hypopg_name = f"<1337>btree_{index.columns}"

        # For some reason, the database decides to only use an index for one of
        # the filters
        def _simulate_get_plan(query):
            plan = {
                "Total Cost":
                17,
                "Plans": [{
                    "Index Name": "<1337>btree_(C table0.col1,)",
                    "Filter": "(Col0 = 1)",
                }],
            }

            return plan

        query = Query(
            17,
            "SELECT * FROM Table0 WHERE Col0 = 1 AND Col1 = 2;",
            [self.column_0, self.column_1],
        )

        self.algo.database_connector.get_plan = MagicMock(
            side_effect=_simulate_get_plan)
        self.algo.what_if.simulate_index = MagicMock(
            side_effect=_simulate_index_mock)
        self.algo.what_if.drop_all_simulated_indexes = MagicMock()

        indexes, cost = self.algo._recommended_indexes(query)
        self.assertEqual(cost, 17)
        self.assertEqual(indexes, {Index([self.column_1])})

        self.assertEqual(self.algo.what_if.simulate_index.call_count, 4)
        self.algo.what_if.drop_all_simulated_indexes.assert_called_once()
        self.algo.database_connector.get_plan.assert_called_once_with(query)

    def test_exploit_virtual_indexes(self):
        def _simulate_index_mock(index, store_size):
            index.hypopg_name = f"<1337>btree_{index.columns}"

        # For some reason, the database decides to only use an index for one of
        # the filters
        def _simulate_get_plan(query):
            if "Table0" in query.text:
                return {
                    "Total Cost": 17,
                    "Plans": [{
                        "Index Name": "<1337>btree_(C table0.col1,)"
                    }],
                }

            return {
                "Total Cost": 5,
                "Plans": [{
                    "Simple Table Retrieve": "table1"
                }]
            }

        query_0 = Query(
            0,
            "SELECT * FROM Table0 WHERE Col0 = 1 AND Col1 = 2;",
            [self.column_0, self.column_1],
        )
        query_1 = Query(1, "SELECT * FROM Table1;", [])
        workload = Workload([query_0, query_1], "database_name")

        self.algo.database_connector.get_plan = MagicMock(
            side_effect=_simulate_get_plan)
        self.algo.what_if.simulate_index = MagicMock(
            side_effect=_simulate_index_mock)
        self.algo.what_if.drop_all_simulated_indexes = MagicMock()
        query_results, index_candidates = self.algo._exploit_virtual_indexes(
            workload)
        self.assertEqual(len(query_results), len(workload.queries))
        expected_first_result = {
            "cost_without_indexes": 17,
            "cost_with_recommended_indexes": 17,
            "recommended_indexes": set([Index([self.column_1])]),
        }
        expected_second_result = {
            "cost_without_indexes": 5,
            "cost_with_recommended_indexes": 5,
            "recommended_indexes": set(),
        }
        self.assertEqual(query_results[query_0], expected_first_result)
        self.assertEqual(query_results[query_1], expected_second_result)
        self.assertEqual(index_candidates, set([Index([self.column_1])]))

    def test_calculate_index_benefits(self):
        index_0 = Index([self.column_0])
        index_0.estimated_size = 5
        index_1 = Index([self.column_1])
        index_1.estimated_size = 1
        index_2 = Index([self.column_2])
        index_2.estimated_size = 3

        query_result_0 = {
            "cost_without_indexes": 100,
            "cost_with_recommended_indexes": 50,
            "recommended_indexes": [index_0, index_1],
        }
        # Yes, negative benefit is possible
        query_result_1 = {
            "cost_without_indexes": 50,
            "cost_with_recommended_indexes": 60,
            "recommended_indexes": [index_1],
        }
        query_result_2 = {
            "cost_without_indexes": 60,
            "cost_with_recommended_indexes": 57,
            "recommended_indexes": [index_2],
        }
        query_result_3 = {
            "cost_without_indexes": 60,
            "cost_with_recommended_indexes": 60,
            "recommended_indexes": [],
        }
        query_results = {
            "q0": query_result_0,
            "q1": query_result_1,
            "q2": query_result_2,
            "q3": query_result_3,
        }

        index_benefits = self.algo._calculate_index_benefits(
            [index_0, index_1, index_2], query_results)
        expected_index_benefits = [
            IndexBenefit(index_1, 40),
            IndexBenefit(index_0, 50),
            IndexBenefit(index_2, 3),
        ]

        self.assertEqual(index_benefits, expected_index_benefits)

    def test_combine_subsumed(self):
        index_0_1 = Index([self.column_0, self.column_1])
        index_0_1.estimated_size = 2
        index_0 = Index([self.column_0])
        index_0.estimated_size = 1
        index_1 = Index([self.column_1])
        index_1.estimated_size = 1

        # Scenario 1. Index subsumed because better ratio for larger index
        index_benefits = [
            IndexBenefit(index_0_1, 21),
            IndexBenefit(index_0, 10)
        ]
        subsumed = self.algo._combine_subsumed(index_benefits)
        expected = [IndexBenefit(index_0_1, 31)]
        self.assertEqual(subsumed, expected)

        # Scenario 2. Index not subsumed because better index has fewer attributes
        index_benefits = [
            IndexBenefit(index_0, 11),
            IndexBenefit(index_0_1, 20)
        ]
        subsumed = self.algo._combine_subsumed(index_benefits)
        expected = [IndexBenefit(index_0, 11), IndexBenefit(index_0_1, 20)]
        self.assertEqual(subsumed, expected)

        # Scenario 3. Index not subsumed because last element does not match
        # attribute even though better ratio
        index_0_1_2 = Index([self.column_0, self.column_1, self.column_2])
        index_0_1_2.estimated_size = 3
        index_0_2 = Index([self.column_0, self.column_2])
        index_0_2.estimated_size = 2

        index_benefits = [
            IndexBenefit(index_0_1_2, 31),
            IndexBenefit(index_0_2, 20)
        ]
        subsumed = self.algo._combine_subsumed(index_benefits)
        expected = [IndexBenefit(index_0_1_2, 31), IndexBenefit(index_0_2, 20)]
        self.assertEqual(subsumed, expected)

        # Scenario 4. Multi Index subsumed
        index_benefits = [
            IndexBenefit(index_0_1_2, 31),
            IndexBenefit(index_0_1, 20)
        ]
        subsumed = self.algo._combine_subsumed(index_benefits)
        expected = [IndexBenefit(index_0_1_2, 51)]
        self.assertEqual(subsumed, expected)

        # Scenario 5. Multiple Indexes subsumed
        index_benefits = [
            IndexBenefit(index_0_1_2, 31),
            IndexBenefit(index_0_1, 20),
            IndexBenefit(index_0, 10),
        ]
        subsumed = self.algo._combine_subsumed(index_benefits)
        expected = [IndexBenefit(index_0_1_2, 61)]
        self.assertEqual(subsumed, expected)

        # Scenario 6. Input returned if len(input) < 2
        subsumed = self.algo._combine_subsumed([IndexBenefit(index_0_1, 21)])
        expected = [IndexBenefit(index_0_1, 21)]
        self.assertEqual(subsumed, expected)

        # Scenario 7. Input not sorted by ratio throws
        with self.assertRaises(AssertionError):
            subsumed = self.algo._combine_subsumed(
                [IndexBenefit(index_0, 10),
                 IndexBenefit(index_0_1, 21)])

    def test_evaluate_workload(self):
        index_0 = Index([self.column_0])
        index_1 = Index([self.column_1])
        self.algo.cost_evaluation.calculate_cost = MagicMock()

        self.algo._evaluate_workload(
            [IndexBenefit(index_0, 10),
             IndexBenefit(index_1, 9)], workload=[])
        self.algo.cost_evaluation.calculate_cost.assert_called_once_with(
            [], [index_0, index_1])

    def test_try_variations_time_limit(self):
        index_0 = Index([self.column_0])
        index_0.estimated_size = 1
        index_1 = Index([self.column_1])
        index_1.estimated_size = 1
        index_2 = Index([self.column_2])
        index_2.estimated_size = 1
        index_3 = Index([self.column_3])
        index_3.estimated_size = 1
        index_4 = Index([self.column_4])
        index_4.estimated_size = 1
        index_5 = Index([self.column_5])
        index_5.estimated_size = 1
        index_6 = Index([self.column_6])
        index_6.estimated_size = 1
        index_7 = Index([self.column_7])
        index_7.estimated_size = 5
        self.algo.cost_evaluation.calculate_cost = MagicMock(return_value=17)
        self.algo.seconds_limit = 0.2

        time_before = time.time()
        self.algo._try_variations(
            selected_index_benefits=frozenset([IndexBenefit(index_0, 1)]),
            index_benefits=frozenset([IndexBenefit(index_1, 1)]),
            workload=[],
        )
        self.assertGreaterEqual(time.time(),
                                time_before + self.algo.seconds_limit)

        def fake(selected, workload):
            cost = 10
            if IndexBenefit(index_3, 1.5) in selected:
                cost -= 0.5
            if IndexBenefit(index_4, 0.5) in selected:
                cost += 0.5
            if IndexBenefit(index_1, 0.5) in selected:
                cost += 0.5
            if IndexBenefit(index_1, 0.5) in selected:
                cost += 0.5
            return cost

        # In this scenario a good index has not been selected (index_3).
        # We test three things:
        # (i)   That index_3 gets chosen by variation.
        # (ii)  That the weakest index from the original selection gets
        #         removed (index_1).
        # (iii) That index_4 does not get chosen even though it is better than index_1.
        self.algo._evaluate_workload = fake
        self.algo.maximum_remove = 1
        self.algo.disk_constraint = 3
        new = self.algo._try_variations(
            selected_index_benefits=frozenset([
                IndexBenefit(index_0, 1),
                IndexBenefit(index_1, 0.5),
                IndexBenefit(index_2, 1),
            ]),
            index_benefits=frozenset([
                IndexBenefit(index_0, 1),
                IndexBenefit(index_1, 0.5),
                IndexBenefit(index_2, 1),
                IndexBenefit(index_3, 1.5),
                IndexBenefit(index_4, 0.6),
            ]),
            workload=[],
        )
        self.assertIn(IndexBenefit(index_3, 1.5), new)
        self.assertNotIn(IndexBenefit(index_4, 0.5), new)
        self.assertNotIn(IndexBenefit(index_1, 0.5), new)

        # Test that good index is not chosen because of storage restrictions
        new = self.algo._try_variations(
            selected_index_benefits=frozenset([IndexBenefit(index_0, 1)]),
            index_benefits=frozenset(
                [IndexBenefit(index_0, 1),
                 IndexBenefit(index_7, 5)]),
            workload=[],
        )
        self.assertEqual(new, set([IndexBenefit(index_0, 1)]))
Beispiel #4
0
    def test_split(self):
        # If there are no common columns, index splits are undefined
        index_0 = Index([self.column_0])
        index_1 = Index([self.column_1])
        result = index_split(index_0, index_1)
        expected = None
        self.assertEqual(result, expected)

        index_0 = Index([self.column_0, self.column_1])
        index_1 = Index([self.column_1])
        result = index_split(index_0, index_1)
        common_column_index = Index([self.column_1])
        residual_column_index_0 = Index([self.column_0])
        expected = {common_column_index, residual_column_index_0}
        self.assertEqual(result, expected)

        index_0 = Index([self.column_1])
        index_1 = Index([self.column_1, self.column_2])
        result = index_split(index_0, index_1)
        common_column_index = Index([self.column_1])
        residual_column_index_1 = Index([self.column_2])
        expected = {common_column_index, residual_column_index_1}
        self.assertEqual(result, expected)

        index_0 = Index([self.column_0, self.column_1])
        index_1 = Index([self.column_1, self.column_2])
        result = index_split(index_0, index_1)
        common_column_index = Index([self.column_1])
        residual_column_index_0 = Index([self.column_0])
        residual_column_index_1 = Index([self.column_2])
        expected = {
            common_column_index,
            residual_column_index_0,
            residual_column_index_1,
        }
        self.assertEqual(result, expected)

        # Example from Bruno's paper
        column_a = Column("a")
        column_b = Column("b")
        column_c = Column("c")
        column_d = Column("d")
        column_e = Column("e")
        column_f = Column("f")
        column_g = Column("g")

        columns = [
            column_a, column_b, column_c, column_d, column_e, column_f,
            column_g
        ]
        table = Table("TableB")
        table.add_columns(columns)

        index_1 = Index(
            [column_a, column_b, column_c, column_d, column_e, column_f])
        index_2 = Index([column_c, column_a, column_e])
        index_3 = Index([column_a, column_b, column_d, column_g])

        result = index_split(index_1, index_2)
        # expected is different from the paper, because there was an error for I_R2
        expected = {
            Index([column_a, column_c, column_e]),
            Index([column_b, column_d, column_f]),
        }
        self.assertEqual(result, expected)

        result = index_split(index_1, index_3)
        # expected is different from the paper,
        # because all columns are part of the key (there is no suffix)
        expected = {
            Index([column_a, column_b, column_d]),
            Index([column_c, column_e, column_f]),
            Index([column_g]),
        }
        self.assertEqual(result, expected)
class TestDB2AdvisAlgorithm(unittest.TestCase):
    def setUp(self):
        self.connector = MockConnector()
        self.algo = DB2AdvisAlgorithm(database_connector=self.connector)

        self.column_0 = Column("Col0")
        self.column_1 = Column("Col1")
        self.column_2 = Column("Col2")
        self.column_3 = Column("Col3")
        self.column_4 = Column("Col4")
        self.column_5 = Column("Col5")
        self.column_6 = Column("Col6")
        self.column_7 = Column("Col7")
        self.all_columns = [
            self.column_0,
            self.column_1,
            self.column_2,
            self.column_3,
            self.column_4,
            self.column_5,
            self.column_6,
            self.column_7,
        ]

        self.table = Table("Table0")
        self.table.add_columns(self.all_columns)

        self.query_0 = Query(
            0,
            "SELECT * FROM Table0 WHERE Col0 = 1 AND Col1 = 2 AND Col2 = 3;",
            self.all_columns,
        )
        # query_1 = Query(1, 'SELECT * FROM TableA WHERE ColA = 4;', [self.column_0])

    def test_db2advis_algorithm(self):
        # Should use default parameters if none are specified
        budget_in_mb = 500
        self.assertEqual(self.algo.disk_constraint, budget_in_mb * MB_TO_BYTES)
        self.assertEqual(self.algo.cost_evaluation.cost_estimation, "whatif")
        self.assertEqual(self.algo.try_variations_seconds, 10)
        self.assertEqual(self.algo.try_variations_max_removals, 4)

    def test_index_benefit__lt__(self):
        index_0 = Index([self.column_0])
        index_0.estimated_size = 1
        index_1 = Index([self.column_1])
        index_1.estimated_size = 2

        # Due to its size, index_0 has the better ratio
        index_benefit_0 = IndexBenefit(index_0, 10)
        index_benefit_1 = IndexBenefit(index_1, 10)
        self.assertTrue(index_benefit_1 < index_benefit_0)

        # The ratios are equal, the columns are taken into consideration
        index_benefit_1 = IndexBenefit(index_1, 20)
        self.assertTrue(index_benefit_0 < index_benefit_1)

    def test_calculate_index_benefits(self):
        index_0 = Index([self.column_0])
        index_0.estimated_size = 5
        index_1 = Index([self.column_1])
        index_1.estimated_size = 1
        index_2 = Index([self.column_2])
        index_2.estimated_size = 3

        query_result_0 = {
            "cost_without_indexes": 100,
            "cost_with_indexes": 50,
            "utilized_indexes": [index_0, index_1],
        }
        # Yes, negative benefit is possible
        query_result_1 = {
            "cost_without_indexes": 50,
            "cost_with_indexes": 60,
            "utilized_indexes": [index_1],
        }
        query_result_2 = {
            "cost_without_indexes": 60,
            "cost_with_indexes": 57,
            "utilized_indexes": [index_2],
        }
        query_result_3 = {
            "cost_without_indexes": 60,
            "cost_with_indexes": 60,
            "utilized_indexes": [],
        }
        query_results = {
            "q0": query_result_0,
            "q1": query_result_1,
            "q2": query_result_2,
            "q3": query_result_3,
        }

        index_benefits = self.algo._calculate_index_benefits(
            [index_0, index_1, index_2], query_results)
        expected_index_benefits = [
            IndexBenefit(index_1, 40),
            IndexBenefit(index_0, 50),
            IndexBenefit(index_2, 3),
        ]

        self.assertEqual(index_benefits, expected_index_benefits)

    def test_combine_subsumed(self):
        index_0_1 = Index([self.column_0, self.column_1])
        index_0_1.estimated_size = 2
        index_0 = Index([self.column_0])
        index_0.estimated_size = 1
        index_1 = Index([self.column_1])
        index_1.estimated_size = 1

        # Scenario 1. Index subsumed because better ratio for larger index
        index_benefits = [
            IndexBenefit(index_0_1, 21),
            IndexBenefit(index_0, 10)
        ]
        subsumed = self.algo._combine_subsumed(index_benefits)
        expected = [IndexBenefit(index_0_1, 31)]
        self.assertEqual(subsumed, expected)

        # Scenario 2. Index not subsumed because better index has fewer attributes
        index_benefits = [
            IndexBenefit(index_0, 11),
            IndexBenefit(index_0_1, 20)
        ]
        subsumed = self.algo._combine_subsumed(index_benefits)
        expected = [IndexBenefit(index_0, 11), IndexBenefit(index_0_1, 20)]
        self.assertEqual(subsumed, expected)

        # Scenario 3. Index not subsumed because last element does not match
        # attribute even though better ratio
        index_0_1_2 = Index([self.column_0, self.column_1, self.column_2])
        index_0_1_2.estimated_size = 3
        index_0_2 = Index([self.column_0, self.column_2])
        index_0_2.estimated_size = 2

        index_benefits = [
            IndexBenefit(index_0_1_2, 31),
            IndexBenefit(index_0_2, 20)
        ]
        subsumed = self.algo._combine_subsumed(index_benefits)
        expected = [IndexBenefit(index_0_1_2, 31), IndexBenefit(index_0_2, 20)]
        self.assertEqual(subsumed, expected)

        # Scenario 4. Multi Index subsumed
        index_benefits = [
            IndexBenefit(index_0_1_2, 31),
            IndexBenefit(index_0_1, 20)
        ]
        subsumed = self.algo._combine_subsumed(index_benefits)
        expected = [IndexBenefit(index_0_1_2, 51)]
        self.assertEqual(subsumed, expected)

        # Scenario 5. Multiple Indexes subsumed
        index_benefits = [
            IndexBenefit(index_0_1_2, 31),
            IndexBenefit(index_0_1, 20),
            IndexBenefit(index_0, 10),
        ]
        subsumed = self.algo._combine_subsumed(index_benefits)
        expected = [IndexBenefit(index_0_1_2, 61)]
        self.assertEqual(subsumed, expected)

        # Scenario 6. Input returned if len(input) < 2
        subsumed = self.algo._combine_subsumed([IndexBenefit(index_0_1, 21)])
        expected = [IndexBenefit(index_0_1, 21)]
        self.assertEqual(subsumed, expected)

        # Scenario 7. Input not sorted by ratio throws
        with self.assertRaises(AssertionError):
            subsumed = self.algo._combine_subsumed(
                [IndexBenefit(index_0, 10),
                 IndexBenefit(index_0_1, 21)])

    def test_evaluate_workload(self):
        index_0 = Index([self.column_0])
        index_1 = Index([self.column_1])
        self.algo.cost_evaluation.calculate_cost = MagicMock()

        self.algo._evaluate_workload(
            [IndexBenefit(index_0, 10),
             IndexBenefit(index_1, 9)], workload=[])
        self.algo.cost_evaluation.calculate_cost.assert_called_once_with(
            [], [index_0, index_1])

    def test_try_variations_time_limit(self):
        index_0 = Index([self.column_0])
        index_0.estimated_size = 1
        index_1 = Index([self.column_1])
        index_1.estimated_size = 1
        index_2 = Index([self.column_2])
        index_2.estimated_size = 1
        index_3 = Index([self.column_3])
        index_3.estimated_size = 1
        index_4 = Index([self.column_4])
        index_4.estimated_size = 1
        index_5 = Index([self.column_5])
        index_5.estimated_size = 1
        index_6 = Index([self.column_6])
        index_6.estimated_size = 1
        index_7 = Index([self.column_7])
        index_7.estimated_size = 5
        self.algo.cost_evaluation.calculate_cost = MagicMock(return_value=17)
        self.algo.try_variations_seconds = 0.2

        time_before = time.time()
        self.algo._try_variations(
            selected_index_benefits=frozenset([IndexBenefit(index_0, 1)]),
            index_benefits=frozenset([IndexBenefit(index_1, 1)]),
            workload=[],
        )
        self.assertGreaterEqual(time.time(),
                                time_before + self.algo.try_variations_seconds)

        def fake(selected, workload):
            cost = 10
            if IndexBenefit(index_3, 1.5) in selected:
                cost -= 0.5
            if IndexBenefit(index_4, 0.5) in selected:
                cost += 0.5
            if IndexBenefit(index_1, 0.5) in selected:
                cost += 0.5
            if IndexBenefit(index_1, 0.5) in selected:
                cost += 0.5
            return cost

        # In this scenario a good index has not been selected (index_3).
        # We test three things:
        # (i)   That index_3 gets chosen by variation.
        # (ii)  That the weakest index from the original selection gets
        #         removed (index_1).
        # (iii) That index_4 does not get chosen even though it is better than index_1.
        self.algo._evaluate_workload = fake
        self.algo.try_variations_max_removals = 1
        self.algo.disk_constraint = 3
        new = self.algo._try_variations(
            selected_index_benefits=frozenset([
                IndexBenefit(index_0, 1),
                IndexBenefit(index_1, 0.5),
                IndexBenefit(index_2, 1),
            ]),
            index_benefits=frozenset([
                IndexBenefit(index_0, 1),
                IndexBenefit(index_1, 0.5),
                IndexBenefit(index_2, 1),
                IndexBenefit(index_3, 1.5),
                IndexBenefit(index_4, 0.6),
            ]),
            workload=[],
        )
        self.assertIn(IndexBenefit(index_3, 1.5), new)
        self.assertNotIn(IndexBenefit(index_4, 0.5), new)
        self.assertNotIn(IndexBenefit(index_1, 0.5), new)

        # Test that good index is not chosen because of storage restrictions
        new = self.algo._try_variations(
            selected_index_benefits=frozenset([IndexBenefit(index_0, 1)]),
            index_benefits=frozenset(
                [IndexBenefit(index_0, 1),
                 IndexBenefit(index_7, 5)]),
            workload=[],
        )
        self.assertEqual(new, set([IndexBenefit(index_0, 1)]))
class TestEpicAlgorithm(unittest.TestCase):
    def setUp(self):
        self.connector = MockConnector()
        self.algo = EPICAlgorithm(database_connector=self.connector)

        self.column_1 = Column("ColA")
        self.column_2 = Column("ColB")
        self.column_3 = Column("ColC")
        self.all_columns = [self.column_1, self.column_2, self.column_3]

        self.table = Table("TableA")
        self.table.add_columns(self.all_columns)

        self.index_1 = Index([self.column_1])
        self.index_1.estimated_size = 5
        self.index_2 = Index([self.column_2])
        self.index_2.estimated_size = 1
        self.index_3 = Index([self.column_3])
        self.index_3.estimated_size = 3

        query_1 = Query(0, "SELECT * FROM TableA WHERE ColA = 4;", [self.column_1])
        query_2 = Query(
            1,
            "SELECT * FROM TableA WHERE ColA = 1 AND ColB = 2 AND ColC = 3;",
            self.all_columns,
        )
        self.database_name = "test_DB"

        self.workload = Workload([query_1, query_2], self.database_name)
        self.algo.workload = self.workload

    def test_attach_to_indexes(self):
        index_combination = [self.index_1, self.index_2]
        candidate = self.index_3
        self.algo.initial_cost = 10
        best = {"combination": [], "benefit_to_size_ratio": 0}
        self.algo._evaluate_combination = MagicMock()
        self.algo._attach_to_indexes(
            index_combination, candidate, best, self.algo.initial_cost
        )

        first_new_combination = [
            index_combination[1],
            Index(index_combination[0].columns + candidate.columns),
        ]
        self.algo._evaluate_combination.assert_any_call(
            first_new_combination, best, self.algo.initial_cost, 5
        )

        second_new_combination = [
            index_combination[0],
            Index(index_combination[1].columns + candidate.columns),
        ]
        self.algo._evaluate_combination.assert_any_call(
            second_new_combination, best, self.algo.initial_cost, 1
        )

        multi_column_candidate = Index([self.column_2, self.column_3])
        with self.assertRaises(AssertionError):
            self.algo._attach_to_indexes(
                index_combination, multi_column_candidate, best, self.algo.initial_cost
            )

    def test_remove_impossible_canidates(self):
        # All Fit
        candidates = [self.index_1, self.index_2, self.index_3]
        self.algo.budget = 10
        new_candidates = self.algo._get_candidates_within_budget(
            index_combination_size=0, candidates=candidates
        )
        expected_candidates = candidates
        self.assertEqual(new_candidates, expected_candidates)

        # None fit because of algorithm budget
        self.algo.budget = 0
        new_candidates = self.algo._get_candidates_within_budget(
            index_combination_size=0, candidates=candidates
        )
        expected_candidates = []
        self.assertEqual(new_candidates, expected_candidates)

        # None fit because of index_combination_size
        self.algo.budget = 10
        new_candidates = self.algo._get_candidates_within_budget(
            index_combination_size=10, candidates=candidates
        )
        expected_candidates = []
        self.assertEqual(new_candidates, expected_candidates)

        # Some do not fit
        self.algo.budget = 4
        new_candidates = self.algo._get_candidates_within_budget(
            index_combination_size=1, candidates=candidates
        )
        expected_candidates = [self.index_2, self.index_3]
        self.assertEqual(new_candidates, expected_candidates)

        # Index with size none is not removed even though there is no space left
        self.index_1.estimated_size = None
        self.algo.budget = 0
        new_candidates = self.algo._get_candidates_within_budget(
            index_combination_size=0, candidates=candidates
        )
        expected_candidates = [self.index_1]
        self.assertEqual(new_candidates, expected_candidates)

    def test_evaluate_combination_worse_ratio(self):
        # Mock the internal algorithm state
        best_old = {"combination": [self.index_2], "benefit_to_size_ratio": 10}
        best_input = best_old.copy()
        new_index_combination = [self.index_1]

        # Mock internally called cost function
        self.algo.cost_evaluation.calculate_cost = MagicMock(return_value=4)
        self.algo.initial_cost = 14

        cost = self.algo.cost_evaluation.calculate_cost(
            self.workload, new_index_combination, store_size=True
        )
        benefit = self.algo.initial_cost - cost
        size = sum(x.estimated_size for x in new_index_combination)
        ratio = benefit / size
        assert best_old["benefit_to_size_ratio"] >= ratio

        # Above's specification leads to a benefit of 10. The index cost is 5.
        # The ratio is 2 which is worse than above's 10. Hence, best_input
        # should not change
        self.algo._evaluate_combination(
            new_index_combination, best_input, self.algo.initial_cost
        )

        expected_best = best_old
        self.assertEqual(expected_best, best_input)

    def test_evaluate_combination_better_ratio_too_large(self):
        # Mock the internal algorithm state
        self.algo.budget = 2
        best_old = {
            "combination": [self.index_2],
            "benefit_to_size_ratio": 1,
            "cost": 4,
        }
        best_input = best_old.copy()
        new_index_combination = [self.index_1]

        # Mock internally called cost function
        self.algo.cost_evaluation.calculate_cost = MagicMock(return_value=4)
        self.algo.initial_cost = 14

        cost = self.algo.cost_evaluation.calculate_cost(
            self.workload, new_index_combination, store_size=True
        )
        benefit = self.algo.initial_cost - cost
        size = sum(x.estimated_size for x in new_index_combination)
        ratio = benefit / size
        assert best_old["benefit_to_size_ratio"] < ratio

        # Above's specification leads to a benefit of 10. The index cost is 5.
        # The ratio is 2 which is better than above's 1. But the remaining budget
        # is not sufficient.
        self.algo._evaluate_combination(
            new_index_combination, best_input, self.algo.initial_cost
        )

        expected_best = best_old
        self.assertEqual(expected_best, best_input)

    def test_evaluate_combination_better_ratio(self):
        # Mock the internal algorithm state
        best_old = {
            "combination": [self.index_2],
            "benefit_to_size_ratio": 1,
        }
        best_input = best_old.copy()
        new_index_combination = [self.index_1]

        # Mock internally called cost function
        self.algo.cost_evaluation.calculate_cost = MagicMock(return_value=4)
        self.algo.initial_cost = 14

        # Above's specification leads to a benefit of 10. The index cost is 5.
        # The ratio is 2 which is better than above's 1. Hence, best_input should change
        self.algo._evaluate_combination(
            new_index_combination, best_input, self.algo.initial_cost
        )

        expected_best = {
            "combination": new_index_combination,
            "benefit_to_size_ratio": 2,
            "cost": 4,
        }
        self.assertEqual(expected_best, best_input)

    def test_epic_algoritm(self):
        # Should use default parameters if none are specified
        budget_in_mb = 10
        self.assertEqual(self.algo.budget, budget_in_mb * MB_TO_BYTES)
        self.assertEqual(self.algo.cost_evaluation.cost_estimation, "whatif")

    def _assign_size_1(self, index):
        index_sizes = {
            "tablea_cola_idx": 1,
            "tablea_colb_idx": 1,
            "tablea_colc_idx": 1,
            "tablea_colb_cola_idx": 2,
            "tablea_colb_colc_idx": 2,
            "tablea_colc_cola_idx": 2,
            "tablea_colc_colb_idx": 2,
        }

        # Assume large size if index not in mocked size table
        if index.index_idx() not in index_sizes:
            return sys.maxsize

        index.estimated_size = index_sizes[index.index_idx()]

    def _calculate_cost_mock_1(self, workload, indexes, store_size):
        for index in indexes:
            self._assign_size_1(index)

        index_combination_str = index_combination_to_str(indexes)

        index_combination_cost = {
            "": 100,
            "tablea_cola_idx": 90,
            "tablea_colb_idx": 80,
            "tablea_colc_idx": 70,
            "tablea_cola_idx||tablea_colc_idx": 60,
            "tablea_colb_idx||tablea_colc_idx": 50,
            "tablea_colc_idx||tablea_cola_idx": 60,
            "tablea_colc_idx||tablea_colb_idx": 50,
            "tablea_cola_idx||tablea_colb_idx||tablea_colc_idx": 40,
            # Below here multi, they do not result in benefit
            "tablea_colb_idx||tablea_colc_cola_idx": 1000,
            "tablea_colb_idx||tablea_colc_colb_idx": 1000,
            "tablea_colb_cola_idx||tablea_colc_idx": 1000,
            "tablea_colb_colc_idx||tablea_colc_idx": 1000,
            "tablea_colc_cola_idx": 1000,
            "tablea_colc_colb_idx": 1000,
            "tablea_colc_idx||tablea_colb_cola_idx": 1000,
            "tablea_colc_idx||tablea_colb_colc_idx": 1000,
            "tablea_colc_cola_idx||tablea_colb_idx": 1000,
            "tablea_colc_colb_idx||tablea_colb_idx": 1000,
        }

        # Assume high cost if index not in mocked cost table
        if index_combination_str not in index_combination_cost:
            return sys.maxsize

        return index_combination_cost[index_combination_str]

    # In this scenario, only single column indexes make sense.
    # They all have the same size but different benefits.
    def test_calculate_best_indexes_scenario_1(self):
        self.algo.cost_evaluation.calculate_cost = MagicMock(
            side_effect=self._calculate_cost_mock_1
        )

        # Each one alone of the single column indexes would fit,
        # but the one with the best benefit/cost ratio is chosen
        self.algo.budget = 1
        indexes = self.algo._calculate_best_indexes(self.workload)
        expected_indexes = [Index([self.column_3])]
        self.assertEqual(indexes, expected_indexes)

        # Two single column indexes would fit, but the two best ones are chosen
        self.algo.budget = 2
        indexes = self.algo._calculate_best_indexes(self.workload)
        expected_indexes = [Index([self.column_3]), Index([self.column_2])]
        self.assertEqual(indexes, expected_indexes)

        # All single column indexes are chosen
        self.algo.budget = 3
        indexes = self.algo._calculate_best_indexes(self.workload)
        expected_indexes = [
            Index([self.column_3]),
            Index([self.column_2]),
            Index([self.column_1]),
        ]
        self.assertEqual(indexes, expected_indexes)

    def _assign_size_2(self, index):
        index_sizes = {
            "tablea_cola_idx": 1,
            "tablea_colb_idx": 3,
            "tablea_colc_idx": 5,
            "tablea_cola_colb_idx": 20,
            "tablea_cola_colc_idx": 20,
            "tablea_colb_cola_idx": 20,
            "tablea_colb_colc_idx": 20,
            "tablea_colc_cola_idx": 20,
            "tablea_colc_colb_idx": 20,
        }

        index.estimated_size = index_sizes[index.index_idx()]

    def _calculate_cost_mock_2(self, workload, indexes, store_size):
        for index in indexes:
            self._assign_size_2(index)

        index_combination_str = index_combination_to_str(indexes)

        index_combination_cost = {
            "": 100,
            "tablea_cola_idx": 90,
            "tablea_colb_idx": 80,
            "tablea_colc_idx": 70,
            "tablea_cola_idx||tablea_colb_idx": 70,
            "tablea_cola_idx||tablea_colc_idx": 60,
            "tablea_colb_idx||tablea_colc_idx": 50,
            "tablea_cola_idx||tablea_colb_idx||tablea_colc_idx": 40,
            # Below here multi, they do not result in benefit
            "tablea_cola_colb_idx": 1000,
            "tablea_cola_colc_idx": 1000,
            "tablea_colb_colc_idx": 1000,
            "tablea_cola_idx||tablea_colb_cola_idx": 1000,
            "tablea_cola_idx||tablea_colb_colc_idx": 1000,
            "tablea_cola_idx||tablea_colc_cola_idx": 1000,
            "tablea_cola_idx||tablea_colc_colb_idx": 1000,
            "tablea_cola_colb_idx||tablea_colb_idx": 1000,
            "tablea_cola_colb_idx||tablea_colc_idx": 1000,
            "tablea_cola_colc_idx||tablea_colb_idx": 1000,
        }

        # Assume high cost if index not in mocked cost table
        if index_combination_str not in index_combination_cost:
            return sys.maxsize

        return index_combination_cost[index_combination_str]

    # In this scenario, only single column indexes make sense.
    # Size and benefit are antiproportional.
    def test_calculate_best_indexes_scenario_2(self):
        self.algo.cost_evaluation.calculate_cost = MagicMock(
            side_effect=self._calculate_cost_mock_2
        )

        # There is only one index fitting the budget
        self.algo.budget = 1
        indexes = self.algo._calculate_best_indexes(self.workload)
        expected_indexes = [Index([self.column_1])]
        self.assertEqual(indexes, expected_indexes)

        # Theoretically, two indexes fit, but one has a better benefit/cost ratio
        self.algo.budget = 3
        indexes = self.algo._calculate_best_indexes(self.workload)
        expected_indexes = [Index([self.column_1])]
        self.assertEqual(indexes, expected_indexes)

        # The two indexes with the best ratio should be chosen
        self.algo.budget = 5
        indexes = self.algo._calculate_best_indexes(self.workload)
        expected_indexes = [Index([self.column_1]), Index([self.column_2])]
        self.assertEqual(indexes, expected_indexes)

        # All single column indexes are chosen
        self.algo.budget = 9
        indexes = self.algo._calculate_best_indexes(self.workload)
        expected_indexes = [
            Index([self.column_1]),
            Index([self.column_2]),
            Index([self.column_3]),
        ]
        self.assertEqual(indexes, expected_indexes)

    def _assign_size_3(self, index):
        index_sizes = {
            "tablea_cola_idx": 2,
            "tablea_colb_idx": 1.9,
            "tablea_cola_colb_idx": 4,
            "tablea_colb_cola_idx": 3,
        }

        index.estimated_size = index_sizes[index.index_idx()]

    def _calculate_cost_mock_3(self, workload, indexes, store_size):
        for index in indexes:
            self._assign_size_3(index)

        index_combination_str = index_combination_to_str(indexes)

        index_combination_cost = {
            "": 100,
            "tablea_cola_idx": 80,
            "tablea_colb_idx": 80,
            "tablea_cola_idx||tablea_colb_idx": 70,
            # Below here multi, they do not result in benefit
            "tablea_cola_colb_idx": 60,
            "tablea_colb_cola_idx": 60,
            "tablea_colb_cola_idx||tablea_colb_idx": 60,
        }

        return index_combination_cost[index_combination_str]

    # In this scenario, multi column indexes dominate single column indexes.
    def test_calculate_best_indexes_scenario_3(self):
        query_1 = Query(
            0,
            "SELECT * FROM TableA WHERE ColA = 1 AND ColB = 2;",
            [self.column_1, self.column_2],
        )
        workload = Workload([query_1], self.database_name)
        self.algo.cost_evaluation.calculate_cost = MagicMock(
            side_effect=self._calculate_cost_mock_3
        )

        # Budget too small for multi
        self.algo.budget = 2
        indexes = self.algo._calculate_best_indexes(workload)
        expected_indexes = [Index([self.column_2])]
        self.assertEqual(indexes, expected_indexes)

        # Picks multi with best ratio
        self.algo.budget = 4
        indexes = self.algo._calculate_best_indexes(workload)
        expected_indexes = [Index([self.column_2, self.column_1])]
        self.assertEqual(indexes, expected_indexes)
Beispiel #7
0
class TestDropHeuristicAlgorithm(unittest.TestCase):
    def setUp(self):
        self.connector = MockConnector()
        self.algo = DropHeuristicAlgorithm(database_connector=self.connector)

        self.column_0 = Column("Col0")
        self.column_1 = Column("Col1")
        self.column_2 = Column("Col2")
        self.all_columns = [self.column_0, self.column_1, self.column_2]

        self.table = Table("TableA")
        self.table.add_columns(self.all_columns)

        self.index_0 = Index([self.column_0])
        self.index_1 = Index([self.column_1])
        self.index_2 = Index([self.column_2])

        query_0 = Query(0, "SELECT * FROM TableA WHERE Col0 = 4;",
                        [self.column_0])
        query_1 = Query(
            1,
            "SELECT * FROM TableA WHERE Col0 = 1 AND Col1 = 2 AND Col2 = 3;",
            self.all_columns,
        )
        self.database_name = "test_DB"

        self.workload = Workload([query_0, query_1])
        self.algo.workload = self.workload
        self.algo.cost_evaluation.calculate_cost = MagicMock(
            side_effect=self._calculate_cost_mock)

    def test_drop_heuristic_algoritm(self):
        # Should use default parameters if none are specified
        self.assertEqual(self.algo.parameters["max_indexes"], 15)
        self.assertEqual(self.algo.cost_evaluation.cost_estimation, "whatif")

    def _calculate_cost_mock(self, workload, remaining_indexes):
        assert isinstance(
            remaining_indexes, set
        ), "The cost mock for the drop heuristic should only be called with index sets"

        index_combination_str = utils.index_combination_to_str(
            remaining_indexes)

        # In the first round, the lowest cost is achieved, if col1 is dropped.
        # In the second round, if col0 is dropped.
        index_combination_cost = {
            "tablea_col0_idx||tablea_col1_idx": 80,
            "tablea_col0_idx||tablea_col2_idx": 60,
            "tablea_col1_idx||tablea_col2_idx": 70,
            "tablea_col0_idx": 110,
            "tablea_col2_idx": 100,
        }

        return index_combination_cost[index_combination_str]

    def test_calculate_best_indexes_all_fit(self):
        self.algo.parameters["max_indexes"] = 3
        indexes = self.algo._calculate_best_indexes(self.workload)
        expected_indexes = frozenset(
            [self.index_0, self.index_1, self.index_2])
        self.assertEqual(indexes, expected_indexes)

    def test_calculate_best_indexes_two_fit(self):
        self.algo.parameters["max_indexes"] = 2
        indexes = self.algo._calculate_best_indexes(self.workload)
        expected_indexes = frozenset([self.index_0, self.index_2])
        self.assertEqual(indexes, expected_indexes)
        self.algo.cost_evaluation.calculate_cost.assert_any_call(
            self.workload, set([self.index_0, self.index_1]))
        self.algo.cost_evaluation.calculate_cost.assert_any_call(
            self.workload, set([self.index_0, self.index_2]))
        self.algo.cost_evaluation.calculate_cost.assert_any_call(
            self.workload, set([self.index_1, self.index_2]))
        self.assertEqual(self.algo.cost_evaluation.calculate_cost.call_count,
                         3)

    def test_calculate_best_indexes_one_fits(self):
        self.algo.parameters["max_indexes"] = 1
        indexes = self.algo._calculate_best_indexes(self.workload)
        expected_indexes = frozenset([self.index_2])
        self.assertEqual(indexes, expected_indexes)
        self.algo.cost_evaluation.calculate_cost.assert_any_call(
            self.workload, set([self.index_0, self.index_1]))
        self.algo.cost_evaluation.calculate_cost.assert_any_call(
            self.workload, set([self.index_0, self.index_2]))
        self.algo.cost_evaluation.calculate_cost.assert_any_call(
            self.workload, set([self.index_1, self.index_2]))
        self.algo.cost_evaluation.calculate_cost.assert_any_call(
            self.workload, set([self.index_0]))
        self.algo.cost_evaluation.calculate_cost.assert_any_call(
            self.workload, set([self.index_2]))
        self.assertEqual(self.algo.cost_evaluation.calculate_cost.call_count,
                         5)

    def test_calculate_best_indexes_none_fits(self):
        self.algo.parameters["max_indexes"] = 0
        with self.assertRaises(AssertionError):
            self.algo._calculate_best_indexes(self.workload)
from selection.cost_evaluation import CostEvaluation
from selection.index import Index
from selection.workload import Column, Query, Table
import itertools

table_A = Table("TableA")
column_A_0 = Column("Col0")
column_A_1 = Column("Col1")
column_A_2 = Column("Col2")
table_A.add_columns([column_A_0, column_A_1, column_A_2])

query_0 = Query(0, "SELECT * FROM TableA WHERE Col0 = 4;", [column_A_0])
query_1 = Query(
    1,
    "SELECT * FROM TableA WHERE Col0 = 1 AND Col1 = 2 AND Col2 = 3;",
    [column_A_0, column_A_1, column_A_2],
)

mock_cache = {}

table_A_potential_indexes = []
# Calculate potential indexes for TableA
for number_of_columns in range(1, len(table_A.columns) + 1):
    for column_list in itertools.permutations(table_A.columns, number_of_columns):
        table_A_potential_indexes.append(Index(column_list))

# Calculate relevant indexes for query_0 based on potential indexes
relevant_indexes_query1_table_A = CostEvaluation._relevant_indexes(
    query_0, table_A_potential_indexes
)