Ejemplo n.º 1
0
    def testQueriesCombination(self):
        """Test if the total number of queries are reduced"""
        original_number_of_queries = 0
        for sess in self.workload:
            for op in sess["operations"]:
                original_number_of_queries += 1

        print "orignal number of queries: " + str(original_number_of_queries)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d.addCollection(col_info['name'])

        d.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d)

        number_of_queries_from_combined_workload = 0
        for sess in combinedWorkload:
            for op in sess["operations"]:
                number_of_queries_from_combined_workload += 1
                
        print "number of queries after query combination: " + str(number_of_queries_from_combined_workload)

        self.assertGreater(original_number_of_queries, number_of_queries_from_combined_workload)
    def testQueriesCombination(self):
        """Test if the total number of queries are reduced"""
        original_number_of_queries = 0
        for sess in self.workload:
            for op in sess["operations"]:
                original_number_of_queries += 1

        print "orignal number of queries: " + str(original_number_of_queries)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        for col_name in self.collections.iterkeys():
            d.addCollection(col_name)

        d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)

        combinedWorkload = combiner.process(d)

        number_of_queries_from_combined_workload = 0
        for sess in combinedWorkload:
            for op in sess["operations"]:
                number_of_queries_from_combined_workload += 1
                
        print "number of queries after query combination: " + str(number_of_queries_from_combined_workload)

        self.assertGreater(original_number_of_queries, number_of_queries_from_combined_workload)
    def testDiskCostNotChangedAfterQueryCombination(self):
        """Disk cost should not be changed after query combination"""
        d = Design()
        d = Design()
        for col_name in self.collections.iterkeys():
            d.addCollection(col_name)
        
        cost0 = self.cm.getCost(d)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        d = Design()
        for col_name in self.collections.iterkeys():
            d.addCollection(col_name)
            self.state.invalidateCache(col_name)
            
        d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)

        combinedWorkload = combiner.process(d)
        self.state.updateWorkload(combinedWorkload)
                
        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)

        print "cost1 " + str(cost1)
        
        self.assertEqual(cost0, cost1)
Ejemplo n.º 4
0
    def testQueriesCombination(self):
        """Test if the total number of queries are reduced"""
        original_number_of_queries = 0
        for sess in self.workload:
            for op in sess["operations"]:
                original_number_of_queries += 1

        print "orignal number of queries: " + str(original_number_of_queries)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d.addCollection(col_info['name'])

        d.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d)

        number_of_queries_from_combined_workload = 0
        for sess in combinedWorkload:
            for op in sess["operations"]:
                number_of_queries_from_combined_workload += 1

        print "number of queries after query combination: " + str(
            number_of_queries_from_combined_workload)

        self.assertGreater(original_number_of_queries,
                           number_of_queries_from_combined_workload)
Ejemplo n.º 5
0
 def testNetworkCostDenormalization(self):
     """Check network cost for queries that reference denormalized collections"""
     # Get the "base" design cost when all of the collections
     # are sharded on their "interesting" fields
     d = Design()
     i = 0
     for col_info in self.collections.itervalues():
         d.addCollection(col_info['name'])
         if i == 0:
             d.addShardKey(col_info['name'], col_info['interesting'])
         else:
             d.addShardKey(col_info['name'], ["_id"])
         
         self.cm.invalidateCache(d, col_info['name'])
         i += 1
     ## FOR
     self.cm.reset()
     self.state.reset()
     cost0 = self.cm.getCost(d)
     
     print "cost0:", cost0
     
     # Now get the network cost for when we denormalize the
     # second collection inside of the first one
     # We should have a lower cost because there should now be fewer queries
     d = Design()
     i = 0
     for col_info in self.collections.itervalues():
         self.assertTrue(col_info['interesting'])
         d.addCollection(col_info['name'])
         if i == 0:
             d.addShardKey(col_info['name'], col_info['interesting'])
         else:
             d.addShardKey(col_info['name'], ["_id"])
         self.cm.invalidateCache(d, col_info['name'])
         i += 1
         
     d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)
        
     combiner = WorkloadCombiner(self.collections, self.workload)
     combinedWorkload = combiner.process(d)
     self.state.updateWorkload(combinedWorkload)
     
     self.cm.reset()
     self.state.reset()
     cost1 = self.cm.getCost(d)
     print "cost1:", cost1
    
     self.assertLess(cost1, cost0)
Ejemplo n.º 6
0
    def testDiskCostChangesAfterQueryCombination(self):
        """
            Assume we have collection A, B, C and we want to embed C to A
            If we build index on field00 of A and field02 of C
            The cost after query combination should be lower
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])
        
        cost0 = self.cm.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])
            
        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)
                
        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d1)

        print "cost1 " + str(cost1)
        
        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cm.reset()
        print "child collection ", self.cm.child_collections
        self.cm.state.reset()
        cost2 = self.cm.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
Ejemplo n.º 7
0
    def testDiskCostChangesAfterQueryCombination(self):
        """
            Assume we have collection A, B, C and we want to embed C to A
            If we build index on field00 of A and field02 of C
            The cost after query combination should be lower
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])

        cost0 = self.cm.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])

        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d1)

        print "cost1 " + str(cost1)

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cm.reset()
        print "child collection ", self.cm.child_collections
        self.cm.state.reset()
        cost2 = self.cm.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
Ejemplo n.º 8
0
    def testDenormalizer(self):
        """After denormalizing with "apples" as the parent of "koalas",
        "koalas" must not appear in the operation or collection listings."""
        d = Design()
        for col_name in self.col_names:
            d.addCollection(col_name)
        ## FOR
        # The pre-denormalization listings are not asserted on, so their
        # return values are not kept. NOTE(review): the calls are retained in
        # case the helpers emit output as a side effect -- confirm.
        self.printOperations()
        self.printAllCollections()
        d.setDenormalizationParent("koalas", "apples")

        dn = Denormalizer(self.metadata_db, self.dataset_db, d)
        dn.process()

        new_op_list = self.printOperations()
        new_col_list = self.printAllCollections()

        # assertNotIn reports both the member and the container on failure
        self.assertNotIn("koalas", new_op_list)
        self.assertNotIn("koalas", new_col_list)
Ejemplo n.º 9
0
 def testDenormalizer(self):
     """After denormalizing with "apples" as the parent of "koalas",
     "koalas" must not appear in the operation or collection listings."""
     d = Design()
     for col_name in self.col_names:
         d.addCollection(col_name)
     ## FOR
     # The pre-denormalization listings are not asserted on, so their
     # return values are not kept. NOTE(review): the calls are retained in
     # case the helpers emit output as a side effect -- confirm.
     self.printOperations()
     self.printAllCollections()
     d.setDenormalizationParent("koalas", "apples")

     dn = Denormalizer(self.metadata_db, self.dataset_db, d)
     dn.process()

     new_op_list = self.printOperations()
     new_col_list = self.printAllCollections()

     # assertNotIn reports both the member and the container on failure
     self.assertNotIn("koalas", new_op_list)
     self.assertNotIn("koalas", new_col_list)
Ejemplo n.º 10
0
    def testNetworkCostShouldReduceAfterQueryCombination(self):
        """
            Network cost should be reduce after embedding collections
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])
        cost0 = self.cmn.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])

        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cmn.reset()
        self.cmn.state.reset()
        cost1 = self.cmn.getCost(d1)

        print "cost1 " + str(cost1)

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cmn.reset()
        self.cmn.state.reset()
        cost2 = self.cmn.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
Ejemplo n.º 11
0
    def testNetworkCostShouldReduceAfterQueryCombination(self):
        """
            Network cost should be reduce after embedding collections
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])
        cost0 = self.cmn.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])

        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cmn.reset()
        self.cmn.state.reset()
        cost1 = self.cmn.getCost(d1)

        print "cost1 " + str(cost1)

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cmn.reset()
        self.cmn.state.reset()
        cost2 = self.cmn.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
Ejemplo n.º 12
0
    def testGuessIndex_IndexSizeEstimation_Denormalization(self):
        """
            Denormalizing collection A into B should make the index of B larger
            (if and only if the index is on a field that both A and B contain)
        """
        col_names = ("apple", "microsoft", "google")

        # Register all collections first, then give each a field00 index
        design = Design()
        for name in col_names:
            design.addCollection(name)
        for name in col_names:
            design.addIndex(name, ["field00"])

        # Operation #4 uses the field00 index but only targets "microsoft"
        target_op = self.ops[4]

        # Estimate without any denormalization
        _, _, size_plain, _ = self.cm.guess_op_info(design, target_op)

        # Estimate with a single denormalization: microsoft is apple's parent
        design.setDenormalizationParent("apple", "microsoft")
        self.cm.buildEmbeddingCostDictionary(design)
        _, _, size_single, _ = self.cm.guess_op_info(design, target_op)

        self.assertGreater(size_single, size_plain)

        # Estimate with a chain: apple additionally becomes google's parent
        self.cm.reset()
        design.setDenormalizationParent("google", "apple")
        self.cm.buildEmbeddingCostDictionary(design)
        _, _, size_chained, _ = self.cm.guess_op_info(design, target_op)

        self.assertGreater(size_chained, size_single)
 def testGuessIndex_IndexSizeEstimation_Denormalization(self):
     """
         Denormalizing collection A into B should make the index of B larger
         (if and only if the index is on a field that both A and B contain)
     """
     col_names = ("apple", "microsoft", "google")

     # Register all collections first, then give each a field00 index
     design = Design()
     for name in col_names:
         design.addCollection(name)
     for name in col_names:
         design.addIndex(name, ["field00"])

     # Operation #4 uses the field00 index but only targets "microsoft"
     target_op = self.ops[4]

     # Estimate without any denormalization
     _, _, size_plain, _ = self.cm.guess_op_info(design, target_op)

     # Estimate with a single denormalization: microsoft is apple's parent
     design.setDenormalizationParent("apple", "microsoft")
     self.cm.buildEmbeddingCostDictionary(design)
     _, _, size_single, _ = self.cm.guess_op_info(design, target_op)

     self.assertGreater(size_single, size_plain)

     # Estimate with a chain: apple additionally becomes google's parent
     self.cm.reset()
     design.setDenormalizationParent("google", "apple")
     self.cm.buildEmbeddingCostDictionary(design)
     _, _, size_chained, _ = self.cm.guess_op_info(design, target_op)

     self.assertGreater(size_chained, size_single)
Ejemplo n.º 14
0
    def testNetworkCostDenormalization(self):
        """Check network cost for queries that reference denormalized collections"""
        # Get the "base" design cost when all of the collections
        # are sharded on their "interesting" fields
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            if i == 0:
                d0.addShardKey(col_info['name'], col_info['interesting'])
            else:
                d0.addShardKey(col_info['name'], ["_id"])
            self.cm.invalidateCache(d0, col_info['name'])
        ## FOR
        self.cm.reset()
        self.state.reset()
        cost0 = self.cm.getCost(d0)
        
        print "cost0:", cost0
        
        # Now get the network cost for when we denormalize the
        # second collection inside of the first one
        # We should have a lower cost because there should now be fewer queries
        d1 = Design()
        for i in xrange(0, len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            self.assertTrue(col_info['interesting'])
            d1.addCollection(col_info['name'])
            if i == 0:
                d1.addShardKey(col_info['name'], col_info['interesting'])
            else:
                parent = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
                self.assertIsNotNone(parent)
                d1.setDenormalizationParent(col_info['name'], parent['name'])
                self.assertTrue(d1.isDenormalized(col_info['name']), col_info['name'])
                self.assertIsNotNone(d1.getDenormalizationParent(col_info['name']))
            
            self.cm.invalidateCache(d1, col_info['name'])

        combiner = WorkloadCombiner(self.collections, self.workload)
        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)
        
        self.cm.reset()
        self.state.reset()
        cost1 = self.cm.getCost(d1)
        print "cost1:", cost1
       
        self.assertLess(cost1, cost0)

        # The denormalization cost should also be the same as the cost
        # when we remove all of the ops one the second collection
        backup_collections = copy.deepcopy(self.collections)

        for sess in self.state.workload:
            for op in sess["operations"]:
                if op["collection"] <> CostModelTestCase.COLLECTION_NAMES[0]:
                    sess["operations"].remove(op)
            ## FOR (op)
        ## FOR (sess)
        for i in xrange(1, len(CostModelTestCase.COLLECTION_NAMES)):
            del self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            print "deleted name: ", CostModelTestCase.COLLECTION_NAMES[i]

        self.cm.reset()
        self.state.reset()
        cost2 = self.cm.getCost(d1)
        print "cost2:", cost2

        self.assertEqual(cost1, cost2)

        # Restore the original workload and see if the cost remains the same with the original one
        self.state.restoreOriginalWorkload()
        self.state.collections = backup_collections
        
        self.cm.reset()
        self.state.reset()
        cost3 = self.cm.getCost(d0)
        print "cost3:", cost3
        
        self.assertEqual(cost3, cost0)