def testEstimateNodesRange(self):
        """Check the estimating touched nodes for a range predicate op"""

        col_info = self.collections[COLLECTION_NAMES[0]]
        shard_key = col_info["interesting"][0]
        col_info["fields"][shard_key]["selectivity"] = 0.5

        d = Design()
        d.addCollection(col_info["name"])
        d.addShardKey(col_info["name"], [shard_key])

        sess = self.metadata_db.Session.fetch_one()
        op = sess["operations"][0]
        op["query_content"] = [
            {
                constants.REPLACE_KEY_DOLLAR_PREFIX
                + "query": {shard_key: {constants.REPLACE_KEY_DOLLAR_PREFIX + "gt": 10000L}}
            }
        ]
        op["predicates"] = {shard_key: constants.PRED_TYPE_RANGE}

        # The list estimated touched nodes should contain more than one entry
        touched0 = list(self.estimator.estimateNodes(d, op))
        print "touched0:", touched0
        self.assertGreater(len(touched0), 1)
Ejemplo n.º 2
0
    def testNetworkCost(self):
        """Check network cost for equality predicate queries"""
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
        self.assertTrue(col_info['interesting'])

        # If we shard the collection on the interesting fields, then
        # each query should only need to touch one node
        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], col_info['interesting'])
        cost0 = self.cm.getCost(d)
        print "cost0: ", cost0

        # If we now shard the collection on just '_id', then every query
        # should have to touch every node. The cost of this design
        # should be greater than the first one
        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], ['_id'])
        self.cm.invalidateCache(d, col_info['name'])
        self.state.reset()
        cost1 = self.cm.getCost(d)
        print "cost1: ", cost1

        self.assertLess(cost0, cost1)
    def testSkewCost(self):
        """Check whether skew cost calculations work correctly"""
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
        shard_key = col_info['interesting'][0]

        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], [shard_key])

        # First get the skew cost when the queries got each node uniformly
        # This is the best-case scenario
        op_ctr = 0
        for sess in self.workload:
            for op in sess['operations']:
                query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
                           {shard_key: op_ctr % CostModelTestCase.NUM_NODES }\
                } ]
                op['collection'] = col_info['name']
                op['query_content'] = query_content
                op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
                op_ctr += 1
            ## FOR (op)
        ## FOR (session)

        col_info["fields"][shard_key]["ranges"] = range(CostModelTestCase.NUM_NODES)

        cost0 = self.cm.getCost(d)
        self.assertLessEqual(cost0, 1.0)
        #        print "skewCost0:", cost0

        # Then make all of the operations go to a single node
        # This is the worst-case scenario
        query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
                                   {shard_key: 1000l }\
        } ]
        for sess in self.workload:
            for op in sess['operations']:
                op['collection'] = col_info['name']
                op['query_content'] = query_content
                op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
            ## FOR
        self.state.reset()
        self.cm.reset()
        cost1 = self.cm.getCost(d)
        self.assertLessEqual(cost1, 1.0)
        #        print "skewCost1:", cost1

        self.assertGreater(cost1, cost0)
    def testSkewCost(self):
        """Check whether skew cost calculations work correctly"""
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
        shard_key = col_info['interesting'][0]

        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], [shard_key])

        # First get the skew cost when the queries got each node uniformly
        # This is the best-case scenario
        op_ctr = 0
        for sess in self.workload:
            for op in sess['operations']:
                query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
                           {shard_key: op_ctr % CostModelTestCase.NUM_NODES }\
                } ]
                op['collection'] = col_info['name']
                op['query_content'] = query_content
                op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
                op_ctr += 1
            ## FOR (op)
        ## FOR (session)
        cost0 = self.cm.getCost(d)
        self.assertLessEqual(cost0, 1.0)
        #        print "skewCost0:", cost0

        # Then make all of the operations go to a single node
        # This is the worst-case scenario
        query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
                                   {shard_key: 1000l }\
        } ]
        for sess in self.workload:
            for op in sess['operations']:
                op['collection'] = col_info['name']
                op['query_content'] = query_content
                op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
            ## FOR
        self.state.reset()
        self.cm.reset()
        cost1 = self.cm.getCost(d)
        self.assertLessEqual(cost1, 1.0)
        #        print "skewCost1:", cost1

        self.assertGreater(cost1, cost0)
Ejemplo n.º 5
0
    def testEstimateNodesNullValue(self):
        """Check node estimation when an operation has no value for the shard key."""

        # Shard every collection on 'XXXX', a key the operations are not
        # expected to contain; the estimator should still produce a result
        design = Design()
        for col_name in COLLECTION_NAMES:
            info = self.collections[col_name]
            design.addCollection(info['name'])
            design.addShardKey(info['name'], ['XXXX'])

        # A query that does not look up on the sharding key has to be
        # broadcast to all of the nodes
        session = self.metadata_db.Session.fetch_one()
        first_op = session['operations'][0]
        broadcast = list(self.estimator.estimateNodes(design, first_op))
        self.assertListEqual(range(NUM_NODES), broadcast)

        # Turn the same operation into an insert of its response content,
        # with predicates cleared to an empty list; a document lacking the
        # sharding key should be sent to exactly one node
        first_op['type'] = constants.OP_TYPE_INSERT
        first_op['query_content'] = first_op['resp_content']
        first_op['predicates'] = []
        first_insert = list(self.estimator.estimateNodes(design, first_op))
        self.assertEqual(1, len(first_insert))

        # A second, hand-built insert without the sharding key must be
        # routed to that same node
        second_op = Session.operationFactory()
        second_op['collection'] = COLLECTION_NAMES[0]
        second_op['type'] = constants.OP_TYPE_INSERT
        second_op['query_id'] = 10000
        second_op['query_content'] = [{"parkinglot": 1234}]
        second_op['resp_content'] = [{"ok": 1}]
        second_op['resp_id'] = 10001
        second_insert = list(self.estimator.estimateNodes(design, second_op))
        self.assertEqual(1, len(second_insert))
        self.assertListEqual(first_insert, second_insert)
    def testEstimateNodesNullValue(self):
        """Verify the touched-node estimate when the sharding key value is missing."""

        design = Design()
        for name in COLLECTION_NAMES:
            collection = self.collections[name]
            design.addCollection(collection["name"])
            # "XXXX" is not expected to be among the operation's fields, yet
            # the estimator should still hand back a value
            design.addShardKey(collection["name"], ["XXXX"])

        # Step 1: a lookup on a non-sharding key goes to every node
        fetched = self.metadata_db.Session.fetch_one()
        lookup_op = fetched["operations"][0]
        all_nodes = list(self.estimator.estimateNodes(design, lookup_op))
        self.assertListEqual(range(NUM_NODES), all_nodes)

        # Step 2: re-purpose the operation as an insert whose document is the
        # original response content; it should land on a single node
        lookup_op["type"] = constants.OP_TYPE_INSERT
        lookup_op["query_content"] = lookup_op["resp_content"]
        lookup_op["predicates"] = []
        nodes_a = list(self.estimator.estimateNodes(design, lookup_op))
        self.assertEqual(1, len(nodes_a))

        # Step 3: another insert lacking the sharding key should be sent to
        # the same single node as the previous one
        fresh_op = Session.operationFactory()
        fresh_op["collection"] = COLLECTION_NAMES[0]
        fresh_op["type"] = constants.OP_TYPE_INSERT
        fresh_op["query_id"] = 10000
        fresh_op["query_content"] = [{"parkinglot": 1234}]
        fresh_op["resp_content"] = [{"ok": 1}]
        fresh_op["resp_id"] = 10001
        nodes_b = list(self.estimator.estimateNodes(design, fresh_op))
        self.assertEqual(1, len(nodes_b))
        self.assertListEqual(nodes_a, nodes_b)
Ejemplo n.º 7
0
    def testEstimateNodesEquality(self):
        """Estimating nodes twice for the same operation must give identical node ids."""

        design = Design()
        for col_name in COLLECTION_NAMES:
            info = self.collections[col_name]
            design.addCollection(info['name'])
            # Shard on the first interesting field only; compound sharding
            # keys are left for later
            design.addShardKey(info['name'], info['interesting'][:1])

        session = self.metadata_db.Session.fetch_one()
        first_op = session['operations'][0]

        # Run the estimate two times in a row and compare the results
        runs = [list(self.estimator.estimateNodes(design, first_op)) for _ in (0, 1)]
        self.assertListEqual(runs[0], runs[1])
    def testEstimateNodesEquality(self):
        """Check that repeated node estimates for one operation agree with each other."""

        design = Design()
        for name in COLLECTION_NAMES:
            collection = self.collections[name]
            single_key = collection["interesting"][:1]
            design.addCollection(collection["name"])
            # Only the first interesting field is used as the sharding key;
            # compound keys are not covered here
            design.addShardKey(collection["name"], single_key)

        fetched = self.metadata_db.Session.fetch_one()
        operation = fetched["operations"][0]

        # Two consecutive estimates must return exactly the same node ids
        nodes_first = list(self.estimator.estimateNodes(design, operation))
        nodes_second = list(self.estimator.estimateNodes(design, operation))
        self.assertListEqual(nodes_first, nodes_second)
Ejemplo n.º 9
0
 def testNetworkCostDenormalization(self):
     """Check network cost for queries that reference denormalized collections"""
     # Get the "base" design cost when all of the collections
     # are sharded on their "interesting" fields
     d = Design()
     i = 0
     for col_info in self.collections.itervalues():
         d.addCollection(col_info['name'])
         if i == 0:
             d.addShardKey(col_info['name'], col_info['interesting'])
         else:
             d.addShardKey(col_info['name'], ["_id"])
         
         self.cm.invalidateCache(d, col_info['name'])
         i += 1
     ## FOR
     self.cm.reset()
     self.state.reset()
     cost0 = self.cm.getCost(d)
     
     print "cost0:", cost0
     
     # Now get the network cost for when we denormalize the
     # second collection inside of the first one
     # We should have a lower cost because there should now be fewer queries
     d = Design()
     i = 0
     for col_info in self.collections.itervalues():
         self.assertTrue(col_info['interesting'])
         d.addCollection(col_info['name'])
         if i == 0:
             d.addShardKey(col_info['name'], col_info['interesting'])
         else:
             d.addShardKey(col_info['name'], ["_id"])
         self.cm.invalidateCache(d, col_info['name'])
         i += 1
         
     d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)
        
     combiner = WorkloadCombiner(self.collections, self.workload)
     combinedWorkload = combiner.process(d)
     self.state.updateWorkload(combinedWorkload)
     
     self.cm.reset()
     self.state.reset()
     cost1 = self.cm.getCost(d)
     print "cost1:", cost1
    
     self.assertLess(cost1, cost0)
Ejemplo n.º 10
0
    def testEstimateNodesRange(self):
        """Check the estimating touched nodes for a range predicate op"""

        col_info = self.collections[COLLECTION_NAMES[0]]
        shard_key = col_info['interesting'][0]
        col_info['fields'][shard_key]['selectivity'] = 0.5

        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], [shard_key])

        sess = self.metadata_db.Session.fetch_one()
        op = sess['operations'][0]
        op['query_content'] = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query": \
                {shard_key: {constants.REPLACE_KEY_DOLLAR_PREFIX+"gt": 10000l} } \
        } ]
        op['predicates'] = {shard_key: constants.PRED_TYPE_RANGE}

        # The list estimated touched nodes should contain more than one entry
        touched0 = list(self.estimator.estimateNodes(d, op))
        print "touched0:", touched0
        self.assertGreater(len(touched0), 1)
    def getManMadeDesign(self, denorm=True):
        """Build and return a hand-written Design over the tpccConstants tables.

        The `denorm` argument is accepted but not used by this method.
        """
        design = Design()

        # Register the collections
        for table in (
            tpccConstants.TABLENAME_ITEM,
            tpccConstants.TABLENAME_WAREHOUSE,
            tpccConstants.TABLENAME_DISTRICT,
            tpccConstants.TABLENAME_CUSTOMER,
            tpccConstants.TABLENAME_STOCK,
            tpccConstants.TABLENAME_ORDERS,
            tpccConstants.TABLENAME_NEW_ORDER,
            tpccConstants.TABLENAME_ORDER_LINE,
        ):
            design.addCollection(table)

        # Indexes, in the order they are added (ORDERS gets two of them)
        index_specs = (
            (tpccConstants.TABLENAME_ITEM, ["I_ID"]),
            (tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"]),
            (tpccConstants.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"]),
            (tpccConstants.TABLENAME_CUSTOMER, ["C_W_ID", "C_D_ID", "C_ID"]),
            (tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_C_ID"]),
            (tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_ID"]),
            (tpccConstants.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"]),
            (tpccConstants.TABLENAME_NEW_ORDER, ["NO_W_ID", "NO_D_ID", "NO_O_ID"]),
            (tpccConstants.TABLENAME_ORDER_LINE, ["OL_W_ID", "OL_D_ID", "OL_O_ID"]),
        )
        for table, fields in index_specs:
            design.addIndex(table, fields)

        # Shard keys: ITEM on "I_ID", everything else on "W_ID".
        # NOTE(review): the indexes use table-prefixed columns (e.g. D_W_ID)
        # while the shard keys use a bare "W_ID" -- confirm this is intended.
        design.addShardKey(tpccConstants.TABLENAME_ITEM, ["I_ID"])
        for table in (
            tpccConstants.TABLENAME_WAREHOUSE,
            tpccConstants.TABLENAME_DISTRICT,
            tpccConstants.TABLENAME_CUSTOMER,
            tpccConstants.TABLENAME_ORDERS,
            tpccConstants.TABLENAME_STOCK,
            tpccConstants.TABLENAME_NEW_ORDER,
            tpccConstants.TABLENAME_ORDER_LINE,
        ):
            design.addShardKey(table, ["W_ID"])

        return design
    def getManMadeDesign(self, denorm=True):
        """Create a manually specified Design (collections, indexes, shard keys).

        `denorm` is part of the signature but is not read anywhere in the body.
        """
        T = tpccConstants
        design = Design()

        collections = [
            T.TABLENAME_ITEM,
            T.TABLENAME_WAREHOUSE,
            T.TABLENAME_DISTRICT,
            T.TABLENAME_CUSTOMER,
            T.TABLENAME_STOCK,
            T.TABLENAME_ORDERS,
            T.TABLENAME_NEW_ORDER,
            T.TABLENAME_ORDER_LINE,
        ]
        for name in collections:
            design.addCollection(name)

        # (collection, indexed fields) pairs; ORDERS appears twice
        indexes = [
            (T.TABLENAME_ITEM, ["I_ID"]),
            (T.TABLENAME_WAREHOUSE, ["W_ID"]),
            (T.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"]),
            (T.TABLENAME_CUSTOMER, ["C_W_ID", "C_D_ID", "C_ID"]),
            (T.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_C_ID"]),
            (T.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_ID"]),
            (T.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"]),
            (T.TABLENAME_NEW_ORDER, ["NO_W_ID", "NO_D_ID", "NO_O_ID"]),
            (T.TABLENAME_ORDER_LINE, ["OL_W_ID", "OL_D_ID", "OL_O_ID"]),
        ]
        for name, columns in indexes:
            design.addIndex(name, columns)

        # (collection, shard key) pairs.
        # NOTE(review): every key except ITEM's is the bare "W_ID", whereas the
        # index columns above carry a per-table prefix -- verify against the schema.
        shard_keys = [
            (T.TABLENAME_ITEM, ["I_ID"]),
            (T.TABLENAME_WAREHOUSE, ["W_ID"]),
            (T.TABLENAME_DISTRICT, ["W_ID"]),
            (T.TABLENAME_CUSTOMER, ["W_ID"]),
            (T.TABLENAME_ORDERS, ["W_ID"]),
            (T.TABLENAME_STOCK, ["W_ID"]),
            (T.TABLENAME_NEW_ORDER, ["W_ID"]),
            (T.TABLENAME_ORDER_LINE, ["W_ID"]),
        ]
        for name, key in shard_keys:
            design.addShardKey(name, key)

        return design
Ejemplo n.º 13
0
    def testNetworkCostDenormalization(self):
        """Check network cost for queries that reference denormalized collections"""
        # Get the "base" design cost when all of the collections
        # are sharded on their "interesting" fields
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            if i == 0:
                d0.addShardKey(col_info['name'], col_info['interesting'])
            else:
                d0.addShardKey(col_info['name'], ["_id"])
            self.cm.invalidateCache(d0, col_info['name'])
        ## FOR
        self.cm.reset()
        self.state.reset()
        cost0 = self.cm.getCost(d0)
        
        print "cost0:", cost0
        
        # Now get the network cost for when we denormalize the
        # second collection inside of the first one
        # We should have a lower cost because there should now be fewer queries
        d1 = Design()
        for i in xrange(0, len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            self.assertTrue(col_info['interesting'])
            d1.addCollection(col_info['name'])
            if i == 0:
                d1.addShardKey(col_info['name'], col_info['interesting'])
            else:
                parent = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
                self.assertIsNotNone(parent)
                d1.setDenormalizationParent(col_info['name'], parent['name'])
                self.assertTrue(d1.isDenormalized(col_info['name']), col_info['name'])
                self.assertIsNotNone(d1.getDenormalizationParent(col_info['name']))
            
            self.cm.invalidateCache(d1, col_info['name'])

        combiner = WorkloadCombiner(self.collections, self.workload)
        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)
        
        self.cm.reset()
        self.state.reset()
        cost1 = self.cm.getCost(d1)
        print "cost1:", cost1
       
        self.assertLess(cost1, cost0)

        # The denormalization cost should also be the same as the cost
        # when we remove all of the ops one the second collection
        backup_collections = copy.deepcopy(self.collections)

        for sess in self.state.workload:
            for op in sess["operations"]:
                if op["collection"] <> CostModelTestCase.COLLECTION_NAMES[0]:
                    sess["operations"].remove(op)
            ## FOR (op)
        ## FOR (sess)
        for i in xrange(1, len(CostModelTestCase.COLLECTION_NAMES)):
            del self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            print "deleted name: ", CostModelTestCase.COLLECTION_NAMES[i]

        self.cm.reset()
        self.state.reset()
        cost2 = self.cm.getCost(d1)
        print "cost2:", cost2

        self.assertEqual(cost1, cost2)

        # Restore the original workload and see if the cost remains the same with the original one
        self.state.restoreOriginalWorkload()
        self.state.collections = backup_collections
        
        self.cm.reset()
        self.state.reset()
        cost3 = self.cm.getCost(d0)
        print "cost3:", cost3
        
        self.assertEqual(cost3, cost0)