def testEstimateNodesRange(self): """Check the estimating touched nodes for a range predicate op""" col_info = self.collections[COLLECTION_NAMES[0]] shard_key = col_info["interesting"][0] col_info["fields"][shard_key]["selectivity"] = 0.5 d = Design() d.addCollection(col_info["name"]) d.addShardKey(col_info["name"], [shard_key]) sess = self.metadata_db.Session.fetch_one() op = sess["operations"][0] op["query_content"] = [ { constants.REPLACE_KEY_DOLLAR_PREFIX + "query": {shard_key: {constants.REPLACE_KEY_DOLLAR_PREFIX + "gt": 10000L}} } ] op["predicates"] = {shard_key: constants.PRED_TYPE_RANGE} # The list estimated touched nodes should contain more than one entry touched0 = list(self.estimator.estimateNodes(d, op)) print "touched0:", touched0 self.assertGreater(len(touched0), 1)
def testNetworkCost(self): """Check network cost for equality predicate queries""" col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]] self.assertTrue(col_info['interesting']) # If we shard the collection on the interesting fields, then # each query should only need to touch one node d = Design() d.addCollection(col_info['name']) d.addShardKey(col_info['name'], col_info['interesting']) cost0 = self.cm.getCost(d) print "cost0: ", cost0 # If we now shard the collection on just '_id', then every query # should have to touch every node. The cost of this design # should be greater than the first one d = Design() d.addCollection(col_info['name']) d.addShardKey(col_info['name'], ['_id']) self.cm.invalidateCache(d, col_info['name']) self.state.reset() cost1 = self.cm.getCost(d) print "cost1: ", cost1 self.assertLess(cost0, cost1)
def testSkewCost(self): """Check whether skew cost calculations work correctly""" col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]] shard_key = col_info['interesting'][0] d = Design() d.addCollection(col_info['name']) d.addShardKey(col_info['name'], [shard_key]) # First get the skew cost when the queries got each node uniformly # This is the best-case scenario op_ctr = 0 for sess in self.workload: for op in sess['operations']: query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\ {shard_key: op_ctr % CostModelTestCase.NUM_NODES }\ } ] op['collection'] = col_info['name'] op['query_content'] = query_content op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY } op_ctr += 1 ## FOR (op) ## FOR (session) col_info["fields"][shard_key]["ranges"] = range(CostModelTestCase.NUM_NODES) cost0 = self.cm.getCost(d) self.assertLessEqual(cost0, 1.0) # print "skewCost0:", cost0 # Then make all of the operations go to a single node # This is the worst-case scenario query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\ {shard_key: 1000l }\ } ] for sess in self.workload: for op in sess['operations']: op['collection'] = col_info['name'] op['query_content'] = query_content op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY } ## FOR self.state.reset() self.cm.reset() cost1 = self.cm.getCost(d) self.assertLessEqual(cost1, 1.0) # print "skewCost1:", cost1 self.assertGreater(cost1, cost0)
def testSkewCost(self): """Check whether skew cost calculations work correctly""" col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]] shard_key = col_info['interesting'][0] d = Design() d.addCollection(col_info['name']) d.addShardKey(col_info['name'], [shard_key]) # First get the skew cost when the queries got each node uniformly # This is the best-case scenario op_ctr = 0 for sess in self.workload: for op in sess['operations']: query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\ {shard_key: op_ctr % CostModelTestCase.NUM_NODES }\ } ] op['collection'] = col_info['name'] op['query_content'] = query_content op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY } op_ctr += 1 ## FOR (op) ## FOR (session) cost0 = self.cm.getCost(d) self.assertLessEqual(cost0, 1.0) # print "skewCost0:", cost0 # Then make all of the operations go to a single node # This is the worst-case scenario query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\ {shard_key: 1000l }\ } ] for sess in self.workload: for op in sess['operations']: op['collection'] = col_info['name'] op['query_content'] = query_content op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY } ## FOR self.state.reset() self.cm.reset() cost1 = self.cm.getCost(d) self.assertLessEqual(cost1, 1.0) # print "skewCost1:", cost1 self.assertGreater(cost1, cost0)
def testEstimateNodesNullValue(self):
    """Check the estimating touched nodes when the sharding key value is null"""
    design = Design()
    for name in COLLECTION_NAMES:
        col_info = self.collections[name]
        design.addCollection(col_info['name'])
        # This key won't be in the operation's fields, but we should still
        # be able to get back a value
        design.addShardKey(col_info['name'], ['XXXX'])
    ## FOR

    # A query that looks up on a non-sharding key should always be
    # broadcast to every node
    sess = self.metadata_db.Session.fetch_one()
    op = sess['operations'][0]
    touched0 = list(self.estimator.estimateNodes(design, op))
    # print "touched0:", touched0
    self.assertListEqual(range(NUM_NODES), touched0)

    # But if we insert into that collection with a document that doesn't
    # have the sharding key, it should only go to one node
    op['type'] = constants.OP_TYPE_INSERT
    op['query_content'] = op['resp_content']
    op['predicates'] = []
    # pprint(op)
    touched1 = list(self.estimator.estimateNodes(design, op))
    # print "touched1:", touched1
    self.assertEqual(1, len(touched1))

    # And if we insert another one, then we should get the same value back
    op = Session.operationFactory()
    op['collection'] = COLLECTION_NAMES[0]
    op['type'] = constants.OP_TYPE_INSERT
    op['query_id'] = 10000
    op['query_content'] = [{"parkinglot": 1234}]
    op['resp_content'] = [{"ok": 1}]
    op['resp_id'] = 10001
    # pprint(op)
    touched2 = list(self.estimator.estimateNodes(design, op))
    self.assertEqual(1, len(touched2))
    self.assertListEqual(touched1, touched2)
def testEstimateNodesNullValue(self):
    """Check the estimating touched nodes when the sharding key value is null"""
    d = Design()
    for col_name in COLLECTION_NAMES:
        info = self.collections[col_name]
        d.addCollection(info["name"])
        # This key won't be in the operation's fields, but we should still
        # be able to get back a value
        d.addShardKey(info["name"], ["XXXX"])
    ## FOR

    # A query that looks up on a non-sharding key should always be
    # broadcast to every node
    sess = self.metadata_db.Session.fetch_one()
    op = sess["operations"][0]
    broadcast_nodes = list(self.estimator.estimateNodes(d, op))
    # print "touched0:", broadcast_nodes
    self.assertListEqual(range(NUM_NODES), broadcast_nodes)

    # But if we insert into that collection with a document that doesn't
    # have the sharding key, it should only go to one node
    op["type"] = constants.OP_TYPE_INSERT
    op["query_content"] = op["resp_content"]
    op["predicates"] = []
    # pprint(op)
    insert_nodes = list(self.estimator.estimateNodes(d, op))
    # print "touched1:", insert_nodes
    self.assertEqual(1, len(insert_nodes))

    # And if we insert another one, then we should get the same value back
    op = Session.operationFactory()
    op["collection"] = COLLECTION_NAMES[0]
    op["type"] = constants.OP_TYPE_INSERT
    op["query_id"] = 10000
    op["query_content"] = [{"parkinglot": 1234}]
    op["resp_content"] = [{"ok": 1}]
    op["resp_id"] = 10001
    # pprint(op)
    repeat_nodes = list(self.estimator.estimateNodes(d, op))
    self.assertEqual(1, len(repeat_nodes))
    self.assertListEqual(insert_nodes, repeat_nodes)
def testEstimateNodesEquality(self):
    """Check the estimating touched nodes for a equality predicate op"""
    design = Design()
    for name in COLLECTION_NAMES:
        col_info = self.collections[name]
        design.addCollection(col_info['name'])
        # Only put the first field in the interesting list as the sharding key
        # We'll worry about compound sharding keys later.
        design.addShardKey(col_info['name'], col_info['interesting'][:1])
    ## FOR

    sess = self.metadata_db.Session.fetch_one()
    op = sess['operations'][0]
    # pprint(op)

    # Estimation must be deterministic: running the same op twice should
    # produce the exact same node ids
    first_pass = list(self.estimator.estimateNodes(design, op))
    second_pass = list(self.estimator.estimateNodes(design, op))
    self.assertListEqual(first_pass, second_pass)
def testEstimateNodesEquality(self):
    """Check the estimating touched nodes for a equality predicate op"""
    d = Design()
    for col_name in COLLECTION_NAMES:
        info = self.collections[col_name]
        d.addCollection(info["name"])
        # Only put the first field in the interesting list as the sharding key
        # We'll worry about compound sharding keys later.
        d.addShardKey(info["name"], info["interesting"][:1])
    ## FOR

    sess = self.metadata_db.Session.fetch_one()
    op = sess["operations"][0]
    # pprint(op)

    # If we execute it twice, we should get back the exact same node ids
    run1 = list(self.estimator.estimateNodes(d, op))
    run2 = list(self.estimator.estimateNodes(d, op))
    self.assertListEqual(run1, run2)
def testNetworkCostDenormalization(self): """Check network cost for queries that reference denormalized collections""" # Get the "base" design cost when all of the collections # are sharded on their "interesting" fields d = Design() i = 0 for col_info in self.collections.itervalues(): d.addCollection(col_info['name']) if i == 0: d.addShardKey(col_info['name'], col_info['interesting']) else: d.addShardKey(col_info['name'], ["_id"]) self.cm.invalidateCache(d, col_info['name']) i += 1 ## FOR self.cm.reset() self.state.reset() cost0 = self.cm.getCost(d) print "cost0:", cost0 # Now get the network cost for when we denormalize the # second collection inside of the first one # We should have a lower cost because there should now be fewer queries d = Design() i = 0 for col_info in self.collections.itervalues(): self.assertTrue(col_info['interesting']) d.addCollection(col_info['name']) if i == 0: d.addShardKey(col_info['name'], col_info['interesting']) else: d.addShardKey(col_info['name'], ["_id"]) self.cm.invalidateCache(d, col_info['name']) i += 1 d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS) combiner = WorkloadCombiner(self.collections, self.workload) combinedWorkload = combiner.process(d) self.state.updateWorkload(combinedWorkload) self.cm.reset() self.state.reset() cost1 = self.cm.getCost(d) print "cost1:", cost1 self.assertLess(cost1, cost0)
def testEstimateNodesRange(self): """Check the estimating touched nodes for a range predicate op""" col_info = self.collections[COLLECTION_NAMES[0]] shard_key = col_info['interesting'][0] col_info['fields'][shard_key]['selectivity'] = 0.5 d = Design() d.addCollection(col_info['name']) d.addShardKey(col_info['name'], [shard_key]) sess = self.metadata_db.Session.fetch_one() op = sess['operations'][0] op['query_content'] = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query": \ {shard_key: {constants.REPLACE_KEY_DOLLAR_PREFIX+"gt": 10000l} } \ } ] op['predicates'] = {shard_key: constants.PRED_TYPE_RANGE} # The list estimated touched nodes should contain more than one entry touched0 = list(self.estimator.estimateNodes(d, op)) print "touched0:", touched0 self.assertGreater(len(touched0), 1)
def getManMadeDesign(self, denorm=True):
    # Hand-crafted "best" design for the TPC-C schema: every table gets a
    # collection, an index on its natural lookup keys, and a shard key
    # (warehouse id for everything except ITEM).
    design = Design()

    tables = [
        tpccConstants.TABLENAME_ITEM,
        tpccConstants.TABLENAME_WAREHOUSE,
        tpccConstants.TABLENAME_DISTRICT,
        tpccConstants.TABLENAME_CUSTOMER,
        tpccConstants.TABLENAME_STOCK,
        tpccConstants.TABLENAME_ORDERS,
        tpccConstants.TABLENAME_NEW_ORDER,
        tpccConstants.TABLENAME_ORDER_LINE,
    ]
    for tbl in tables:
        design.addCollection(tbl)

    indexes = [
        (tpccConstants.TABLENAME_ITEM, ["I_ID"]),
        (tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"]),
        (tpccConstants.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"]),
        (tpccConstants.TABLENAME_CUSTOMER, ["C_W_ID", "C_D_ID", "C_ID"]),
        (tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_C_ID"]),
        (tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_ID"]),
        (tpccConstants.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"]),
        (tpccConstants.TABLENAME_NEW_ORDER, ["NO_W_ID", "NO_D_ID", "NO_O_ID"]),
        (tpccConstants.TABLENAME_ORDER_LINE, ["OL_W_ID", "OL_D_ID", "OL_O_ID"]),
    ]
    for tbl, key_fields in indexes:
        design.addIndex(tbl, key_fields)

    shard_keys = [
        (tpccConstants.TABLENAME_ITEM, ["I_ID"]),
        (tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"]),
        (tpccConstants.TABLENAME_DISTRICT, ["W_ID"]),
        (tpccConstants.TABLENAME_CUSTOMER, ["W_ID"]),
        (tpccConstants.TABLENAME_ORDERS, ["W_ID"]),
        (tpccConstants.TABLENAME_STOCK, ["W_ID"]),
        (tpccConstants.TABLENAME_NEW_ORDER, ["W_ID"]),
        (tpccConstants.TABLENAME_ORDER_LINE, ["W_ID"]),
    ]
    for tbl, key_fields in shard_keys:
        design.addShardKey(tbl, key_fields)

    return design
def getManMadeDesign(self, denorm=True):
    # Build the manually-tuned TPC-C design. Collections, indexes, and
    # shard keys are declared in the same order as the original so the
    # resulting Design object is identical.
    d = Design()

    all_tables = (
        tpccConstants.TABLENAME_ITEM,
        tpccConstants.TABLENAME_WAREHOUSE,
        tpccConstants.TABLENAME_DISTRICT,
        tpccConstants.TABLENAME_CUSTOMER,
        tpccConstants.TABLENAME_STOCK,
        tpccConstants.TABLENAME_ORDERS,
        tpccConstants.TABLENAME_NEW_ORDER,
        tpccConstants.TABLENAME_ORDER_LINE,
    )
    for table_name in all_tables:
        d.addCollection(table_name)

    # Secondary indexes on the natural lookup keys of each table.
    # ORDERS gets two: one for customer lookups, one for order-id lookups.
    index_specs = (
        (tpccConstants.TABLENAME_ITEM, ["I_ID"]),
        (tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"]),
        (tpccConstants.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"]),
        (tpccConstants.TABLENAME_CUSTOMER, ["C_W_ID", "C_D_ID", "C_ID"]),
        (tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_C_ID"]),
        (tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_ID"]),
        (tpccConstants.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"]),
        (tpccConstants.TABLENAME_NEW_ORDER, ["NO_W_ID", "NO_D_ID", "NO_O_ID"]),
        (tpccConstants.TABLENAME_ORDER_LINE, ["OL_W_ID", "OL_D_ID", "OL_O_ID"]),
    )
    for table_name, index_fields in index_specs:
        d.addIndex(table_name, index_fields)

    # Everything shards on the warehouse id except ITEM, which has no W_ID
    shard_specs = (
        (tpccConstants.TABLENAME_ITEM, ["I_ID"]),
        (tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"]),
        (tpccConstants.TABLENAME_DISTRICT, ["W_ID"]),
        (tpccConstants.TABLENAME_CUSTOMER, ["W_ID"]),
        (tpccConstants.TABLENAME_ORDERS, ["W_ID"]),
        (tpccConstants.TABLENAME_STOCK, ["W_ID"]),
        (tpccConstants.TABLENAME_NEW_ORDER, ["W_ID"]),
        (tpccConstants.TABLENAME_ORDER_LINE, ["W_ID"]),
    )
    for table_name, shard_fields in shard_specs:
        d.addShardKey(table_name, shard_fields)

    return d
def testNetworkCostDenormalization(self): """Check network cost for queries that reference denormalized collections""" # Get the "base" design cost when all of the collections # are sharded on their "interesting" fields d0 = Design() for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)): col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]] d0.addCollection(col_info['name']) if i == 0: d0.addShardKey(col_info['name'], col_info['interesting']) else: d0.addShardKey(col_info['name'], ["_id"]) self.cm.invalidateCache(d0, col_info['name']) ## FOR self.cm.reset() self.state.reset() cost0 = self.cm.getCost(d0) print "cost0:", cost0 # Now get the network cost for when we denormalize the # second collection inside of the first one # We should have a lower cost because there should now be fewer queries d1 = Design() for i in xrange(0, len(CostModelTestCase.COLLECTION_NAMES)): col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]] self.assertTrue(col_info['interesting']) d1.addCollection(col_info['name']) if i == 0: d1.addShardKey(col_info['name'], col_info['interesting']) else: parent = self.collections[CostModelTestCase.COLLECTION_NAMES[0]] self.assertIsNotNone(parent) d1.setDenormalizationParent(col_info['name'], parent['name']) self.assertTrue(d1.isDenormalized(col_info['name']), col_info['name']) self.assertIsNotNone(d1.getDenormalizationParent(col_info['name'])) self.cm.invalidateCache(d1, col_info['name']) combiner = WorkloadCombiner(self.collections, self.workload) combinedWorkload = combiner.process(d1) self.state.updateWorkload(combinedWorkload) self.cm.reset() self.state.reset() cost1 = self.cm.getCost(d1) print "cost1:", cost1 self.assertLess(cost1, cost0) # The denormalization cost should also be the same as the cost # when we remove all of the ops one the second collection backup_collections = copy.deepcopy(self.collections) for sess in self.state.workload: for op in sess["operations"]: if op["collection"] <> 
CostModelTestCase.COLLECTION_NAMES[0]: sess["operations"].remove(op) ## FOR (op) ## FOR (sess) for i in xrange(1, len(CostModelTestCase.COLLECTION_NAMES)): del self.collections[CostModelTestCase.COLLECTION_NAMES[i]] print "deleted name: ", CostModelTestCase.COLLECTION_NAMES[i] self.cm.reset() self.state.reset() cost2 = self.cm.getCost(d1) print "cost2:", cost2 self.assertEqual(cost1, cost2) # Restore the original workload and see if the cost remains the same with the original one self.state.restoreOriginalWorkload() self.state.collections = backup_collections self.cm.reset() self.state.reset() cost3 = self.cm.getCost(d0) print "cost3:", cost3 self.assertEqual(cost3, cost0)