def testDiskCostIndexes(self): """Check whether disk cost calculations work correctly""" # First get the disk cost when there are no indexes d = Design() col_info = self.collections[CostModelTestCase.COLLECTION_NAME] d.addCollection(col_info['name']) cost0 = self.cm.getCost(d) print "diskCost0:", cost0 # The cost should be exactly equal to one, which means that every operation # has to perform a full sequential scan on the collection self.assertEqual(cost0, 1.0) # Now add the all indexes. The disk cost should be lower d = Design() col_info = self.collections[CostModelTestCase.COLLECTION_NAME] d.addCollection(col_info['name']) d.addIndex(col_info['name'], col_info['interesting']) self.state.invalidateCache(col_info['name']) self.cm.reset() self.cm.state.reset() cost1 = self.cm.getCost(d) print "diskCost1:", cost1 self.assertGreater(cost0, cost1)
def testQueriesCombination(self): """Test if the total number of queries are reduced""" original_number_of_queries = 0 for sess in self.workload: for op in sess["operations"]: original_number_of_queries += 1 print "orignal number of queries: " + str(original_number_of_queries) # Initialize a combiner combiner = WorkloadCombiner(self.col_names, self.workload) # initialize a design with denormalization d = Design() for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)): col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]] d.addCollection(col_info['name']) d.setDenormalizationParent("koalas", "apples") combinedWorkload = combiner.process(d) number_of_queries_from_combined_workload = 0 for sess in combinedWorkload: for op in sess["operations"]: number_of_queries_from_combined_workload += 1 print "number of queries after query combination: " + str(number_of_queries_from_combined_workload) self.assertGreater(original_number_of_queries, number_of_queries_from_combined_workload)
def testInitializePreloading(self): """Check whether preloading the buffer works properly""" num_collections = 5 collections = dict() self.design = Design() for i in xrange(num_collections): col_name = "col%02d" % i col_info = catalog.Collection() col_info['name'] = col_name col_info['doc_count'] = NUM_DOCUMENTS col_info['workload_percent'] = 1 / float(num_collections) col_info['avg_doc_size'] = 1024 collections[col_name] = col_info self.design.addCollection(col_name) ## FOR self.buffer = LRUBuffer(collections, BUFFER_SIZE, preload=True) try: self.buffer.initialize(self.design) self.buffer.validate() except: print self.buffer raise
def testQueriesCombination(self): """Test if the total number of queries are reduced""" original_number_of_queries = 0 for sess in self.workload: for op in sess["operations"]: original_number_of_queries += 1 print "orignal number of queries: " + str(original_number_of_queries) # Initialize a combiner combiner = WorkloadCombiner(self.col_names, self.workload) # initialize a design with denormalization d = Design() for col_name in self.collections.iterkeys(): d.addCollection(col_name) d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS) combinedWorkload = combiner.process(d) number_of_queries_from_combined_workload = 0 for sess in combinedWorkload: for op in sess["operations"]: number_of_queries_from_combined_workload += 1 print "number of queries after query combination: " + str(number_of_queries_from_combined_workload) self.assertGreater(original_number_of_queries, number_of_queries_from_combined_workload)
def testEstimateNodesRange(self): """Check the estimating touched nodes for a range predicate op""" col_info = self.collections[COLLECTION_NAMES[0]] shard_key = col_info["interesting"][0] col_info["fields"][shard_key]["selectivity"] = 0.5 d = Design() d.addCollection(col_info["name"]) d.addShardKey(col_info["name"], [shard_key]) sess = self.metadata_db.Session.fetch_one() op = sess["operations"][0] op["query_content"] = [ { constants.REPLACE_KEY_DOLLAR_PREFIX + "query": {shard_key: {constants.REPLACE_KEY_DOLLAR_PREFIX + "gt": 10000L}} } ] op["predicates"] = {shard_key: constants.PRED_TYPE_RANGE} # The list estimated touched nodes should contain more than one entry touched0 = list(self.estimator.estimateNodes(d, op)) print "touched0:", touched0 self.assertGreater(len(touched0), 1)
def setUp(self):
    """Build an InitialDesigner fixture and a design over its collections."""
    TPCCTestCase.setUp(self)
    self.config = configutil.makeDefaultConfig()
    self.designer = InitialDesigner(self.collections, self.workload, self.config)
    self.col_keys = self.designer.generateCollectionHistograms()
    self.design = Design()
    # Register every collection with the design. A plain loop replaces
    # map(): we only want the side effect, not the throwaway list of
    # return values that map() builds.
    for col_name in self.col_keys.iterkeys():
        self.design.addCollection(col_name)
def testSameDesignExecutedTwice_withemptydesign(self):
    """
        If the same design is executed twice, they should have the same result
    """
    # Build a bare design containing every test collection
    design = Design()
    for name in CostModelTestCase.COLLECTION_NAMES:
        design.addCollection(name)
    ## FOR

    # Evaluating the identical design twice must be deterministic
    first_cost = self.cm.overallCost(design)
    second_cost = self.cm.overallCost(design)
    self.assertEqual(first_cost, second_cost)
def testGuessIndex_indexChooseWithProjectionField(self):
    """
        If a query uses one of the indexes the design has but its projection uses
        one of the indexes the design has, we should choose the index with both
        query index and projection index
    """
    # Design with indexes (field00, field02) and (field00): a query on
    # field00 whose projection needs field02 should pick the compound
    # index, which then also covers the query.
    design = Design()
    design.addCollection("apple")
    design.addIndex("apple", ["field00", "field02"])
    design.addIndex("apple", ["field00"])

    op = self.ops[0]
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(design, op)

    self.assertEqual(len(best_index), 2)
    self.assertEqual(best_index[0], "field00")
    self.assertEqual(best_index[1], "field02")
    self.assertTrue(covering)
def testDiskCostNotChangedAfterQueryCombination(self): """Disk cost should not be changed after query combination""" d = Design() d = Design() for col_name in self.collections.iterkeys(): d.addCollection(col_name) cost0 = self.cm.getCost(d) print "cost0 " + str(cost0) # Initialize a combiner combiner = WorkloadCombiner(self.col_names, self.workload) # initialize a design with denormalization d = Design() d = Design() for col_name in self.collections.iterkeys(): d.addCollection(col_name) self.state.invalidateCache(col_name) d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS) combinedWorkload = combiner.process(d) self.state.updateWorkload(combinedWorkload) self.cm.reset() self.cm.state.reset() cost1 = self.cm.getCost(d) print "cost1 " + str(cost1) self.assertEqual(cost0, cost1)
def testNotCollectionEmbeddingProcessShouldReturnNone(self):
    """
        If the given design has no collection embedding, we should return
        right away
    """
    # A design with collections and indexes but no denormalization parent
    d0 = Design()
    for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
        d0.addCollection(col_info['name'])
        d0.addIndex(col_info['name'], ['field00', 'field02'])

    # Initialize a combiner
    combiner = WorkloadCombiner(self.col_names, self.workload)
    combinedWorkload = combiner.process(d0)
    # assertIsNone gives a clearer failure message than assertEqual(None, ...)
    self.assertIsNone(combinedWorkload)
def testSkewCost(self): """Check whether skew cost calculations work correctly""" col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]] shard_key = col_info['interesting'][0] d = Design() d.addCollection(col_info['name']) d.addShardKey(col_info['name'], [shard_key]) # First get the skew cost when the queries got each node uniformly # This is the best-case scenario op_ctr = 0 for sess in self.workload: for op in sess['operations']: query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\ {shard_key: op_ctr % CostModelTestCase.NUM_NODES }\ } ] op['collection'] = col_info['name'] op['query_content'] = query_content op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY } op_ctr += 1 ## FOR (op) ## FOR (session) col_info["fields"][shard_key]["ranges"] = range(CostModelTestCase.NUM_NODES) cost0 = self.cm.getCost(d) self.assertLessEqual(cost0, 1.0) # print "skewCost0:", cost0 # Then make all of the operations go to a single node # This is the worst-case scenario query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\ {shard_key: 1000l }\ } ] for sess in self.workload: for op in sess['operations']: op['collection'] = col_info['name'] op['query_content'] = query_content op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY } ## FOR self.state.reset() self.cm.reset() cost1 = self.cm.getCost(d) self.assertLessEqual(cost1, 1.0) # print "skewCost1:", cost1 self.assertGreater(cost1, cost0)
def testDenormalizer(self):
    # Start from a design that contains every collection
    design = Design()
    for name in self.col_names:
        design.addCollection(name)
    ## FOR

    # Snapshot the workload/catalog before denormalizing (these calls may
    # also print debugging output)
    op_list = self.printOperations()
    col_list = self.printAllCollections()

    # Embed "koalas" into "apples" and run the denormalizer
    design.setDenormalizationParent("koalas", "apples")
    dn = Denormalizer(self.metadata_db, self.dataset_db, design)
    dn.process()

    new_op_list = self.printOperations()
    new_col_list = self.printAllCollections()

    # After embedding, "koalas" must disappear from both the operations
    # and the collection catalog
    self.assertTrue("koalas" not in new_op_list)
    self.assertTrue("koalas" not in new_col_list)
def testGuessIndex_IndexSizeEstimation(self):
    """
        Check if the size of the indexes vary
    """
    design = Design()
    design.addCollection("apple")
    design.addIndex("apple", ["field00"])
    design.addIndex("apple", ["field01"])
    design.addIndex("apple", ["field00", "field01"])

    # Expected index sizes, in op order:
    # op0 -> (field00), op1 -> (field01): single-field index
    # op2 -> (field01, field00), op3 -> (field00, field01): two-field index
    expected_sizes = [24 + 8, 24 + 8, 24 + 24 + 8, 24 + 24 + 8]
    for op, expected in zip(self.ops[:4], expected_sizes):
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(design, op)
        self.assertEqual(expected, index_size)
def testGuessIndex_indexChooseWithProjectionField(self):
    """
        If a query uses one of the indexes the design has but its projection uses
        one of the indexes the design has, we should choose the index with both
        query index and projection index
    """
    # Given indexes (field00, field02) and (field00), a query on field00
    # that projects field02 should prefer the compound index, and that
    # index should cover the query.
    design = Design()
    design.addCollection("apple")
    design.addIndex("apple", ["field00", "field02"])
    design.addIndex("apple", ["field00"])

    chosen, covering, index_size, slot_size = self.cm.guess_op_info(design, self.ops[0])

    self.assertEqual(len(chosen), 2)
    self.assertEqual(chosen[0], "field00")
    self.assertEqual(chosen[1], "field02")
    self.assertTrue(covering)
def testEstimateNodesNullValue(self):
    """Check the estimating touched nodes when the sharding key value is null"""
    # Shard every collection on a key ("XXXX") that no operation carries
    d = Design()
    for i in xrange(0, len(COLLECTION_NAMES)):
        col_info = self.collections[COLLECTION_NAMES[i]]
        d.addCollection(col_info["name"])
        # This key won't be in the operation's fields, but we should still
        # be able to get back a value
        d.addShardKey(col_info["name"], ["XXXX"])
    ## FOR

    # A query that looks up on a non-sharding key should always be
    # broadcast to every node
    sess = self.metadata_db.Session.fetch_one()
    op = sess["operations"][0]
    touched0 = list(self.estimator.estimateNodes(d, op))
    # print "touched0:", touched0
    self.assertListEqual(range(NUM_NODES), touched0)

    # But if we insert into that collection with a document that doesn't
    # have the sharding key, it should only go to one node
    op["type"] = constants.OP_TYPE_INSERT
    op["query_content"] = op["resp_content"]
    op["predicates"] = []
    # pprint(op)
    touched1 = list(self.estimator.estimateNodes(d, op))
    # print "touched1:", touched1
    self.assertEqual(1, len(touched1))

    # And if we insert another one, then we should get the same value back
    op = Session.operationFactory()
    op["collection"] = COLLECTION_NAMES[0]
    op["type"] = constants.OP_TYPE_INSERT
    op["query_id"] = 10000
    op["query_content"] = [{"parkinglot": 1234}]
    op["resp_content"] = [{"ok": 1}]
    op["resp_id"] = 10001
    # pprint(op)
    touched2 = list(self.estimator.estimateNodes(d, op))
    # NOTE(review): inserts missing the shard key appear to map to one
    # deterministic node — both inserts must land on the same node
    self.assertEqual(1, len(touched2))
    self.assertListEqual(touched1, touched2)
def testEstimateNodesEquality(self):
    """Check the estimating touched nodes for a equality predicate op"""
    design = Design()
    for col_name in COLLECTION_NAMES:
        col_info = self.collections[col_name]
        design.addCollection(col_info["name"])
        # Shard on just the first interesting field; compound sharding
        # keys are exercised elsewhere.
        design.addShardKey(col_info["name"], col_info["interesting"][:1])
    ## FOR

    sess = self.metadata_db.Session.fetch_one()
    op = sess["operations"][0]
    # Estimating the very same op twice must return identical node ids
    touched0 = list(self.estimator.estimateNodes(design, op))
    touched1 = list(self.estimator.estimateNodes(design, op))
    self.assertListEqual(touched0, touched1)
def setUp(self): # Create a fake Collection catalog entry # WORKLOAD self.col_info = catalog.Collection() self.col_info['name'] = COLLECTION_NAME self.col_info['doc_count'] = NUM_DOCUMENTS self.col_info['workload_queries'] = 1000 self.col_info['workload_percent'] = 1.0 for f in xrange(NUM_FIELDS + 1): # We always need the _id field if not f: f_name = "_id" f_type = catalog.fieldTypeToString(int) f_size = catalog.getEstimatedSize(f_type, 10000) else: f_name = "field%02d" % f if f % 2 == 0: f_type = catalog.fieldTypeToString(long) f_size = catalog.getEstimatedSize(f_type, 10000000l) else: f_type = catalog.fieldTypeToString(str) f_size = 128 f = catalog.Collection.fieldFactory(f_name, f_type) f['avg_size'] = f_size f['query_use_count'] = self.col_info['workload_queries'] self.col_info['fields'][f_name] = f self.col_info['interesting'].append(f_name) self.col_info['avg_doc_size'] += f_size ## FOR (field) self.design = Design() self.design.addCollection(self.col_info['name']) self.design.addIndex(self.col_info['name'], ["_id"]) self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3]) self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE) self.buffer.initialize(self.design)
def testQueriesCombination(self): """Test if the total number of queries are reduced""" original_number_of_queries = 0 for sess in self.workload: for op in sess["operations"]: original_number_of_queries += 1 print "orignal number of queries: " + str(original_number_of_queries) # Initialize a combiner combiner = WorkloadCombiner(self.col_names, self.workload) # initialize a design with denormalization d = Design() for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)): col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]] d.addCollection(col_info['name']) d.setDenormalizationParent("koalas", "apples") combinedWorkload = combiner.process(d) number_of_queries_from_combined_workload = 0 for sess in combinedWorkload: for op in sess["operations"]: number_of_queries_from_combined_workload += 1 print "number of queries after query combination: " + str( number_of_queries_from_combined_workload) self.assertGreater(original_number_of_queries, number_of_queries_from_combined_workload)
def testGuessIndex_indexChooseWithoutProjectionField(self):
    """
        If a query uses all the indexes but doesn't have a projection field,
        we still think it is not a covering index
    """
    # Design with a single compound index (field00, field01): op3 queries
    # both fields but has no projection, so the chosen index must not be
    # reported as covering.
    design = Design()
    design.addCollection("apple")
    design.addIndex("apple", ["field00", "field01"])

    best_index, covering, index_size, slot_size = self.cm.guess_op_info(design, self.ops[3])

    self.assertEqual(best_index[0], "field00")
    self.assertEqual(best_index[1], "field01")
    self.assertFalse(covering)
def testGuessIndex_IndexSizeEstimation(self):
    """
        Check if the size of the indexes vary
    """
    design = Design()
    design.addCollection("apple")
    design.addIndex("apple", ["field00"])
    design.addIndex("apple", ["field01"])
    design.addIndex("apple", ["field00", "field01"])

    # Expected index sizes paired with the op that produces them:
    # ops 0/1 hit a single-field index, ops 2/3 hit the two-field index
    cases = [
        (self.ops[0], 24 + 8),
        (self.ops[1], 24 + 8),
        (self.ops[2], 24 + 24 + 8),
        (self.ops[3], 24 + 24 + 8),
    ]
    for op, expected in cases:
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(design, op)
        self.assertEqual(expected, index_size)
def testGuessIndex_consistentAnswer(self):
    """Check that guessIndex always returns the same answer for the same input"""
    # initialize design
    d = Design()
    d.addCollection("apple")
    d.addIndex("apple", ["field00", "field01"])
    d.addIndex("apple", ["field01", "field00"])
    d.addIndex("apple", ["field00"])
    d.addIndex("apple", ["field01"])

    for op_idx in xrange(len(self.ops) - 2):
        op = self.ops[op_idx]
        last_index, last_covering = (None, None)
        # Repeat the guess many times; each round must agree with the
        # previous one. The inner counter is named 'trial' so it no
        # longer shadows the outer loop variable (the original reused
        # 'i' for both loops).
        for trial in xrange(100):
            best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
            self.assertIsNotNone(best_index)
            self.assertIsNotNone(covering)
            if last_index is not None:
                self.assertEqual(last_index, best_index)
                self.assertEqual(last_covering, covering)
            last_index, last_covering = (best_index, covering)
def setUp(self): # Create a fake Collection catalog entry # WORKLOAD self.col_info = catalog.Collection() self.col_info['name'] = COLLECTION_NAME self.col_info['doc_count'] = NUM_DOCUMENTS self.col_info['workload_queries'] = 1000 self.col_info['workload_percent'] = 1.0 for f in xrange(NUM_FIELDS+1): # We always need the _id field if not f: f_name = "_id" f_type = catalog.fieldTypeToString(int) f_size = catalog.getEstimatedSize(f_type, 10000) else: f_name = "field%02d" % f if f % 2 == 0: f_type = catalog.fieldTypeToString(long) f_size = catalog.getEstimatedSize(f_type, 10000000l) else: f_type = catalog.fieldTypeToString(str) f_size = 128 f = catalog.Collection.fieldFactory(f_name, f_type) f['avg_size'] = f_size f['query_use_count'] = self.col_info['workload_queries'] self.col_info['fields'][f_name] = f self.col_info['interesting'].append(f_name) self.col_info['avg_doc_size'] += f_size ## FOR (field) self.design = Design() self.design.addCollection(self.col_info['name']) self.design.addIndex(self.col_info['name'], ["_id"]) self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3]) self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE) self.buffer.initialize(self.design)
def testDiskCostCaching(self): """Check whether disk cost calculations work correctly with caching enabled""" self.cm.cache_enable = True # Give the mofo a full Design with indexes d = Design() col_info = self.collections[CostModelTestCase.COLLECTION_NAME] d.addCollection(col_info['name']) d.addIndex(col_info['name'], col_info['interesting']) ## FOR cost0 = self.cm.getCost(d) print "diskCost0:", cost0 # FIXME self.assertGreater(cost0, 0.0) # We should get the same cost back after we execute it a second time cost1 = self.cm.getCost(d) print "diskCost1:", cost1
def testSkewCost(self): """Check whether skew cost calculations work correctly""" col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]] shard_key = col_info['interesting'][0] d = Design() d.addCollection(col_info['name']) d.addShardKey(col_info['name'], [shard_key]) # First get the skew cost when the queries got each node uniformly # This is the best-case scenario op_ctr = 0 for sess in self.workload: for op in sess['operations']: query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\ {shard_key: op_ctr % CostModelTestCase.NUM_NODES }\ } ] op['collection'] = col_info['name'] op['query_content'] = query_content op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY } op_ctr += 1 ## FOR (op) ## FOR (session) cost0 = self.cm.getCost(d) self.assertLessEqual(cost0, 1.0) # print "skewCost0:", cost0 # Then make all of the operations go to a single node # This is the worst-case scenario query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\ {shard_key: 1000l }\ } ] for sess in self.workload: for op in sess['operations']: op['collection'] = col_info['name'] op['query_content'] = query_content op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY } ## FOR self.state.reset() self.cm.reset() cost1 = self.cm.getCost(d) self.assertLessEqual(cost1, 1.0) # print "skewCost1:", cost1 self.assertGreater(cost1, cost0)
def testGuessIndex_consistentAnswer(self):
    """Check that guessIndex always returns the same answer for the same input"""
    # initialize design
    d = Design()
    d.addCollection("apple")
    d.addIndex("apple", ["field00", "field01"])
    d.addIndex("apple", ["field01", "field00"])
    d.addIndex("apple", ["field00"])
    d.addIndex("apple", ["field01"])

    for op_idx in xrange(len(self.ops) - 2):
        op = self.ops[op_idx]
        last_index, last_covering = (None, None)
        # Repeat the guess many times; each round must agree with the
        # previous one. The inner counter is named 'trial' so it no
        # longer shadows the outer loop variable (the original reused
        # 'i' for both loops).
        for trial in xrange(100):
            best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
            self.assertIsNotNone(best_index)
            self.assertIsNotNone(covering)
            if last_index is not None:
                self.assertEqual(last_index, best_index)
                self.assertEqual(last_covering, covering)
            last_index, last_covering = (best_index, covering)
def testEstimateNodesNullValue(self):
    """Check the estimating touched nodes when the sharding key value is null"""
    # Shard every collection on a key ('XXXX') that no operation carries
    d = Design()
    for i in xrange(0, len(COLLECTION_NAMES)):
        col_info = self.collections[COLLECTION_NAMES[i]]
        d.addCollection(col_info['name'])
        # This key won't be in the operation's fields, but we should still
        # be able to get back a value
        d.addShardKey(col_info['name'], ['XXXX'])
    ## FOR

    # A query that looks up on a non-sharding key should always be
    # broadcast to every node
    sess = self.metadata_db.Session.fetch_one()
    op = sess['operations'][0]
    touched0 = list(self.estimator.estimateNodes(d, op))
    # print "touched0:", touched0
    self.assertListEqual(range(NUM_NODES), touched0)

    # But if we insert into that collection with a document that doesn't
    # have the sharding key, it should only go to one node
    op['type'] = constants.OP_TYPE_INSERT
    op['query_content'] = op['resp_content']
    op['predicates'] = []
    # pprint(op)
    touched1 = list(self.estimator.estimateNodes(d, op))
    # print "touched1:", touched1
    self.assertEqual(1, len(touched1))

    # And if we insert another one, then we should get the same value back
    op = Session.operationFactory()
    op['collection'] = COLLECTION_NAMES[0]
    op['type'] = constants.OP_TYPE_INSERT
    op['query_id'] = 10000
    op['query_content'] = [{"parkinglot": 1234}]
    op['resp_content'] = [{"ok": 1}]
    op['resp_id'] = 10001
    # pprint(op)
    touched2 = list(self.estimator.estimateNodes(d, op))
    # NOTE(review): inserts missing the shard key appear to map to one
    # deterministic node — both inserts must land on the same node
    self.assertEqual(1, len(touched2))
    self.assertListEqual(touched1, touched2)
def testEstimateNodesEquality(self):
    """Check the estimating touched nodes for a equality predicate op"""
    design = Design()
    for col_name in COLLECTION_NAMES:
        col_info = self.collections[col_name]
        design.addCollection(col_info['name'])
        # Shard on just the first interesting field; compound sharding
        # keys are exercised elsewhere.
        design.addShardKey(col_info['name'], col_info['interesting'][:1])
    ## FOR

    sess = self.metadata_db.Session.fetch_one()
    op = sess['operations'][0]
    # Estimating the very same op twice must return identical node ids
    touched0 = list(self.estimator.estimateNodes(design, op))
    touched1 = list(self.estimator.estimateNodes(design, op))
    self.assertListEqual(touched0, touched1)
def testEstimateNodesRange(self): """Check the estimating touched nodes for a range predicate op""" col_info = self.collections[COLLECTION_NAMES[0]] shard_key = col_info['interesting'][0] col_info['fields'][shard_key]['selectivity'] = 0.5 d = Design() d.addCollection(col_info['name']) d.addShardKey(col_info['name'], [shard_key]) sess = self.metadata_db.Session.fetch_one() op = sess['operations'][0] op['query_content'] = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query": \ {shard_key: {constants.REPLACE_KEY_DOLLAR_PREFIX+"gt": 10000l} } \ } ] op['predicates'] = {shard_key: constants.PRED_TYPE_RANGE} # The list estimated touched nodes should contain more than one entry touched0 = list(self.estimator.estimateNodes(d, op)) print "touched0:", touched0 self.assertGreater(len(touched0), 1)
def testGuessIndex_indexChooseWithoutProjectionField(self):
    """
        If a query uses all the indexes but doesn't have a projection field,
        we still think it is not a covering index
    """
    # Design holding only the compound index (field00, field01); op3
    # queries both fields with no projection, so the match cannot be a
    # covering index.
    design = Design()
    design.addCollection("apple")
    design.addIndex("apple", ["field00", "field01"])

    op = self.ops[3]
    chosen, covering, index_size, slot_size = self.cm.guess_op_info(design, op)

    self.assertEqual(chosen[0], "field00")
    self.assertEqual(chosen[1], "field01")
    self.assertFalse(covering)
def testDiskCostChangesAfterQueryCombination(self):
    """
        Assume we have collection A, B, C and we want to embed C to A
        If we build index on field00 of A and field02 of C
        The cost after query combination should be lower
    """
    # Baseline design: every collection indexed on (field00, field02),
    # no denormalization
    d0 = Design()
    for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
        d0.addCollection(col_info['name'])
        d0.addIndex(col_info['name'], ['field00', 'field02'])
    cost0 = self.cm.getCost(d0)
    print "cost0 " + str(cost0)

    # Initialize a combiner
    combiner = WorkloadCombiner(self.col_names, self.workload)

    # initialize a design with denormalization; caches must be
    # invalidated per collection before re-costing
    d1 = Design()
    for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
        d1.addCollection(col_info['name'])
        d1.addIndex(col_info['name'], ['field00', 'field02'])
        self.state.invalidateCache(col_info['name'])
    d1.setDenormalizationParent("koalas", "apples")

    # Swap in the combined workload, then reset both the cost model and
    # its state so the next getCost() starts clean
    combinedWorkload = combiner.process(d1)
    self.state.updateWorkload(combinedWorkload)
    self.cm.reset()
    self.cm.state.reset()
    cost1 = self.cm.getCost(d1)
    print "cost1 " + str(cost1)
    self.assertGreater(cost0, cost1)

    # Cost should remain the same after restoring the original workload
    self.state.restoreOriginalWorkload()
    self.cm.reset()
    print "child collection ", self.cm.child_collections
    self.cm.state.reset()
    cost2 = self.cm.getCost(d0)
    print "cost2 " + str(cost2)
    self.assertEqual(cost2, cost0)
def testGuessIndex_indexInIncorrectOrder(self):
    """
        Design with index (field01, field00)
        1. query uses index (field00)
        result: not using index because that query uses indexes in order
        2. query uses index (field01)
        result: using index (field01, field00) because this is the best match
        3. query uses index (field01, field00)
        result: using index (field01, field00) because they match the best

        Design with index (field00, field01)
        4. query uses index (field01, field00)
        result: using no index because the index order is not correct

        Design with index (field01, field02, field00)
        5. query uses index (field01, field00)
        result: using index (field01, field02, field00) because they match the best
        result: not cover index because the index order in design is not correct
    """
    # initialize design
    d = Design()
    d.addCollection("apple")
    d.addIndex("apple", ["field01", "field00"])

    # query 1: get query, queries on field00
    # field00 is not a prefix of (field01, field00), so no index matches
    op = self.ops[0]
    # Guess index
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
    self.assertEqual(best_index, None)
    self.assertFalse(covering)

    # query 2: get query, queries on field01
    # field01 is the prefix of (field01, field00), so that index is chosen
    op = self.ops[1]
    # Guess index
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
    self.assertEqual(len(best_index), 2)
    self.assertEqual(best_index[0], "field01")
    self.assertEqual(best_index[1], "field00")
    self.assertFalse(covering)

    # query 3: get query, queries on field01 and field00
    op = self.ops[2]
    # Guess index
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
    self.assertEqual(len(best_index), 2)
    self.assertEqual(best_index[0], "field01")
    self.assertEqual(best_index[1], "field00")
    self.assertFalse(covering)

    # query 4: same op against a design whose index is in the other order
    d = Design()
    d.addCollection("apple")
    d.addIndex("apple", ["field00", "field01"])
    op = self.ops[2]
    # Guess index
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
    self.assertEqual(len(best_index), 2)
    self.assertFalse(covering)

    # query 5: a wider index (field01, field02, field00) is still the
    # best match but cannot cover the query
    d = Design()
    d.addCollection("apple")
    d.addIndex("apple", ["field01", "field02", "field00"])
    op = self.ops[2]
    # Guess index
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
    self.assertEqual(len(best_index), 3)
    self.assertEqual(best_index[0], "field01")
    self.assertEqual(best_index[1], "field02")
    self.assertEqual(best_index[2], "field00")
    self.assertFalse(covering)
def testNetworkCostDenormalization(self): """Check network cost for queries that reference denormalized collections""" # Get the "base" design cost when all of the collections # are sharded on their "interesting" fields d = Design() i = 0 for col_info in self.collections.itervalues(): d.addCollection(col_info['name']) if i == 0: d.addShardKey(col_info['name'], col_info['interesting']) else: d.addShardKey(col_info['name'], ["_id"]) self.cm.invalidateCache(d, col_info['name']) i += 1 ## FOR self.cm.reset() self.state.reset() cost0 = self.cm.getCost(d) print "cost0:", cost0 # Now get the network cost for when we denormalize the # second collection inside of the first one # We should have a lower cost because there should now be fewer queries d = Design() i = 0 for col_info in self.collections.itervalues(): self.assertTrue(col_info['interesting']) d.addCollection(col_info['name']) if i == 0: d.addShardKey(col_info['name'], col_info['interesting']) else: d.addShardKey(col_info['name'], ["_id"]) self.cm.invalidateCache(d, col_info['name']) i += 1 d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS) combiner = WorkloadCombiner(self.collections, self.workload) combinedWorkload = combiner.process(d) self.state.updateWorkload(combinedWorkload) self.cm.reset() self.state.reset() cost1 = self.cm.getCost(d) print "cost1:", cost1 self.assertLess(cost1, cost0)
class TestInitialDesigner(TPCCTestCase):
    """Tests for InitialDesigner's shard-key and index-key selection over a TPC-C workload."""

    def setUp(self):
        TPCCTestCase.setUp(self)
        self.config = configutil.makeDefaultConfig()
        self.designer = InitialDesigner(self.collections, self.workload, self.config)
        # Histogram of candidate keys per collection, used to bias selection in tests
        self.col_keys = self.designer.generateCollectionHistograms()
        self.design = Design()
        map(self.design.addCollection, self.col_keys.iterkeys())
    ## DEF

    def testCheckForInvalidKeys(self):
        """Generated designs must never contain keys with the internal '$'-replacement prefix."""
        d = self.designer.generate()
        self.assertIsNotNone(d)
        # Make sure that we don't have any invalid keys
        for col_name in d.getCollections():
            for index_keys in d.getIndexes(col_name):
                for key in index_keys:
                    assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
                        "Invalid index key '%s.%s'" % (col_name, key)
            ## FOR
            for key in d.getShardKeys(col_name):
                assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
                    "Invalid shard key '%s.%s'" % (col_name, key)
        ## FOR
    ## DEF

    def testSelectShardingKeys(self):
        """Inflating a random key's histogram count must make it the chosen shard key."""
        # Select on set of keys at random and increase its occurence
        # in the histogram so that we will pick it
        expected = { }
        for col_name, h in self.col_keys.iteritems():
            keys = random.choice(h.keys())
            h.put(keys, 999999)  # overwhelm every other candidate
            expected[col_name] = keys
        self.designer.__selectShardingKeys__(self.design, self.col_keys)
        # Then check to make sure it picked what we expected it to
        for col_name in self.col_keys.iterkeys():
            shard_keys = self.design.getShardKeys(col_name)
            self.assertIsNotNone(shard_keys)
            self.assertIsInstance(shard_keys, tuple)
            self.assertEquals(expected[col_name], shard_keys)
        #print self.design
    ## DEF

    def testSelectIndexKeys(self):
        """Inflating a random key's histogram count should bias index-key selection."""
        # Select on set of keys at random and increase its occurence
        # in the histogram so that we will pick it
        expected = { }
        for col_name, h in self.col_keys.iteritems():
            keys = random.choice(h.keys())
            h.put(keys, 999999)
            expected[col_name] = keys
        node_memory = self.config.get(configutil.SECT_CLUSTER, "node_memory")
        self.designer.__selectIndexKeys__(self.design, self.col_keys, node_memory)
        #print self.design
        # Then check to make sure it picked what we expected it to
        # NOTE(review): only the type of the result is verified here, not that
        # the inflated key was actually chosen -- confirm whether that is intended.
        for col_name in self.col_keys.iterkeys():
            index_keys = self.design.getIndexKeys(col_name)
            self.assertIsNotNone(index_keys)
            self.assertIsInstance(index_keys, list)
def testGuessIndex_IndexSizeEstimation_Denormalization(self): """ If collection A is denormalized into B, then the index for collection B should have larger size now (If and only if the index is built on a field that is included by both collection A and collection B) """ d = Design() d.addCollection("apple") d.addCollection("microsoft") d.addCollection("google") d.addIndex("apple", ["field00"]) d.addIndex("microsoft", ["field00"]) d.addIndex("google", ["field00"]) # op4 use index (field00) but it only goes to collection microsoft op4 = self.ops[4] # Guess index # Without denormalization best_index, covering, index_size_0, slot_size = self.cm.guess_op_info(d, op4) # With one denormalization d.setDenormalizationParent("apple", "microsoft") self.cm.buildEmbeddingCostDictionary(d) best_index, covering, index_size_1, slot_size = self.cm.guess_op_info(d, op4) self.assertGreater(index_size_1, index_size_0) # With chained denormalization self.cm.reset() d.setDenormalizationParent("google", "apple") self.cm.buildEmbeddingCostDictionary(d) best_index, covering, index_size_2, slot_size = self.cm.guess_op_info(d, op4) self.assertGreater(index_size_2, index_size_1)
def getManMadeDesign(self, denorm=True): # create a best design mannually d = Design() d.addCollection(tpccConstants.TABLENAME_ITEM) d.addCollection(tpccConstants.TABLENAME_WAREHOUSE) d.addCollection(tpccConstants.TABLENAME_DISTRICT) d.addCollection(tpccConstants.TABLENAME_CUSTOMER) d.addCollection(tpccConstants.TABLENAME_STOCK) d.addCollection(tpccConstants.TABLENAME_ORDERS) d.addCollection(tpccConstants.TABLENAME_NEW_ORDER) d.addCollection(tpccConstants.TABLENAME_ORDER_LINE) d.addIndex(tpccConstants.TABLENAME_ITEM, ["I_ID"]) d.addIndex(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"]) d.addIndex(tpccConstants.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"]) d.addIndex(tpccConstants.TABLENAME_CUSTOMER, ["C_W_ID", "C_D_ID", "C_ID"]) d.addIndex(tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_C_ID"]) d.addIndex(tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_ID"]) d.addIndex(tpccConstants.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"]) d.addIndex(tpccConstants.TABLENAME_NEW_ORDER, ["NO_W_ID", "NO_D_ID", "NO_O_ID"]) d.addIndex(tpccConstants.TABLENAME_ORDER_LINE, ["OL_W_ID", "OL_D_ID", "OL_O_ID"]) d.addShardKey(tpccConstants.TABLENAME_ITEM, ["I_ID"]) d.addShardKey(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_DISTRICT, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_CUSTOMER, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_ORDERS, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_STOCK, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_NEW_ORDER, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_ORDER_LINE, ["W_ID"]) return d
class TestNodeEstimator(unittest.TestCase):
    """Tests for the LRUBuffer used by the node estimator."""

    def setUp(self):
        # Create a fake Collection catalog entry
        # WORKLOAD
        self.col_info = catalog.Collection()
        self.col_info['name'] = COLLECTION_NAME
        self.col_info['doc_count'] = NUM_DOCUMENTS
        self.col_info['workload_queries'] = 1000
        self.col_info['workload_percent'] = 1.0
        for f in xrange(NUM_FIELDS + 1):
            # We always need the _id field
            if not f:
                f_name = "_id"
                f_type = catalog.fieldTypeToString(int)
                f_size = catalog.getEstimatedSize(f_type, 10000)
            else:
                f_name = "field%02d" % f
                if f % 2 == 0:
                    f_type = catalog.fieldTypeToString(long)
                    f_size = catalog.getEstimatedSize(f_type, 10000000l)
                else:
                    f_type = catalog.fieldTypeToString(str)
                    f_size = 128
            # NOTE: 'f' is rebound here from the loop counter to the field object
            f = catalog.Collection.fieldFactory(f_name, f_type)
            f['avg_size'] = f_size
            f['query_use_count'] = self.col_info['workload_queries']
            self.col_info['fields'][f_name] = f
            self.col_info['interesting'].append(f_name)
            self.col_info['avg_doc_size'] += f_size
        ## FOR (field)

        # Simple design: the collection plus an _id index and one compound index
        self.design = Design()
        self.design.addCollection(self.col_info['name'])
        self.design.addIndex(self.col_info['name'], ["_id"])
        self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3])

        self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE)
        self.buffer.initialize(self.design)
    ## DEF

    def testInitialize(self):
        """Check whether we can initialize the buffer properly for a design"""
        col_name = self.col_info['name']
        self.assertIsNotNone(self.buffer.collection_sizes[col_name])
        # One size entry per index declared in the design
        self.assertEqual(len(self.design.getIndexes(col_name)),
                         len(self.buffer.index_sizes[col_name]))
        for indexKeys in self.design.getIndexes(col_name):
            self.assertIsNotNone(self.buffer.index_sizes[col_name][indexKeys])
        self.buffer.validate()
    ## DEF

    def testInitializePreloading(self):
        """Check whether preloading the buffer works properly"""
        num_collections = 5
        collections = dict()
        self.design = Design()
        for i in xrange(num_collections):
            col_name = "col%02d" % i
            col_info = catalog.Collection()
            col_info['name'] = col_name
            col_info['doc_count'] = NUM_DOCUMENTS
            col_info['workload_percent'] = 1 / float(num_collections)
            col_info['avg_doc_size'] = 1024
            collections[col_name] = col_info
            self.design.addCollection(col_name)
        ## FOR
        self.buffer = LRUBuffer(collections, BUFFER_SIZE, preload=True)
        try:
            self.buffer.initialize(self.design)
            self.buffer.validate()
        except:
            # Dump the buffer state for debugging, then re-raise the failure
            print self.buffer
            raise
    ## DEF

    def testReset(self):
        """Check whether the LRUBuffer will reset its internal state properly"""
        self.buffer.reset()
        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)
    ## DEF

    def testComputeTupleHash(self):
        """Round-trip check: the size encoded in a buffer tuple must be recoverable."""
        num_entries = 10000
        rng = random.Random()
        # Deterministic seed so failures are reproducible
        rng.seed(self.__init__.im_class)
        for i in xrange(num_entries):
            # Construct a tuple and make sure that the size that we get out
            # of it is the size that we put into it
            typeId = rng.choice([LRUBuffer.DOC_TYPE_COLLECTION, LRUBuffer.DOC_TYPE_INDEX])
            key = rng.random()
            size = rng.randint(1, 8) * 1024
            documentId = rng.random()
            buffer_tuple = self.buffer.__computeTupleHash__(typeId, key, size, documentId)
            self.assertIsNotNone(buffer_tuple)
            extracted = self.buffer.__getTupleSize__(buffer_tuple)
            self.assertEqual(size, extracted, pformat(locals()))
            # "BufferTuple: %d / ExpectedSize: %d" % (buffer_tuple, size))
        ## FOR
    ## DEF

    def testGetDocumentFromCollection(self):
        """Check whether the LRUBuffer updates internal buffer for new collection documents"""
        documentId = 0
        pageHits = 0
        # Fill the buffer one document at a time until it is (nearly) full
        while self.buffer.remaining > self.col_info['avg_doc_size']:
            pageHits += self.buffer.getDocumentFromCollection(self.col_info['name'], documentId)
            before = self.buffer.remaining
            # If we insert the same document, we should not get any pageHits and our
            # remaining memory should be the same
            _pageHits = self.buffer.getDocumentFromCollection(self.col_info['name'], documentId)
            self.assertEqual(0, _pageHits)
            self.assertEqual(before, self.buffer.remaining)
            documentId += 1
            self.buffer.validate()
        ## WHILE

        # We should only have one pageHit per document
        self.assertEqual(documentId, pageHits)

        # Make sure that the buffer is in the right order as we evict records
        lastDocId = None
        while len(self.buffer.buffer) > 0:
            evicted = self.buffer.evictNext(self.col_info['name'])
            self.assertIsNotNone(evicted)
            self.buffer.validate()
            # We can't check this anymore because it's faster for us
            # if we just store the hash of the tuple instead of the
            # actualy tuple values
            # if lastDocId: self.assertLess(lastDocId, docId)
            # lastDocId = docId
        ## WHILE
        # Evicting everything must return the buffer to its full capacity
        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)
    ## DEF

    def testGetDocumentFromIndex(self):
        """Check whether the LRUBuffer updates internal buffer for new index documents"""
        # Roll through each index and add a bunch of documents. Note that the documents
        # will have the same documentId, but they should be represented as separated objects
        # in the internal buffer (because they are for different indexes)
        documentId = 0
        pageHits = 0
        while not self.buffer.evicted:
            for indexKeys in self.design.getIndexes(COLLECTION_NAME):
                pageHits += self.buffer.getDocumentFromIndex(self.col_info['name'], indexKeys, documentId)
                before = self.buffer.remaining
                # If we insert the same document, we should not get any pageHits
                _pageHits = self.buffer.getDocumentFromIndex(self.col_info['name'], indexKeys, documentId)
                self.assertEqual(0, _pageHits)
                self.assertEqual(before, self.buffer.remaining)
                if self.buffer.evicted: break
            documentId += 1
            self.buffer.validate()
        ## WHILE

        # Make sure that we get back two entries for each documentId (except for one)
        lastDocId = None
        # docIds_h = Histogram()
        while len(self.buffer.buffer) > 0:
            # typeId, key, docId = self.buffer.evictNext(COLLECTION_NAME)
            evicted = self.buffer.evictNext(COLLECTION_NAME)
            self.assertIsNotNone(evicted)
            self.buffer.validate()
            # self.assertIsNotNone(typeId)
            # self.assertIsNotNone(key)
            # self.assertIsNotNone(docId)
            # docIds_h.put(docId)
        ## WHILE
        # foundSingleDocId = False
        # for documentId,cnt in docIds_h.iteritems():
        #     if cnt == 1:
        #         self.assertFalse(foundSingleDocId)
        #         foundSingleDocId = True
        #     else:
        #         self.assertEqual(2, cnt)
        # ## FOR
        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)
def testDiskCostOnDifferentIndexes(self): """Check how indexes will affect the disk cost""" # 1. Put index on both of the fields seperately d = Design() d.addCollection(CostModelTestCase.COLLECTION_NAME) d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field00"]) d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field01"]) self.cm.reset() self.cm.state.reset() cost0 = self.cm.getCost(d) print "diskCost0:", cost0 # 3. Put indexes on both field together d = Design() col_info = self.collections[CostModelTestCase.COLLECTION_NAME] d.addCollection(CostModelTestCase.COLLECTION_NAME) d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field01", "field00"]) self.state.invalidateCache(col_info['name']) self.cm.reset() self.cm.state.reset() cost1 = self.cm.getCost(d) print "diskCost1:", cost1 self.assertGreater(cost0, cost1)
def testGuessIndex_IndexSizeEstimation_Denormalization(self): """ If collection A is denormalized into B, then the index for collection B should have larger size now (If and only if the index is built on a field that is included by both collection A and collection B) """ d = Design() d.addCollection("apple") d.addCollection("microsoft") d.addCollection("google") d.addIndex("apple", ["field00"]) d.addIndex("microsoft", ["field00"]) d.addIndex("google", ["field00"]) # op4 use index (field00) but it only goes to collection microsoft op4 = self.ops[4] # Guess index # Without denormalization best_index, covering, index_size_0, slot_size = self.cm.guess_op_info( d, op4) # With one denormalization d.setDenormalizationParent("apple", "microsoft") self.cm.buildEmbeddingCostDictionary(d) best_index, covering, index_size_1, slot_size = self.cm.guess_op_info( d, op4) self.assertGreater(index_size_1, index_size_0) # With chained denormalization self.cm.reset() d.setDenormalizationParent("google", "apple") self.cm.buildEmbeddingCostDictionary(d) best_index, covering, index_size_2, slot_size = self.cm.guess_op_info( d, op4) self.assertGreater(index_size_2, index_size_1)
class TestNodeEstimator(unittest.TestCase):
    """LRUBuffer behavior tests: initialization, preloading, reset, hashing, and eviction."""

    def setUp(self):
        # Create a fake Collection catalog entry
        # WORKLOAD
        self.col_info = catalog.Collection()
        self.col_info['name'] = COLLECTION_NAME
        self.col_info['doc_count'] = NUM_DOCUMENTS
        self.col_info['workload_queries'] = 1000
        self.col_info['workload_percent'] = 1.0
        for f in xrange(NUM_FIELDS+1):
            # We always need the _id field
            if not f:
                f_name = "_id"
                f_type = catalog.fieldTypeToString(int)
                f_size = catalog.getEstimatedSize(f_type, 10000)
            else:
                f_name = "field%02d" % f
                if f % 2 == 0:
                    f_type = catalog.fieldTypeToString(long)
                    f_size = catalog.getEstimatedSize(f_type, 10000000l)
                else:
                    f_type = catalog.fieldTypeToString(str)
                    f_size = 128
            # 'f' is deliberately rebound from the counter to the field object here
            f = catalog.Collection.fieldFactory(f_name, f_type)
            f['avg_size'] = f_size
            f['query_use_count'] = self.col_info['workload_queries']
            self.col_info['fields'][f_name] = f
            self.col_info['interesting'].append(f_name)
            self.col_info['avg_doc_size'] += f_size
        ## FOR (field)

        # A design with the collection, an _id index, and one compound index
        self.design = Design()
        self.design.addCollection(self.col_info['name'])
        self.design.addIndex(self.col_info['name'], ["_id"])
        self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3])

        self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE)
        self.buffer.initialize(self.design)
    ## DEF

    def testInitialize(self):
        """Check whether we can initialize the buffer properly for a design"""
        col_name = self.col_info['name']
        self.assertIsNotNone(self.buffer.collection_sizes[col_name])
        # The buffer must track a size for every index in the design
        self.assertEqual(len(self.design.getIndexes(col_name)),
                         len(self.buffer.index_sizes[col_name]))
        for indexKeys in self.design.getIndexes(col_name):
            self.assertIsNotNone(self.buffer.index_sizes[col_name][indexKeys])
        self.buffer.validate()
    ## DEF

    def testInitializePreloading(self):
        """Check whether preloading the buffer works properly"""
        num_collections = 5
        collections = dict()
        self.design = Design()
        for i in xrange(num_collections):
            col_name = "col%02d" % i
            col_info = catalog.Collection()
            col_info['name'] = col_name
            col_info['doc_count'] = NUM_DOCUMENTS
            col_info['workload_percent'] = 1 / float(num_collections)
            col_info['avg_doc_size'] = 1024
            collections[col_name] = col_info
            self.design.addCollection(col_name)
        ## FOR
        self.buffer = LRUBuffer(collections, BUFFER_SIZE, preload=True)
        try:
            self.buffer.initialize(self.design)
            self.buffer.validate()
        except:
            # Print the buffer state to help diagnose, then propagate
            print self.buffer
            raise
    ## DEF

    def testReset(self):
        """Check whether the LRUBuffer will reset its internal state properly"""
        self.buffer.reset()
        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)
    ## DEF

    def testComputeTupleHash(self):
        """The size packed into a buffer tuple must round-trip through the hash."""
        num_entries = 10000
        rng = random.Random()
        # Seed with a stable value so runs are reproducible
        rng.seed(self.__init__.im_class)
        for i in xrange(num_entries):
            # Construct a tuple and make sure that the size that we get out
            # of it is the size that we put into it
            typeId = rng.choice([LRUBuffer.DOC_TYPE_COLLECTION, LRUBuffer.DOC_TYPE_INDEX])
            key = rng.random()
            size = rng.randint(1, 8) * 1024
            documentId = rng.random()
            buffer_tuple = self.buffer.__computeTupleHash__(typeId, key, size, documentId)
            self.assertIsNotNone(buffer_tuple)
            extracted = self.buffer.__getTupleSize__(buffer_tuple)
            self.assertEqual(size, extracted, pformat(locals()))
            # "BufferTuple: %d / ExpectedSize: %d" % (buffer_tuple, size))
        ## FOR
    ## DEF

    def testGetDocumentFromCollection(self):
        """Check whether the LRUBuffer updates internal buffer for new collection documents"""
        documentId = 0
        pageHits = 0
        # Insert distinct documents until the buffer runs out of room
        while self.buffer.remaining > self.col_info['avg_doc_size']:
            pageHits += self.buffer.getDocumentFromCollection(self.col_info['name'], documentId)
            before = self.buffer.remaining
            # If we insert the same document, we should not get any pageHits and our
            # remaining memory should be the same
            _pageHits = self.buffer.getDocumentFromCollection(self.col_info['name'], documentId)
            self.assertEqual(0, _pageHits)
            self.assertEqual(before, self.buffer.remaining)
            documentId += 1
            self.buffer.validate()
        ## WHILE

        # We should only have one pageHit per document
        self.assertEqual(documentId, pageHits)

        # Make sure that the buffer is in the right order as we evict records
        lastDocId = None
        while len(self.buffer.buffer) > 0:
            evicted = self.buffer.evictNext(self.col_info['name'])
            self.assertIsNotNone(evicted)
            self.buffer.validate()
            # We can't check this anymore because it's faster for us
            # if we just store the hash of the tuple instead of the
            # actualy tuple values
            # if lastDocId: self.assertLess(lastDocId, docId)
            # lastDocId = docId
        ## WHILE
        # After full eviction the buffer must report full capacity again
        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)
    ## DEF

    def testGetDocumentFromIndex(self):
        """Check whether the LRUBuffer updates internal buffer for new index documents"""
        # Roll through each index and add a bunch of documents. Note that the documents
        # will have the same documentId, but they should be represented as separated objects
        # in the internal buffer (because they are for different indexes)
        documentId = 0
        pageHits = 0
        while not self.buffer.evicted:
            for indexKeys in self.design.getIndexes(COLLECTION_NAME):
                pageHits += self.buffer.getDocumentFromIndex(self.col_info['name'], indexKeys, documentId)
                before = self.buffer.remaining
                # If we insert the same document, we should not get any pageHits
                _pageHits = self.buffer.getDocumentFromIndex(self.col_info['name'], indexKeys, documentId)
                self.assertEqual(0, _pageHits)
                self.assertEqual(before, self.buffer.remaining)
                if self.buffer.evicted: break
            documentId += 1
            self.buffer.validate()
        ## WHILE

        # Make sure that we get back two entries for each documentId (except for one)
        lastDocId = None
        # docIds_h = Histogram()
        while len(self.buffer.buffer) > 0:
            # typeId, key, docId = self.buffer.evictNext(COLLECTION_NAME)
            evicted = self.buffer.evictNext(COLLECTION_NAME)
            self.assertIsNotNone(evicted)
            self.buffer.validate()
            # self.assertIsNotNone(typeId)
            # self.assertIsNotNone(key)
            # self.assertIsNotNone(docId)
            # docIds_h.put(docId)
        ## WHILE
        # foundSingleDocId = False
        # for documentId,cnt in docIds_h.iteritems():
        #     if cnt == 1:
        #         self.assertFalse(foundSingleDocId)
        #         foundSingleDocId = True
        #     else:
        #         self.assertEqual(2, cnt)
        # ## FOR
        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)
def testGuessIndex_indexChooseTheMostMatch(self): """ Design with index (field01, field00), (field01), 1. query uses index (field01) without projection field result: using index (field01) because they match the most 2. query used index (field01, field00) without projection field result: using index (field01, field00) because they match the most If we have a design building indexes on (field01) only 3. query uses index (field01, field00) without projection field result: using index (field01) because they match the most If we have a design building indexes on (field01, field03, field00), (field01) 4. query uses index (field01, field00) result: using index (field01) because field01 is shorter """ # initialize design d = Design() d.addCollection("apple") d.addIndex("apple", ["field01", "field00"]) d.addIndex("apple", ["field01"]) # query 1: get query op = self.ops[1] # Guess index best_index, covering, index_size, slot_size = self.cm.guess_op_info( d, op) self.assertEqual(len(best_index), 1) self.assertEqual(best_index[0], 'field01') self.assertFalse(covering) # query 2: get query op = self.ops[2] # Guess index best_index, covering, index_size, slot_size = self.cm.guess_op_info( d, op) self.assertEqual(len(best_index), 2) self.assertEqual(best_index[0], 'field01') self.assertEqual(best_index[1], 'field00') self.assertFalse(covering) ## query 3: d = Design() d.addCollection("apple") d.addIndex("apple", ["field01"]) op = self.ops[2] # Guess index best_index, covering, index_size, slot_size = self.cm.guess_op_info( d, op) self.assertEqual(best_index[0], 'field01') self.assertFalse(covering) # query 4: d = Design() d.addCollection("apple") d.addIndex("apple", ["field01", "field03", "field00"]) d.addIndex("apple", ["field01"]) op = self.ops[2] # Guess index best_index, covering, index_size, slot_size = self.cm.guess_op_info( d, op) self.assertEqual(len(best_index), 1) self.assertEqual(best_index[0], 'field01') self.assertFalse(covering)
def testNetworkCostDenormalization(self): """Check network cost for queries that reference denormalized collections""" # Get the "base" design cost when all of the collections # are sharded on their "interesting" fields d0 = Design() for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)): col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]] d0.addCollection(col_info['name']) if i == 0: d0.addShardKey(col_info['name'], col_info['interesting']) else: d0.addShardKey(col_info['name'], ["_id"]) self.cm.invalidateCache(d0, col_info['name']) ## FOR self.cm.reset() self.state.reset() cost0 = self.cm.getCost(d0) print "cost0:", cost0 # Now get the network cost for when we denormalize the # second collection inside of the first one # We should have a lower cost because there should now be fewer queries d1 = Design() for i in xrange(0, len(CostModelTestCase.COLLECTION_NAMES)): col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]] self.assertTrue(col_info['interesting']) d1.addCollection(col_info['name']) if i == 0: d1.addShardKey(col_info['name'], col_info['interesting']) else: parent = self.collections[CostModelTestCase.COLLECTION_NAMES[0]] self.assertIsNotNone(parent) d1.setDenormalizationParent(col_info['name'], parent['name']) self.assertTrue(d1.isDenormalized(col_info['name']), col_info['name']) self.assertIsNotNone(d1.getDenormalizationParent(col_info['name'])) self.cm.invalidateCache(d1, col_info['name']) combiner = WorkloadCombiner(self.collections, self.workload) combinedWorkload = combiner.process(d1) self.state.updateWorkload(combinedWorkload) self.cm.reset() self.state.reset() cost1 = self.cm.getCost(d1) print "cost1:", cost1 self.assertLess(cost1, cost0) # The denormalization cost should also be the same as the cost # when we remove all of the ops one the second collection backup_collections = copy.deepcopy(self.collections) for sess in self.state.workload: for op in sess["operations"]: if op["collection"] <> 
CostModelTestCase.COLLECTION_NAMES[0]: sess["operations"].remove(op) ## FOR (op) ## FOR (sess) for i in xrange(1, len(CostModelTestCase.COLLECTION_NAMES)): del self.collections[CostModelTestCase.COLLECTION_NAMES[i]] print "deleted name: ", CostModelTestCase.COLLECTION_NAMES[i] self.cm.reset() self.state.reset() cost2 = self.cm.getCost(d1) print "cost2:", cost2 self.assertEqual(cost1, cost2) # Restore the original workload and see if the cost remains the same with the original one self.state.restoreOriginalWorkload() self.state.collections = backup_collections self.cm.reset() self.state.reset() cost3 = self.cm.getCost(d0) print "cost3:", cost3 self.assertEqual(cost3, cost0)
def testNetworkCostShouldReduceAfterQueryCombination(self): """ Network cost should be reduce after embedding collections """ d0 = Design() for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)): col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]] d0.addCollection(col_info['name']) d0.addIndex(col_info['name'], ['field00', 'field02']) cost0 = self.cmn.getCost(d0) print "cost0 " + str(cost0) # Initialize a combiner combiner = WorkloadCombiner(self.col_names, self.workload) # initialize a design with denormalization d1 = Design() for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)): col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]] d1.addCollection(col_info['name']) d1.addIndex(col_info['name'], ['field00', 'field02']) self.state.invalidateCache(col_info['name']) d1.setDenormalizationParent("koalas", "apples") combinedWorkload = combiner.process(d1) self.state.updateWorkload(combinedWorkload) self.cmn.reset() self.cmn.state.reset() cost1 = self.cmn.getCost(d1) print "cost1 " + str(cost1) self.assertGreater(cost0, cost1) # Cost should remain the same after restoring the original workload self.state.restoreOriginalWorkload() self.cmn.reset() self.cmn.state.reset() cost2 = self.cmn.getCost(d0) print "cost2 " + str(cost2) self.assertEqual(cost2, cost0)
def testGuessIndex_indexInIncorrectOrder(self):
    """
    Design with index (field01, field00)
    1. query uses index (field00)
       result: not using index because that query uses indexes in order
    2. query uses index (field01)
       result: using index (field01, field00) because this is the best match
    3. query uses index (field01, field00)
       result: using index (field01, field00) because they match the best
    Design with index (field00, field01)
    4. query uses index (field01, field00)
       result: a 2-field index is returned but it is not covering
       (NOTE(review): the original text said "no index", which conflicts
       with the len == 2 assertion below -- confirm intended behavior)
    Design with index (field01, field02, field00)
    5. query uses index (field01, field00)
       result: using index (field01, field02, field00) because they match the best
       result: not cover index because the index order in design is not correct
    """
    # initialize design
    d = Design()
    d.addCollection("apple")
    d.addIndex("apple", ["field01", "field00"])

    # query 1: get query, queries on field00 (not an index prefix)
    op = self.ops[0]
    # Guess index
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
    self.assertEqual(best_index, None)
    self.assertFalse(covering)

    # query 2: get query, queries on field01 (index prefix)
    op = self.ops[1]
    # Guess index
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
    self.assertEqual(len(best_index), 2)
    self.assertEqual(best_index[0], "field01")
    self.assertEqual(best_index[1], "field00")
    self.assertFalse(covering)

    # query 3: get query, queries on field01 and field00 (exact match)
    op = self.ops[2]
    # Guess index
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
    self.assertEqual(len(best_index), 2)
    self.assertEqual(best_index[0], "field01")
    self.assertEqual(best_index[1], "field00")
    self.assertFalse(covering)

    # query 4: index fields declared in the reverse order
    d = Design()
    d.addCollection("apple")
    d.addIndex("apple", ["field00", "field01"])
    op = self.ops[2]
    # Guess index
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
    self.assertEqual(len(best_index), 2)
    self.assertFalse(covering)

    # query 5: wider 3-field index with an interleaved extra field
    d = Design()
    d.addCollection("apple")
    d.addIndex("apple", ["field01", "field02", "field00"])
    op = self.ops[2]
    # Guess index
    best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
    self.assertEqual(len(best_index), 3)
    self.assertEqual(best_index[0], "field01")
    self.assertEqual(best_index[1], "field02")
    self.assertEqual(best_index[2], "field00")
    self.assertFalse(covering)
class TestInitialDesigner(TPCCTestCase):
    """Tests that InitialDesigner produces valid keys and honors histogram weighting."""

    def setUp(self):
        TPCCTestCase.setUp(self)
        self.config = configutil.makeDefaultConfig()
        self.designer = InitialDesigner(self.collections, self.workload, self.config)
        # Per-collection histograms of candidate key tuples
        self.col_keys = self.designer.generateCollectionHistograms()
        self.design = Design()
        map(self.design.addCollection, self.col_keys.iterkeys())
    ## DEF

    def testCheckForInvalidKeys(self):
        """No generated index or shard key may carry the internal '$'-prefix placeholder."""
        d = self.designer.generate()
        self.assertIsNotNone(d)
        # Make sure that we don't have any invalid keys
        for col_name in d.getCollections():
            for index_keys in d.getIndexes(col_name):
                for key in index_keys:
                    assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
                        "Invalid index key '%s.%s'" % (col_name, key)
            ## FOR
            for key in d.getShardKeys(col_name):
                assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
                    "Invalid shard key '%s.%s'" % (col_name, key)
            ## FOR
    ## DEF

    def testSelectShardingKeys(self):
        """A key given an overwhelming histogram count must become the shard key."""
        # Select on set of keys at random and increase its occurence
        # in the histogram so that we will pick it
        expected = {}
        for col_name, h in self.col_keys.iteritems():
            keys = random.choice(h.keys())
            h.put(keys, 999999)  # dwarf all other candidates
            expected[col_name] = keys
        self.designer.__selectShardingKeys__(self.design, self.col_keys)
        # Then check to make sure it picked what we expected it to
        for col_name in self.col_keys.iterkeys():
            shard_keys = self.design.getShardKeys(col_name)
            self.assertIsNotNone(shard_keys)
            self.assertIsInstance(shard_keys, tuple)
            self.assertEquals(expected[col_name], shard_keys)
        #print self.design
    ## DEF

    def testSelectIndexKeys(self):
        """Index-key selection should run cleanly with a heavily skewed histogram."""
        # Select on set of keys at random and increase its occurence
        # in the histogram so that we will pick it
        expected = {}
        for col_name, h in self.col_keys.iteritems():
            keys = random.choice(h.keys())
            h.put(keys, 999999)
            expected[col_name] = keys
        node_memory = self.config.get(configutil.SECT_CLUSTER, "node_memory")
        self.designer.__selectIndexKeys__(self.design, self.col_keys, node_memory)
        #print self.design
        # Then check to make sure it picked what we expected it to
        # NOTE(review): only type/shape of the result is asserted, not the
        # actual chosen key -- confirm whether a stronger check is intended.
        for col_name in self.col_keys.iterkeys():
            index_keys = self.design.getIndexKeys(col_name)
            self.assertIsNotNone(index_keys)
            self.assertIsInstance(index_keys, list)
def testGuessIndex_indexChooseTheMostMatch(self): """ Design with index (field01, field00), (field01), 1. query uses index (field01) without projection field result: using index (field01) because they match the most 2. query used index (field01, field00) without projection field result: using index (field01, field00) because they match the most If we have a design building indexes on (field01) only 3. query uses index (field01, field00) without projection field result: using index (field01) because they match the most If we have a design building indexes on (field01, field03, field00), (field01) 4. query uses index (field01, field00) result: using index (field01) because field01 is shorter """ # initialize design d = Design() d.addCollection("apple") d.addIndex("apple", ["field01", "field00"]) d.addIndex("apple", ["field01"]) # query 1: get query op = self.ops[1] # Guess index best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op) self.assertEqual(len(best_index), 1) self.assertEqual(best_index[0], 'field01') self.assertFalse(covering) # query 2: get query op = self.ops[2] # Guess index best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op) self.assertEqual(len(best_index), 2) self.assertEqual(best_index[0], 'field01') self.assertEqual(best_index[1], 'field00') self.assertFalse(covering) ## query 3: d = Design() d.addCollection("apple") d.addIndex("apple", ["field01"]) op = self.ops[2] # Guess index best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op) self.assertEqual(best_index[0], 'field01') self.assertFalse(covering) # query 4: d = Design() d.addCollection("apple") d.addIndex("apple", ["field01", "field03", "field00"]) d.addIndex("apple", ["field01"]) op = self.ops[2] # Guess index best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op) self.assertEqual(len(best_index), 1) self.assertEqual(best_index[0], 'field01') self.assertFalse(covering)
def getManMadeDesign(self, denorm=True): # create a best design mannually d = Design() d.addCollection(tpccConstants.TABLENAME_ITEM) d.addCollection(tpccConstants.TABLENAME_WAREHOUSE) d.addCollection(tpccConstants.TABLENAME_DISTRICT) d.addCollection(tpccConstants.TABLENAME_CUSTOMER) d.addCollection(tpccConstants.TABLENAME_STOCK) d.addCollection(tpccConstants.TABLENAME_ORDERS) d.addCollection(tpccConstants.TABLENAME_NEW_ORDER) d.addCollection(tpccConstants.TABLENAME_ORDER_LINE) d.addIndex(tpccConstants.TABLENAME_ITEM, ["I_ID"]) d.addIndex(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"]) d.addIndex(tpccConstants.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"]) d.addIndex(tpccConstants.TABLENAME_CUSTOMER, ["C_W_ID", "C_D_ID","C_ID"]) d.addIndex(tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_C_ID"]) d.addIndex(tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_ID"]) d.addIndex(tpccConstants.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"]) d.addIndex(tpccConstants.TABLENAME_NEW_ORDER, ["NO_W_ID", "NO_D_ID", "NO_O_ID"]) d.addIndex(tpccConstants.TABLENAME_ORDER_LINE, ["OL_W_ID", "OL_D_ID", "OL_O_ID"]) d.addShardKey(tpccConstants.TABLENAME_ITEM, ["I_ID"]) d.addShardKey(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_DISTRICT, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_CUSTOMER, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_ORDERS, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_STOCK, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_NEW_ORDER, ["W_ID"]) d.addShardKey(tpccConstants.TABLENAME_ORDER_LINE, ["W_ID"]) return d
def testNetworkCost(self): """Check network cost for equality predicate queries""" col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]] self.assertTrue(col_info['interesting']) # If we shard the collection on the interesting fields, then # each query should only need to touch one node d = Design() d.addCollection(col_info['name']) d.addShardKey(col_info['name'], col_info['interesting']) cost0 = self.cm.getCost(d) print "cost0: ", cost0 # If we now shard the collection on just '_id', then every query # should have to touch every node. The cost of this design # should be greater than the first one d = Design() d.addCollection(col_info['name']) d.addShardKey(col_info['name'], ['_id']) self.cm.invalidateCache(d, col_info['name']) self.state.reset() cost1 = self.cm.getCost(d) print "cost1: ", cost1 self.assertLess(cost0, cost1)