def testDiskCostIndexes(self):
        """Check whether disk cost calculations work correctly"""
        # First get the disk cost when there are no indexes
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])

        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0
        # The cost should be exactly equal to one, which means that every operation
        # has to perform a full sequential scan on the collection
        self.assertEqual(cost0, 1.0)

        # Now add all of the indexes. The disk cost should be lower
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], col_info['interesting'])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1
        self.assertGreater(cost0, cost1)
    def testQueriesCombination(self):
        """Test if the total number of queries are reduced"""
        original_number_of_queries = 0
        for sess in self.workload:
            for op in sess["operations"]:
                original_number_of_queries += 1

        print "orignal number of queries: " + str(original_number_of_queries)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d.addCollection(col_info['name'])

        d.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d)

        number_of_queries_from_combined_workload = 0
        for sess in combinedWorkload:
            for op in sess["operations"]:
                number_of_queries_from_combined_workload += 1
                
        print "number of queries after query combination: " + str(number_of_queries_from_combined_workload)

        self.assertGreater(original_number_of_queries, number_of_queries_from_combined_workload)
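
# The per-session loops above for counting operations recur throughout this
# suite. A minimal helper sketch that could replace them (my own illustration;
# countOperations is hypothetical and not part of the original test code):
def countOperations(workload):
    """Return the total number of operations across all workload sessions."""
    total = 0
    for sess in workload:
        # Each session carries its operations under the "operations" key
        total += len(sess["operations"])
    return total

# Usage sketch: original_number_of_queries = countOperations(self.workload)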
    def testInitializePreloading(self):
        """Check whether preloading the buffer works properly"""

        num_collections = 5
        collections = dict()
        self.design = Design()
        for i in xrange(num_collections):
            col_name = "col%02d" % i
            col_info = catalog.Collection()
            col_info['name'] = col_name
            col_info['doc_count'] = NUM_DOCUMENTS
            col_info['workload_percent'] = 1 / float(num_collections)
            col_info['avg_doc_size'] = 1024
            collections[col_name] = col_info
            self.design.addCollection(col_name)
        ## FOR

        self.buffer = LRUBuffer(collections, BUFFER_SIZE, preload=True)

        try:
            self.buffer.initialize(self.design)
            self.buffer.validate()
        except:
            print self.buffer
            raise
    def testQueriesCombination(self):
        """Test if the total number of queries are reduced"""
        original_number_of_queries = 0
        for sess in self.workload:
            for op in sess["operations"]:
                original_number_of_queries += 1

        print "orignal number of queries: " + str(original_number_of_queries)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        for col_name in self.collections.iterkeys():
            d.addCollection(col_name)

        d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)

        combinedWorkload = combiner.process(d)

        number_of_queries_from_combined_workload = 0
        for sess in combinedWorkload:
            for op in sess["operations"]:
                number_of_queries_from_combined_workload += 1
                
        print "number of queries after query combination: " + str(number_of_queries_from_combined_workload)

        self.assertGreater(original_number_of_queries, number_of_queries_from_combined_workload)
    def testEstimateNodesRange(self):
        """Check the estimating touched nodes for a range predicate op"""

        col_info = self.collections[COLLECTION_NAMES[0]]
        shard_key = col_info["interesting"][0]
        col_info["fields"][shard_key]["selectivity"] = 0.5

        d = Design()
        d.addCollection(col_info["name"])
        d.addShardKey(col_info["name"], [shard_key])

        sess = self.metadata_db.Session.fetch_one()
        op = sess["operations"][0]
        op["query_content"] = [
            {
                constants.REPLACE_KEY_DOLLAR_PREFIX
                + "query": {shard_key: {constants.REPLACE_KEY_DOLLAR_PREFIX + "gt": 10000L}}
            }
        ]
        op["predicates"] = {shard_key: constants.PRED_TYPE_RANGE}

        # The list of estimated touched nodes should contain more than one entry
        touched0 = list(self.estimator.estimateNodes(d, op))
        print "touched0:", touched0
        self.assertGreater(len(touched0), 1)
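
# Hand-building the range-predicate payload is easy to get wrong. A sketch of a
# builder for the structure used above (makeRangeQuery is a hypothetical helper;
# the key layout mirrors the op["query_content"] literal in this test):
def makeRangeQuery(shard_key, low):
    """Construct a query_content entry for a '$gt' range predicate."""
    return [
        {
            # The dollar prefix stands in for '$', presumably because stored
            # keys cannot start with '$'
            constants.REPLACE_KEY_DOLLAR_PREFIX
            + "query": {shard_key: {constants.REPLACE_KEY_DOLLAR_PREFIX + "gt": low}}
        }
    ]

# Usage sketch: op["query_content"] = makeRangeQuery(shard_key, 10000L)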
 def setUp(self):
     TPCCTestCase.setUp(self)
     self.config = configutil.makeDefaultConfig()
     self.designer = InitialDesigner(self.collections, self.workload,
                                     self.config)
     self.col_keys = self.designer.generateCollectionHistograms()
     self.design = Design()
     map(self.design.addCollection, self.col_keys.iterkeys())
    def testSameDesignExecutedTwice_withemptydesign(self):
        """
            If the same design is executed twice, it should produce the same result
        """
        d = Design()
        for col_name in CostModelTestCase.COLLECTION_NAMES:
            d.addCollection(col_name)

        ## FOR
        cost0 = self.cm.overallCost(d)
        cost1 = self.cm.overallCost(d)

        self.assertEqual(cost0, cost1)
    def testSameDesignExecutedTwice_withemptydesign(self):
        """
            If the same design is executed twice, it should produce the same result
        """
        d = Design()
        for col_name in CostModelTestCase.COLLECTION_NAMES:
            d.addCollection(col_name)

        ## FOR
        cost0 = self.cm.overallCost(d)
        cost1 = self.cm.overallCost(d)

        self.assertEqual(cost0, cost1)
    def testGuessIndex_indexChooseWithProjectionField(self):
        """
            If a query's predicate uses one of the design's indexes and its
            projection uses another, we should choose the index that covers
            both the query fields and the projection fields
        """
        # If we have a design with index (field00), (field00, field02)
        # 1. query uses field00 but its projection field is {field02: xx}
        # result: we should choose (field00, field02) as the best index

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field02"])
        d.addIndex("apple", ["field00"])

        op = self.ops[0]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field00")
        self.assertEqual(best_index[1], "field02")
        self.assertTrue(covering)
    def testDiskCostNotChangedAfterQueryCombination(self):
        """Disk cost should not be changed after query combination"""
        d = Design()
        for col_name in self.collections.iterkeys():
            d.addCollection(col_name)
        
        cost0 = self.cm.getCost(d)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        for col_name in self.collections.iterkeys():
            d.addCollection(col_name)
            self.state.invalidateCache(col_name)
            
        d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)

        combinedWorkload = combiner.process(d)
        self.state.updateWorkload(combinedWorkload)
                
        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)

        print "cost1 " + str(cost1)
        
        self.assertEqual(cost0, cost1)
    def testNotCollectionEmbeddingProcessShouldReturnNone(self):
        """
            If the given design has no collection embedding, process() should return None right away
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        combinedWorkload = combiner.process(d0)
        self.assertEqual(None, combinedWorkload)
    def testSkewCost(self):
        """Check whether skew cost calculations work correctly"""
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
        shard_key = col_info['interesting'][0]

        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], [shard_key])

        # First get the skew cost when the queries hit each node uniformly
        # This is the best-case scenario
        op_ctr = 0
        for sess in self.workload:
            for op in sess['operations']:
                query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
                           {shard_key: op_ctr % CostModelTestCase.NUM_NODES }\
                } ]
                op['collection'] = col_info['name']
                op['query_content'] = query_content
                op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
                op_ctr += 1
            ## FOR (op)
        ## FOR (session)

        col_info["fields"][shard_key]["ranges"] = range(CostModelTestCase.NUM_NODES)

        cost0 = self.cm.getCost(d)
        self.assertLessEqual(cost0, 1.0)
        #        print "skewCost0:", cost0

        # Then make all of the operations go to a single node
        # This is the worst-case scenario
        query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
                                   {shard_key: 1000L }\
        } ]
        for sess in self.workload:
            for op in sess['operations']:
                op['collection'] = col_info['name']
                op['query_content'] = query_content
                op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
            ## FOR
        self.state.reset()
        self.cm.reset()
        cost1 = self.cm.getCost(d)
        self.assertLessEqual(cost1, 1.0)
        #        print "skewCost1:", cost1

        self.assertGreater(cost1, cost0)
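
        # Intuition: cost0 comes from spreading the queries evenly via
        # op_ctr % NUM_NODES (best case), while cost1 sends every query to the
        # single node owning shard key 1000 (worst case), so cost1 > cost0.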
    def testInitializePreloading(self):
        """Check whether preloading the buffer works properly"""

        num_collections = 5
        collections = dict()
        self.design = Design()
        for i in xrange(num_collections):
            col_name = "col%02d" % i
            col_info = catalog.Collection()
            col_info['name'] = col_name
            col_info['doc_count'] = NUM_DOCUMENTS
            col_info['workload_percent'] = 1 / float(num_collections)
            col_info['avg_doc_size'] = 1024
            collections[col_name] = col_info
            self.design.addCollection(col_name)
        ## FOR

        self.buffer = LRUBuffer(collections, BUFFER_SIZE, preload=True)

        try:
            self.buffer.initialize(self.design)
            self.buffer.validate()
        except:
            print self.buffer
            raise
 def setUp(self):
     TPCCTestCase.setUp(self)
     self.config = configutil.makeDefaultConfig()
     self.designer = InitialDesigner(self.collections, self.workload, self.config)
     self.col_keys = self.designer.generateCollectionHistograms()
     self.design = Design()
     map(self.design.addCollection, self.col_keys.iterkeys())
 def testDenormalizer(self):
     d = Design()
     for col_name in self.col_names:
         d.addCollection(col_name)
     ## FOR
     op_list = self.printOperations()
     col_list = self.printAllCollections()
     d.setDenormalizationParent("koalas", "apples")
     
     dn = Denormalizer(self.metadata_db, self.dataset_db, d)
     dn.process()
     
     new_op_list = self.printOperations()
     new_col_list = self.printAllCollections()
     
     self.assertTrue("koalas" not in new_op_list)
     self.assertTrue("koalas" not in new_col_list)
    def testGuessIndex_IndexSizeEstimation(self):
        """
            Check whether the sizes of the indexes vary
        """
        d = Design()
        d.addCollection("apple")
        
        d.addIndex("apple", ["field00"])
        d.addIndex("apple", ["field01"])
        d.addIndex("apple", ["field00", "field01"])
        
        # op0 uses index (field00)
        op0 = self.ops[0]

        # op1 uses index (field01)
        op1 = self.ops[1]

        # op2 uses index (field01, field00)
        op2 = self.ops[2]

        # op3 uses index (field00, field01)
        op3 = self.ops[3]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op0)
        self.assertEqual(24+8, index_size)
        
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op1)
        self.assertEqual(24+8, index_size)
        
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op2)
        self.assertEqual(24+24+8, index_size)
        
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op3)
        self.assertEqual(24+24+8, index_size)
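
        # Note: the expected values appear to assume roughly 24 bytes per
        # indexed key plus an 8-byte record pointer, so a one-field index is
        # 24 + 8 and a two-field index is 24 + 24 + 8. This is an inference
        # from the assertions above, not a documented constant.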
    def testGuessIndex_indexChooseWithProjectionField(self):
        """
            If a query's predicate uses one of the design's indexes and its
            projection uses another, we should choose the index that covers
            both the query fields and the projection fields
        """
        # If we have a design with index (field00), (field00, field02)
        # 1. query uses field00 but its projection field is {field02: xx}
        # result: we should choose (field00, field02) as the best index
        
        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field02"])
        d.addIndex("apple", ["field00"])

        op = self.ops[0]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field00")
        self.assertEqual(best_index[1], "field02")
        self.assertTrue(covering)
    def testEstimateNodesNullValue(self):
        """Check the estimating touched nodes when the sharding key value is null"""

        d = Design()
        for i in xrange(0, len(COLLECTION_NAMES)):
            col_info = self.collections[COLLECTION_NAMES[i]]
            d.addCollection(col_info["name"])
            # This key won't be in the operation's fields, but we should still
            # be able to get back a value
            d.addShardKey(col_info["name"], ["XXXX"])
        ## FOR

        # A query that looks up on a non-sharding key should always be
        # broadcast to every node
        sess = self.metadata_db.Session.fetch_one()
        op = sess["operations"][0]
        touched0 = list(self.estimator.estimateNodes(d, op))
        #        print "touched0:", touched0
        self.assertListEqual(range(NUM_NODES), touched0)

        # But if we insert into that collection with a document that doesn't
        # have the sharding key, it should only go to one node
        op["type"] = constants.OP_TYPE_INSERT
        op["query_content"] = op["resp_content"]
        op["predicates"] = []
        #        pprint(op)
        touched1 = list(self.estimator.estimateNodes(d, op))
        #        print "touched1:", touched1
        self.assertEqual(1, len(touched1))

        # And if we insert another one, then we should get the same value back
        op = Session.operationFactory()
        op["collection"] = COLLECTION_NAMES[0]
        op["type"] = constants.OP_TYPE_INSERT
        op["query_id"] = 10000
        op["query_content"] = [{"parkinglot": 1234}]
        op["resp_content"] = [{"ok": 1}]
        op["resp_id"] = 10001
        #        pprint(op)
        touched2 = list(self.estimator.estimateNodes(d, op))
        self.assertEqual(1, len(touched2))
        self.assertListEqual(touched1, touched2)
    def testEstimateNodesEquality(self):
        """Check the estimating touched nodes for a equality predicate op"""

        d = Design()
        for i in xrange(0, len(COLLECTION_NAMES)):
            col_info = self.collections[COLLECTION_NAMES[i]]
            d.addCollection(col_info["name"])
            # Only put the first field in the interesting list as the sharding key
            # We'll worry about compound sharding keys later.
            d.addShardKey(col_info["name"], col_info["interesting"][:1])
        ## FOR

        sess = self.metadata_db.Session.fetch_one()
        op = sess["operations"][0]
        #        pprint(op)

        # If we execute it twice, we should get back the exact same node ids
        touched0 = list(self.estimator.estimateNodes(d, op))
        touched1 = list(self.estimator.estimateNodes(d, op))
        self.assertListEqual(touched0, touched1)
    def setUp(self):
        # Create a fake Collection catalog entry
        # WORKLOAD
        self.col_info = catalog.Collection()
        self.col_info['name'] = COLLECTION_NAME
        self.col_info['doc_count'] = NUM_DOCUMENTS
        self.col_info['workload_queries'] = 1000
        self.col_info['workload_percent'] = 1.0

        for f in xrange(NUM_FIELDS + 1):
            # We always need the _id field
            if not f:
                f_name = "_id"
                f_type = catalog.fieldTypeToString(int)
                f_size = catalog.getEstimatedSize(f_type, 10000)
            else:
                f_name = "field%02d" % f
                if f % 2 == 0:
                    f_type = catalog.fieldTypeToString(long)
                    f_size = catalog.getEstimatedSize(f_type, 10000000L)
                else:
                    f_type = catalog.fieldTypeToString(str)
                    f_size = 128

            f = catalog.Collection.fieldFactory(f_name, f_type)
            f['avg_size'] = f_size
            f['query_use_count'] = self.col_info['workload_queries']
            self.col_info['fields'][f_name] = f
            self.col_info['interesting'].append(f_name)
            self.col_info['avg_doc_size'] += f_size
        ## FOR (field)

        self.design = Design()
        self.design.addCollection(self.col_info['name'])
        self.design.addIndex(self.col_info['name'], ["_id"])
        self.design.addIndex(self.col_info['name'],
                             self.col_info['interesting'][1:3])

        self.buffer = LRUBuffer({self.col_info['name']: self.col_info},
                                BUFFER_SIZE)
        self.buffer.initialize(self.design)
    def testQueriesCombination(self):
        """Test if the total number of queries are reduced"""
        original_number_of_queries = 0
        for sess in self.workload:
            for op in sess["operations"]:
                original_number_of_queries += 1

        print "orignal number of queries: " + str(original_number_of_queries)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d.addCollection(col_info['name'])

        d.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d)

        number_of_queries_from_combined_workload = 0
        for sess in combinedWorkload:
            for op in sess["operations"]:
                number_of_queries_from_combined_workload += 1

        print "number of queries after query combination: " + str(
            number_of_queries_from_combined_workload)

        self.assertGreater(original_number_of_queries,
                           number_of_queries_from_combined_workload)
    def testGuessIndex_indexChooseWithoutProjectionField(self):
        """
            If a query uses all of an index's fields but has no projection,
            the index is still not considered a covering index
        """
        # If we have a design with index (field00, field01)
        # 1. query uses (field00, field01) but there is no projection field
        # result: we should choose (field00, field01), but the index is not a covering index

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])

        op = self.ops[3]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(best_index[0], "field00")
        self.assertEqual(best_index[1], "field01")
        self.assertFalse(covering)
    def testGuessIndex_IndexSizeEstimation(self):
        """
            Check whether the sizes of the indexes vary
        """
        d = Design()
        d.addCollection("apple")

        d.addIndex("apple", ["field00"])
        d.addIndex("apple", ["field01"])
        d.addIndex("apple", ["field00", "field01"])

        # op0 uses index (field00)
        op0 = self.ops[0]

        # op1 uses index (field01)
        op1 = self.ops[1]

        # op2 uses index (field01, field00)
        op2 = self.ops[2]

        # op3 uses index (field00, field01)
        op3 = self.ops[3]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op0)
        self.assertEqual(24 + 8, index_size)

        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op1)
        self.assertEqual(24 + 8, index_size)

        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op2)
        self.assertEqual(24 + 24 + 8, index_size)

        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op3)
        self.assertEqual(24 + 24 + 8, index_size)
    def testNotCollectionEmbeddingProcessShouldReturnNone(self):
        """
            If the given design has no collection embedding, process() should return None right away
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        combinedWorkload = combiner.process(d0)
        self.assertEqual(None, combinedWorkload)
    def testGuessIndex_consistentAnswer(self):
        """Check that guessIndex always returns the same answer for the same input"""

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])
        d.addIndex("apple", ["field01", "field00"])
        d.addIndex("apple", ["field00"])
        d.addIndex("apple", ["field01"])

        for i in xrange(len(self.ops) - 2):
            op = self.ops[i]
            last_index, last_covering = (None, None)
            for i in xrange(100):
                best_index, covering, index_size, slot_size = self.cm.guess_op_info(
                    d, op)
                self.assertIsNotNone(best_index)
                self.assertIsNotNone(covering)
                if last_index is not None:
                    self.assertEqual(last_index, best_index)
                    self.assertEqual(last_covering, covering)
                last_index, last_covering = (best_index, covering)
    def setUp(self):
        # Create a fake Collection catalog entry
        # WORKLOAD
        self.col_info = catalog.Collection()
        self.col_info['name'] = COLLECTION_NAME
        self.col_info['doc_count'] = NUM_DOCUMENTS
        self.col_info['workload_queries'] = 1000
        self.col_info['workload_percent'] = 1.0

        for f in xrange(NUM_FIELDS+1):
            # We always need the _id field
            if not f:
                f_name = "_id"
                f_type = catalog.fieldTypeToString(int)
                f_size = catalog.getEstimatedSize(f_type, 10000)
            else:
                f_name = "field%02d" % f
                if f % 2 == 0:
                    f_type = catalog.fieldTypeToString(long)
                    f_size = catalog.getEstimatedSize(f_type, 10000000L)
                else:
                    f_type = catalog.fieldTypeToString(str)
                    f_size = 128

            f = catalog.Collection.fieldFactory(f_name, f_type)
            f['avg_size'] = f_size
            f['query_use_count'] = self.col_info['workload_queries']
            self.col_info['fields'][f_name] = f
            self.col_info['interesting'].append(f_name)
            self.col_info['avg_doc_size'] += f_size
        ## FOR (field)

        self.design = Design()
        self.design.addCollection(self.col_info['name'])
        self.design.addIndex(self.col_info['name'], ["_id"])
        self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3])

        self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE)
        self.buffer.initialize(self.design)
    def testDiskCostCaching(self):
        """Check whether disk cost calculations work correctly with caching enabled"""
        self.cm.cache_enable = True

        # Give the mofo a full Design with indexes
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], col_info['interesting'])
        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0
        # FIXME self.assertGreater(cost0, 0.0)

        # We should get the same cost back after we execute it a second time
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1
    def testDenormalizer(self):
        d = Design()
        for col_name in self.col_names:
            d.addCollection(col_name)
        ## FOR
        op_list = self.printOperations()
        col_list = self.printAllCollections()
        d.setDenormalizationParent("koalas", "apples")

        dn = Denormalizer(self.metadata_db, self.dataset_db, d)
        dn.process()

        new_op_list = self.printOperations()
        new_col_list = self.printAllCollections()

        self.assertTrue("koalas" not in new_op_list)
        self.assertTrue("koalas" not in new_col_list)
    def testSkewCost(self):
        """Check whether skew cost calculations work correctly"""
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
        shard_key = col_info['interesting'][0]

        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], [shard_key])

        # First get the skew cost when the queries hit each node uniformly
        # This is the best-case scenario
        op_ctr = 0
        for sess in self.workload:
            for op in sess['operations']:
                query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
                           {shard_key: op_ctr % CostModelTestCase.NUM_NODES }\
                } ]
                op['collection'] = col_info['name']
                op['query_content'] = query_content
                op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
                op_ctr += 1
            ## FOR (op)
        ## FOR (session)
        cost0 = self.cm.getCost(d)
        self.assertLessEqual(cost0, 1.0)
        #        print "skewCost0:", cost0

        # Then make all of the operations go to a single node
        # This is the worst-case scenario
        query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
                                   {shard_key: 1000L }\
        } ]
        for sess in self.workload:
            for op in sess['operations']:
                op['collection'] = col_info['name']
                op['query_content'] = query_content
                op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
            ## FOR
        self.state.reset()
        self.cm.reset()
        cost1 = self.cm.getCost(d)
        self.assertLessEqual(cost1, 1.0)
        #        print "skewCost1:", cost1

        self.assertGreater(cost1, cost0)
    def testGuessIndex_consistentAnswer(self):
        """Check that guessIndex always returns the same answer for the same input"""

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])
        d.addIndex("apple", ["field01", "field00"])
        d.addIndex("apple", ["field00"])
        d.addIndex("apple", ["field01"])

        for i in xrange(len(self.ops) - 2):
            op = self.ops[i]
            last_index, last_covering = (None, None)
            for i in xrange(100):
                best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
                self.assertIsNotNone(best_index)
                self.assertIsNotNone(covering)
                if last_index is not None:
                    self.assertEqual(last_index, best_index)
                    self.assertEqual(last_covering, covering)
                last_index, last_covering = (best_index, covering)
    def testEstimateNodesNullValue(self):
        """Check the estimating touched nodes when the sharding key value is null"""

        d = Design()
        for i in xrange(0, len(COLLECTION_NAMES)):
            col_info = self.collections[COLLECTION_NAMES[i]]
            d.addCollection(col_info['name'])
            # This key won't be in the operation's fields, but we should still
            # be able to get back a value
            d.addShardKey(col_info['name'], ['XXXX'])
        ## FOR

        # A query that looks up on a non-sharding key should always be
        # broadcast to every node
        sess = self.metadata_db.Session.fetch_one()
        op = sess['operations'][0]
        touched0 = list(self.estimator.estimateNodes(d, op))
        #        print "touched0:", touched0
        self.assertListEqual(range(NUM_NODES), touched0)

        # But if we insert into that collection with a document that doesn't
        # have the sharding key, it should only go to one node
        op['type'] = constants.OP_TYPE_INSERT
        op['query_content'] = op['resp_content']
        op['predicates'] = []
        #        pprint(op)
        touched1 = list(self.estimator.estimateNodes(d, op))
        #        print "touched1:", touched1
        self.assertEqual(1, len(touched1))

        # And if we insert another one, then we should get the same value back
        op = Session.operationFactory()
        op['collection'] = COLLECTION_NAMES[0]
        op['type'] = constants.OP_TYPE_INSERT
        op['query_id'] = 10000
        op['query_content'] = [{"parkinglot": 1234}]
        op['resp_content'] = [{"ok": 1}]
        op['resp_id'] = 10001
        #        pprint(op)
        touched2 = list(self.estimator.estimateNodes(d, op))
        self.assertEqual(1, len(touched2))
        self.assertListEqual(touched1, touched2)
    def testEstimateNodesEquality(self):
        """Check the estimating touched nodes for a equality predicate op"""

        d = Design()
        for i in xrange(0, len(COLLECTION_NAMES)):
            col_info = self.collections[COLLECTION_NAMES[i]]
            d.addCollection(col_info['name'])
            # Only put the first field in the interesting list as the sharding key
            # We'll worry about compound sharding keys later.
            d.addShardKey(col_info['name'], col_info['interesting'][:1])
        ## FOR

        sess = self.metadata_db.Session.fetch_one()
        op = sess['operations'][0]
        #        pprint(op)

        # If we execute it twice, we should get back the exact same node ids
        touched0 = list(self.estimator.estimateNodes(d, op))
        touched1 = list(self.estimator.estimateNodes(d, op))
        self.assertListEqual(touched0, touched1)
    def testEstimateNodesRange(self):
        """Check the estimating touched nodes for a range predicate op"""

        col_info = self.collections[COLLECTION_NAMES[0]]
        shard_key = col_info['interesting'][0]
        col_info['fields'][shard_key]['selectivity'] = 0.5

        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], [shard_key])

        sess = self.metadata_db.Session.fetch_one()
        op = sess['operations'][0]
        op['query_content'] = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query": \
                {shard_key: {constants.REPLACE_KEY_DOLLAR_PREFIX+"gt": 10000L} } \
        } ]
        op['predicates'] = {shard_key: constants.PRED_TYPE_RANGE}

        # The list of estimated touched nodes should contain more than one entry
        touched0 = list(self.estimator.estimateNodes(d, op))
        print "touched0:", touched0
        self.assertGreater(len(touched0), 1)
    def testGuessIndex_indexChooseWithoutProjectionField(self):
        """
            If a query uses all of an index's fields but has no projection,
            the index is still not considered a covering index
        """
        # If we have a design with index (field00, field01)
        # 1. query uses (field00, field01) but there is no projection field
        # result: we should choose (field00, field01), but the index is not a covering index

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])

        op = self.ops[3]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(best_index[0], "field00")
        self.assertEqual(best_index[1], "field01")
        self.assertFalse(covering)
    def testDiskCostChangesAfterQueryCombination(self):
        """
            Assume we have collections A, B, and C, and we want to embed C into A.
            If we build an index on field00 of A and field02 of C,
            the cost after query combination should be lower.
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])
        
        cost0 = self.cm.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])
            
        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)
                
        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d1)

        print "cost1 " + str(cost1)
        
        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cm.reset()
        print "child collection ", self.cm.child_collections
        self.cm.state.reset()
        cost2 = self.cm.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
    def testDiskCostChangesAfterQueryCombination(self):
        """
            Assume we have collection A, B, C and we want to embed C to A
            If we build index on field00 of A and field02 of C
            The cost after query combination should be lower
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])

        cost0 = self.cm.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])

        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d1)

        print "cost1 " + str(cost1)

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cm.reset()
        print "child collection ", self.cm.child_collections
        self.cm.state.reset()
        cost2 = self.cm.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
    def testGuessIndex_indexInIncorrectOrder(self):
        """
            Design with index (field01, field00)
            1. query uses index (field00)
            result: not using index because that query uses indexes in order
            2. query uses index (field01)
            result: using index (field01, field00) because this is the best match
            3. query uses index (field01, field00)
            result: using index (field01, field00) because they match the best

            Design with index (field00, field01)
            4. query uses index (field01, field00)
            result: using no index because the index order is not correct

            Design with index (field01, field02, field00)
            5. query uses index (field01, field00)
            result: using index (field01, field02, field00) because they match the best
            result: not cover index because the index order in design is not correct
        """
        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field00"])

        # query 1: get query, queries on field00
        op = self.ops[0]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(best_index, None)
        self.assertFalse(covering)

        # query 2: get query, queries on field01
        op = self.ops[1]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field00")
        self.assertFalse(covering)

        # query 3: get query, queries on field01 and field00
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field00")
        self.assertFalse(covering)

        # query 4:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 2)
        self.assertFalse(covering)

        # query 5:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field02", "field00"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 3)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field02")
        self.assertEqual(best_index[2], "field00")
        self.assertFalse(covering)
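
# A minimal sketch of the ordering rule these assertions exercise (my own
# illustration, not the cost model's actual implementation): an index is only
# usable when its leading key appears among the query's fields.
def indexUsable(index_keys, query_fields):
    """Return True if the index's leading key is referenced by the query."""
    return len(index_keys) > 0 and index_keys[0] in query_fields

# Usage sketch, matching queries 1 and 2 above:
#   indexUsable(["field01", "field00"], ["field00"])  # False: no index chosen
#   indexUsable(["field01", "field00"], ["field01"])  # True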
 def testNetworkCostDenormalization(self):
     """Check network cost for queries that reference denormalized collections"""
     # Get the "base" design cost when all of the collections
     # are sharded on their "interesting" fields
     d = Design()
     i = 0
     for col_info in self.collections.itervalues():
         d.addCollection(col_info['name'])
         if i == 0:
             d.addShardKey(col_info['name'], col_info['interesting'])
         else:
             d.addShardKey(col_info['name'], ["_id"])
         
         self.cm.invalidateCache(d, col_info['name'])
         i += 1
     ## FOR
     self.cm.reset()
     self.state.reset()
     cost0 = self.cm.getCost(d)
     
     print "cost0:", cost0
     
     # Now get the network cost for when we denormalize the
     # second collection inside of the first one
     # We should have a lower cost because there should now be fewer queries
     d = Design()
     i = 0
     for col_info in self.collections.itervalues():
         self.assertTrue(col_info['interesting'])
         d.addCollection(col_info['name'])
         if i == 0:
             d.addShardKey(col_info['name'], col_info['interesting'])
         else:
             d.addShardKey(col_info['name'], ["_id"])
         self.cm.invalidateCache(d, col_info['name'])
         i += 1
         
     d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)
        
     combiner = WorkloadCombiner(self.collections, self.workload)
     combinedWorkload = combiner.process(d)
     self.state.updateWorkload(combinedWorkload)
     
     self.cm.reset()
     self.state.reset()
     cost1 = self.cm.getCost(d)
     print "cost1:", cost1
    
     self.assertLess(cost1, cost0)
class TestInitialDesigner(TPCCTestCase):

    def setUp(self):
        TPCCTestCase.setUp(self)
        self.config = configutil.makeDefaultConfig()
        self.designer = InitialDesigner(self.collections, self.workload, self.config)
        self.col_keys = self.designer.generateCollectionHistograms()
        self.design = Design()
        map(self.design.addCollection, self.col_keys.iterkeys())
    ## DEF
    
    def testCheckForInvalidKeys(self):
        d = self.designer.generate()
        self.assertIsNotNone(d)
        
        # Make sure that we don't have any invalid keys
        for col_name in d.getCollections():
            for index_keys in d.getIndexes(col_name):
                for key in index_keys:
                    assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
                        "Invalid index key '%s.%s'" % (col_name, key)
                ## FOR
            for key in d.getShardKeys(col_name):
                assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
                    "Invalid shard key '%s.%s'" % (col_name, key)
        ## FOR
    ## DEF

    def testSelectShardingKeys(self):
        # Select one set of keys at random and increase its occurrence
        # in the histogram so that we will pick it
        expected = { }
        for col_name, h in self.col_keys.iteritems():
            keys = random.choice(h.keys())
            h.put(keys, 999999)
            expected[col_name] = keys
        
        self.designer.__selectShardingKeys__(self.design, self.col_keys)
        
        # Then check to make sure it picked what we expected it to
        for col_name in self.col_keys.iterkeys():
            shard_keys = self.design.getShardKeys(col_name)
            self.assertIsNotNone(shard_keys)
            self.assertIsInstance(shard_keys, tuple)
            self.assertEquals(expected[col_name], shard_keys)
        #print self.design
    ## DEF
    
    def testSelectIndexKeys(self):
        # Select one set of keys at random and increase its occurrence
        # in the histogram so that we will pick it
        expected = { }
        for col_name, h in self.col_keys.iteritems():
            keys = random.choice(h.keys())
            h.put(keys, 999999)
            expected[col_name] = keys
        
        node_memory = self.config.get(configutil.SECT_CLUSTER, "node_memory")
        self.designer.__selectIndexKeys__(self.design, self.col_keys, node_memory)
        #print self.design
        
        # Then check to make sure it picked what we expected it to
        for col_name in self.col_keys.iterkeys():
            index_keys = self.design.getIndexKeys(col_name)
            self.assertIsNotNone(index_keys)
            self.assertIsInstance(index_keys, list)
 def testGuessIndex_IndexSizeEstimation_Denormalization(self):
     """
          If collection A is denormalized into B, then the index for collection B should now have a larger size
         (If and only if the index is built on a field that is included by both collection A and collection B)
     """
     d = Design()
     d.addCollection("apple")
     d.addCollection("microsoft")
     d.addCollection("google")
     
     d.addIndex("apple", ["field00"])
     d.addIndex("microsoft", ["field00"])
     d.addIndex("google", ["field00"])
     
      # op4 uses index (field00) but it only goes to collection microsoft
     op4 = self.ops[4]
     
     # Guess index
     
     # Without denormalization
     best_index, covering, index_size_0, slot_size = self.cm.guess_op_info(d, op4)
     
     # With one denormalization
     d.setDenormalizationParent("apple", "microsoft")
     self.cm.buildEmbeddingCostDictionary(d)
     best_index, covering, index_size_1, slot_size = self.cm.guess_op_info(d, op4)
     
     self.assertGreater(index_size_1, index_size_0)
     
     # With chained denormalization
     self.cm.reset()
     d.setDenormalizationParent("google", "apple")
     self.cm.buildEmbeddingCostDictionary(d)
     best_index, covering, index_size_2, slot_size = self.cm.guess_op_info(d, op4)
     
     self.assertGreater(index_size_2, index_size_1)
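
      # Intuition: each embedding folds the child collection's documents into
      # the parent, so an index on a field shared by both spans more entries
      # and its estimated size grows with every link in the chain.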
    def getManMadeDesign(self, denorm=True):
        # Manually create a known-good design

        d = Design()
        d.addCollection(tpccConstants.TABLENAME_ITEM)
        d.addCollection(tpccConstants.TABLENAME_WAREHOUSE)
        d.addCollection(tpccConstants.TABLENAME_DISTRICT)
        d.addCollection(tpccConstants.TABLENAME_CUSTOMER)
        d.addCollection(tpccConstants.TABLENAME_STOCK)
        d.addCollection(tpccConstants.TABLENAME_ORDERS)
        d.addCollection(tpccConstants.TABLENAME_NEW_ORDER)
        d.addCollection(tpccConstants.TABLENAME_ORDER_LINE)

        d.addIndex(tpccConstants.TABLENAME_ITEM, ["I_ID"])
        d.addIndex(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"])
        d.addIndex(tpccConstants.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"])
        d.addIndex(tpccConstants.TABLENAME_CUSTOMER,
                   ["C_W_ID", "C_D_ID", "C_ID"])
        d.addIndex(tpccConstants.TABLENAME_ORDERS,
                   ["O_W_ID", "O_D_ID", "O_C_ID"])
        d.addIndex(tpccConstants.TABLENAME_ORDERS,
                   ["O_W_ID", "O_D_ID", "O_ID"])
        d.addIndex(tpccConstants.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"])
        d.addIndex(tpccConstants.TABLENAME_NEW_ORDER,
                   ["NO_W_ID", "NO_D_ID", "NO_O_ID"])
        d.addIndex(tpccConstants.TABLENAME_ORDER_LINE,
                   ["OL_W_ID", "OL_D_ID", "OL_O_ID"])

        d.addShardKey(tpccConstants.TABLENAME_ITEM, ["I_ID"])
        d.addShardKey(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_DISTRICT, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_CUSTOMER, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_ORDERS, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_STOCK, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_NEW_ORDER, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_ORDER_LINE, ["W_ID"])

        return d
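
    # Usage sketch (assuming a cost model self.cm as in the other tests above;
    # this call is illustrative and not part of the original snippet):
    #
    #   d = self.getManMadeDesign()
    #   cost = self.cm.getCost(d)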
class TestNodeEstimator(unittest.TestCase):
    def setUp(self):
        # Create a fake Collection catalog entry
        # WORKLOAD
        self.col_info = catalog.Collection()
        self.col_info['name'] = COLLECTION_NAME
        self.col_info['doc_count'] = NUM_DOCUMENTS
        self.col_info['workload_queries'] = 1000
        self.col_info['workload_percent'] = 1.0

        for f in xrange(NUM_FIELDS + 1):
            # We always need the _id field
            if not f:
                f_name = "_id"
                f_type = catalog.fieldTypeToString(int)
                f_size = catalog.getEstimatedSize(f_type, 10000)
            else:
                f_name = "field%02d" % f
                if f % 2 == 0:
                    f_type = catalog.fieldTypeToString(long)
                    f_size = catalog.getEstimatedSize(f_type, 10000000L)
                else:
                    f_type = catalog.fieldTypeToString(str)
                    f_size = 128

            f = catalog.Collection.fieldFactory(f_name, f_type)
            f['avg_size'] = f_size
            f['query_use_count'] = self.col_info['workload_queries']
            self.col_info['fields'][f_name] = f
            self.col_info['interesting'].append(f_name)
            self.col_info['avg_doc_size'] += f_size
        ## FOR (field)

        self.design = Design()
        self.design.addCollection(self.col_info['name'])
        self.design.addIndex(self.col_info['name'], ["_id"])
        self.design.addIndex(self.col_info['name'],
                             self.col_info['interesting'][1:3])

        self.buffer = LRUBuffer({self.col_info['name']: self.col_info},
                                BUFFER_SIZE)
        self.buffer.initialize(self.design)

    ## DEF

    def testInitialize(self):
        """Check whether we can initialize the buffer properly for a design"""
        col_name = self.col_info['name']
        self.assertIsNotNone(self.buffer.collection_sizes[col_name])
        self.assertEqual(len(self.design.getIndexes(col_name)),
                         len(self.buffer.index_sizes[col_name]))
        for indexKeys in self.design.getIndexes(col_name):
            self.assertIsNotNone(self.buffer.index_sizes[col_name][indexKeys])
        self.buffer.validate()

    ## DEF

    def testInitializePreloading(self):
        """Check whether preloading the buffer works properly"""

        num_collections = 5
        collections = dict()
        self.design = Design()
        for i in xrange(num_collections):
            col_name = "col%02d" % i
            col_info = catalog.Collection()
            col_info['name'] = col_name
            col_info['doc_count'] = NUM_DOCUMENTS
            col_info['workload_percent'] = 1 / float(num_collections)
            col_info['avg_doc_size'] = 1024
            collections[col_name] = col_info
            self.design.addCollection(col_name)
        ## FOR

        self.buffer = LRUBuffer(collections, BUFFER_SIZE, preload=True)

        try:
            self.buffer.initialize(self.design)
            self.buffer.validate()
        except:
            print self.buffer
            raise

    ## DEF

    def testReset(self):
        """Check whether the LRUBuffer will reset its internal state properly"""
        self.buffer.reset()
        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)

    ## DEF

    def testComputeTupleHash(self):
        num_entries = 10000
        rng = random.Random()
        rng.seed(self.__init__.im_class)
        for i in xrange(num_entries):
            # Construct a tuple and make sure that the size that we get out
            # of it is the size that we put into it
            typeId = rng.choice(
                [LRUBuffer.DOC_TYPE_COLLECTION, LRUBuffer.DOC_TYPE_INDEX])
            key = rng.random()
            size = rng.randint(1, 8) * 1024
            documentId = rng.random()
            buffer_tuple = self.buffer.__computeTupleHash__(
                typeId, key, size, documentId)
            self.assertIsNotNone(buffer_tuple)

            extracted = self.buffer.__getTupleSize__(buffer_tuple)
            self.assertEqual(size, extracted, pformat(locals()))
            # "BufferTuple: %d / ExpectedSize: %d" % (buffer_tuple, size)
        ## FOR

    ## DEF

    def testGetDocumentFromCollection(self):
        """Check whether the LRUBuffer updates internal buffer for new collection documents"""

        documentId = 0
        pageHits = 0
        while self.buffer.remaining > self.col_info['avg_doc_size']:
            pageHits += self.buffer.getDocumentFromCollection(
                self.col_info['name'], documentId)
            before = self.buffer.remaining

            # If we insert the same document, we should not get any pageHits and our
            # remaining memory should be the same
            _pageHits = self.buffer.getDocumentFromCollection(
                self.col_info['name'], documentId)
            self.assertEqual(0, _pageHits)
            self.assertEqual(before, self.buffer.remaining)

            documentId += 1
            self.buffer.validate()
        ## WHILE

        # We should only have one pageHit per document
        self.assertEqual(documentId, pageHits)

        # Make sure that the buffer is in the right order as we evict records
        lastDocId = None
        while len(self.buffer.buffer) > 0:
            evicted = self.buffer.evictNext(self.col_info['name'])
            self.assertIsNotNone(evicted)
            self.buffer.validate()

            # We can't check this anymore because it's faster for us
            # if we just store the hash of the tuple instead of the
            # actual tuple values
            # if lastDocId: self.assertLess(lastDocId, docId)
            # lastDocId = docId
        ## WHILE
        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)

    ## DEF

    def testGetDocumentFromIndex(self):
        """Check whether the LRUBuffer updates internal buffer for new index documents"""

        # Roll through each index and add a bunch of documents. Note that the documents
        # will have the same documentId, but they should be represented as separate objects
        # in the internal buffer (because they are for different indexes)
        documentId = 0
        pageHits = 0
        while not self.buffer.evicted:
            for indexKeys in self.design.getIndexes(COLLECTION_NAME):
                pageHits += self.buffer.getDocumentFromIndex(
                    self.col_info['name'], indexKeys, documentId)
                before = self.buffer.remaining

                # If we insert the same document, we should not get any pageHits
                _pageHits = self.buffer.getDocumentFromIndex(
                    self.col_info['name'], indexKeys, documentId)
                self.assertEqual(0, _pageHits)
                self.assertEqual(before, self.buffer.remaining)

                if self.buffer.evicted: break
            documentId += 1
            self.buffer.validate()
        ## WHILE

        # Make sure that we get back two entries for each documentId (except for one)
        lastDocId = None
        #        docIds_h = Histogram()
        while len(self.buffer.buffer) > 0:
            #            typeId, key, docId = self.buffer.evictNext(COLLECTION_NAME)
            evicted = self.buffer.evictNext(COLLECTION_NAME)
            self.assertIsNotNone(evicted)
            self.buffer.validate()
#            self.assertIsNotNone(typeId)
#            self.assertIsNotNone(key)
#            self.assertIsNotNone(docId)
#            docIds_h.put(docId)
        ## WHILE

#        foundSingleDocId = False
#        for documentId,cnt in docIds_h.iteritems():
#            if cnt == 1:
#                self.assertFalse(foundSingleDocId)
#                foundSingleDocId = True
#            else:
#                self.assertEqual(2, cnt)
#        ## FOR

        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)
    ## DEF

    def testDiskCostOnDifferentIndexes(self):
        """Check how indexes will affect the disk cost"""
        # 1. Put an index on each of the fields separately
        d = Design()
        d.addCollection(CostModelTestCase.COLLECTION_NAME)
        d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field00"])
        d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field01"])

        self.cm.reset()
        self.cm.state.reset()
        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0

        # 2. Put a compound index on both fields together
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(CostModelTestCase.COLLECTION_NAME)
        d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field01", "field00"])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1

        self.assertGreater(cost0, cost1)
Example #44
    def testGuessIndex_IndexSizeEstimation_Denormalization(self):
        """
            If collection A is denormalized into B, then the index for collection B should have larger size now
            (If and only if the index is built on a field that is included by both collection A and collection B)
        """
        d = Design()
        d.addCollection("apple")
        d.addCollection("microsoft")
        d.addCollection("google")

        d.addIndex("apple", ["field00"])
        d.addIndex("microsoft", ["field00"])
        d.addIndex("google", ["field00"])

        # op4 uses index (field00), but it only touches collection microsoft
        op4 = self.ops[4]

        # Guess index

        # Without denormalization
        best_index, covering, index_size_0, slot_size = self.cm.guess_op_info(
            d, op4)

        # With one denormalization
        d.setDenormalizationParent("apple", "microsoft")
        self.cm.buildEmbeddingCostDictionary(d)
        best_index, covering, index_size_1, slot_size = self.cm.guess_op_info(
            d, op4)

        self.assertGreater(index_size_1, index_size_0)

        # With chained denormalization
        self.cm.reset()
        d.setDenormalizationParent("google", "apple")
        self.cm.buildEmbeddingCostDictionary(d)
        best_index, covering, index_size_2, slot_size = self.cm.guess_op_info(
            d, op4)

        self.assertGreater(index_size_2, index_size_1)
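
# Back-of-the-envelope version of what the assertions above check,
# assuming the index on field00 gains one entry per embedded child
# document. The numbers and the 16-byte entry size are made up for
# illustration; the cost model's actual estimation is more involved.
def estimatedIndexSize(parent_docs, embedded_docs, entry_size=16):
    # After denormalization, embedded documents that carry the indexed
    # field contribute entries to the parent's index as well
    return (parent_docs + embedded_docs) * entry_size

size0 = estimatedIndexSize(1000, 0)      # no denormalization
size1 = estimatedIndexSize(1000, 5000)   # apple embedded into microsoft
size2 = estimatedIndexSize(1000, 15000)  # google -> apple -> microsoft chain
print size0, "<", size1, "<", size2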
Example #45
class TestNodeEstimator(unittest.TestCase):

    def setUp(self):
        # Create a fake Collection catalog entry
        # WORKLOAD
        self.col_info = catalog.Collection()
        self.col_info['name'] = COLLECTION_NAME
        self.col_info['doc_count'] = NUM_DOCUMENTS
        self.col_info['workload_queries'] = 1000
        self.col_info['workload_percent'] = 1.0

        for f in xrange(NUM_FIELDS+1):
            # We always need the _id field
            if not f:
                f_name = "_id"
                f_type = catalog.fieldTypeToString(int)
                f_size = catalog.getEstimatedSize(f_type, 10000)
            else:
                f_name = "field%02d" % f
                if f % 2 == 0:
                    f_type = catalog.fieldTypeToString(long)
                    f_size = catalog.getEstimatedSize(f_type, 10000000L)
                else:
                    f_type = catalog.fieldTypeToString(str)
                    f_size = 128

            f = catalog.Collection.fieldFactory(f_name, f_type)
            f['avg_size'] = f_size
            f['query_use_count'] = self.col_info['workload_queries']
            self.col_info['fields'][f_name] = f
            self.col_info['interesting'].append(f_name)
            self.col_info['avg_doc_size'] += f_size
        ## FOR (field)

        self.design = Design()
        self.design.addCollection(self.col_info['name'])
        self.design.addIndex(self.col_info['name'], ["_id"])
        self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3])

        self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE)
        self.buffer.initialize(self.design)
    ## DEF

    def testInitialize(self):
        """Check whether we can initialize the buffer properly for a design"""
        col_name = self.col_info['name']
        self.assertIsNotNone(self.buffer.collection_sizes[col_name])
        self.assertEqual(len(self.design.getIndexes(col_name)), len(self.buffer.index_sizes[col_name]))
        for indexKeys in self.design.getIndexes(col_name):
            self.assertIsNotNone(self.buffer.index_sizes[col_name][indexKeys])
        self.buffer.validate()
    ## DEF

    def testInitializePreloading(self):
        """Check whether preloading the buffer works properly"""

        num_collections = 5
        collections = dict()
        self.design = Design()
        for i in xrange(num_collections):
            col_name = "col%02d" % i
            col_info = catalog.Collection()
            col_info['name'] = col_name
            col_info['doc_count'] = NUM_DOCUMENTS
            col_info['workload_percent'] = 1 / float(num_collections)
            col_info['avg_doc_size'] = 1024
            collections[col_name] = col_info
            self.design.addCollection(col_name)
        ## FOR

        self.buffer = LRUBuffer(collections, BUFFER_SIZE, preload=True)

        try:
            self.buffer.initialize(self.design)
            self.buffer.validate()
        except:
            print self.buffer
            raise
    ## DEF

    def testReset(self):
        """Check whether the LRUBuffer will reset its internal state properly"""
        self.buffer.reset()
        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)
    ## DEF

    def testComputeTupleHash(self):
        num_entries = 10000
        rng = random.Random()
        rng.seed(self.__init__.im_class)
        for i in xrange(num_entries):
            # Construct a tuple and make sure that the size that we get out
            # of it is the size that we put into it
            typeId = rng.choice([LRUBuffer.DOC_TYPE_COLLECTION, LRUBuffer.DOC_TYPE_INDEX])
            key = rng.random()
            size = rng.randint(1, 8) * 1024
            documentId = rng.random()
            buffer_tuple = self.buffer.__computeTupleHash__(typeId, key, size, documentId)
            self.assertIsNotNone(buffer_tuple)

            extracted = self.buffer.__getTupleSize__(buffer_tuple)
            self.assertEqual(size, extracted, pformat(locals()))  # "BufferTuple: %d / ExpectedSize: %d" % (buffer_tuple, size)
        ## FOR

    ## DEF
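
# The assertions above only require that the size folded into a buffer
# tuple can be recovered from it. One plausible packing scheme, shown
# purely as an illustration (the real __computeTupleHash__ may differ):
# keep the size in the low bits and a hash of the identity in the rest.
SIZE_BITS = 24  # assumption: document sizes stay under 16MB

def packTuple(typeId, key, documentId, size):
    identity = hash((typeId, key, documentId)) & 0xFFFFFFFF
    return (identity << SIZE_BITS) | size

def unpackSize(packed):
    return packed & ((1 << SIZE_BITS) - 1)

packed = packTuple(0, 0.123, 0.456, 8 * 1024)
assert unpackSize(packed) == 8 * 1024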

    def testGetDocumentFromCollection(self):
        """Check whether the LRUBuffer updates internal buffer for new collection documents"""

        documentId = 0
        pageHits = 0
        while self.buffer.remaining > self.col_info['avg_doc_size']:
            pageHits += self.buffer.getDocumentFromCollection(self.col_info['name'], documentId)
            before = self.buffer.remaining

            # If we insert the same document, we should not get any pageHits and our
            # remaining memory should be the same
            _pageHits = self.buffer.getDocumentFromCollection(self.col_info['name'], documentId)
            self.assertEqual(0, _pageHits)
            self.assertEqual(before, self.buffer.remaining)

            documentId += 1
            self.buffer.validate()
        ## WHILE

        # We should only have one pageHit per document
        self.assertEqual(documentId, pageHits)

        # Make sure that the buffer is in the right order as we evict records
        lastDocId = None
        while len(self.buffer.buffer) > 0:
            evicted = self.buffer.evictNext(self.col_info['name'])
            self.assertIsNotNone(evicted)
            self.buffer.validate()

            # We can't check this anymore because it's faster for us
            # if we just store the hash of the tuple instead of the
            # actual tuple values
            # if lastDocId: self.assertLess(lastDocId, docId)
            # lastDocId = docId
        ## WHILE
        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)
    ## DEF

    def testGetDocumentFromIndex(self):
        """Check whether the LRUBuffer updates internal buffer for new index documents"""

        # Roll through each index and add a bunch of documents. Note that the documents
        # will have the same documentId, but they should be represented as separate objects
        # in the internal buffer (because they belong to different indexes)
        documentId = 0
        pageHits = 0
        while not self.buffer.evicted:
            for indexKeys in self.design.getIndexes(COLLECTION_NAME):
                pageHits += self.buffer.getDocumentFromIndex(self.col_info['name'], indexKeys, documentId)
                before = self.buffer.remaining

                # If we insert the same document, we should not get any pageHits
                _pageHits = self.buffer.getDocumentFromIndex(self.col_info['name'], indexKeys, documentId)
                self.assertEqual(0, _pageHits)
                self.assertEqual(before, self.buffer.remaining)

                if self.buffer.evicted: break
            documentId += 1
            self.buffer.validate()
        ## WHILE

        # Make sure that we get back two entries for each documentId (except for one)
        lastDocId = None
#        docIds_h = Histogram()
        while len(self.buffer.buffer) > 0:
#            typeId, key, docId = self.buffer.evictNext(COLLECTION_NAME)
            evicted = self.buffer.evictNext(COLLECTION_NAME)
            self.assertIsNotNone(evicted)
            self.buffer.validate()
#            self.assertIsNotNone(typeId)
#            self.assertIsNotNone(key)
#            self.assertIsNotNone(docId)
#            docIds_h.put(docId)
        ## WHILE

#        foundSingleDocId = False
#        for documentId,cnt in docIds_h.iteritems():
#            if cnt == 1:
#                self.assertFalse(foundSingleDocId)
#                foundSingleDocId = True
#            else:
#                self.assertEqual(2, cnt)
#        ## FOR

        self.assertEqual(BUFFER_SIZE, self.buffer.remaining)
    ## DEF
Example #46
    def testGuessIndex_indexChooseTheMostMatch(self):
        """
            Design with index (field01, field00), (field01),
            1. query uses index (field01) without projection field
            result: using index (field01) because they match the most
            2. query used index (field01, field00) without projection field
            result: using index (field01, field00) because they match the most

            If we have a design building indexes on (field01) only
            3. query uses index (field01, field00) without projection field
            result: using index (field01) because they match the most

            If we have a design building indexes on (field01, field03, field00), (field01)
            4. query uses index (field01, field00)
            result: using index (field01) because field01 is shorter
        """
        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field00"])
        d.addIndex("apple", ["field01"])

        # query 1: get query
        op = self.ops[1]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 1)
        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

        # query 2:  get query
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], 'field01')
        self.assertEqual(best_index[1], 'field00')
        self.assertFalse(covering)

        ## query 3:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

        # query 4:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field03", "field00"])
        d.addIndex("apple", ["field01"])
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 1)
        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)
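
# A sketch of the selection rule the docstring walks through, assuming
# an index only matches on a leading prefix of its fields: prefer the
# longest matched prefix and break ties with the shorter index. The
# helper below is hypothetical, not the cost model's guess_op_info.
def chooseBestIndex(indexes, query_fields):
    best, best_score = None, (0, 0)
    for index in indexes:
        matched = 0
        for field in index:
            if field not in query_fields: break
            matched += 1
        if matched == 0: continue
        score = (matched, -len(index))  # longest prefix, then shortest index
        if score > best_score:
            best, best_score = index, score
    return best

# Ties on the matched prefix fall to the shorter index (case 4 above)
print chooseBestIndex([["field01", "field03", "field00"], ["field01"]],
                      set(["field01", "field00"]))  # ['field01']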
Example #47
    def testNetworkCostDenormalization(self):
        """Check network cost for queries that reference denormalized collections"""
        # Get the "base" design cost when all of the collections
        # are sharded on their "interesting" fields
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            if i == 0:
                d0.addShardKey(col_info['name'], col_info['interesting'])
            else:
                d0.addShardKey(col_info['name'], ["_id"])
            self.cm.invalidateCache(d0, col_info['name'])
        ## FOR
        self.cm.reset()
        self.state.reset()
        cost0 = self.cm.getCost(d0)
        
        print "cost0:", cost0
        
        # Now get the network cost for when we denormalize the
        # second collection inside of the first one
        # We should have a lower cost because there should now be fewer queries
        d1 = Design()
        for i in xrange(0, len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            self.assertTrue(col_info['interesting'])
            d1.addCollection(col_info['name'])
            if i == 0:
                d1.addShardKey(col_info['name'], col_info['interesting'])
            else:
                parent = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
                self.assertIsNotNone(parent)
                d1.setDenormalizationParent(col_info['name'], parent['name'])
                self.assertTrue(d1.isDenormalized(col_info['name']), col_info['name'])
                self.assertIsNotNone(d1.getDenormalizationParent(col_info['name']))
            
            self.cm.invalidateCache(d1, col_info['name'])

        combiner = WorkloadCombiner(self.collections, self.workload)
        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)
        
        self.cm.reset()
        self.state.reset()
        cost1 = self.cm.getCost(d1)
        print "cost1:", cost1
       
        self.assertLess(cost1, cost0)

        # The denormalization cost should also be the same as the cost
        # when we remove all of the ops on the second collection
        backup_collections = copy.deepcopy(self.collections)

        for sess in self.state.workload:
            # Rebuild the list instead of removing ops while iterating,
            # which would skip elements
            sess["operations"] = [op for op in sess["operations"]
                                  if op["collection"] == CostModelTestCase.COLLECTION_NAMES[0]]
        ## FOR (sess)
        for i in xrange(1, len(CostModelTestCase.COLLECTION_NAMES)):
            del self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            print "deleted name: ", CostModelTestCase.COLLECTION_NAMES[i]

        self.cm.reset()
        self.state.reset()
        cost2 = self.cm.getCost(d1)
        print "cost2:", cost2

        self.assertEqual(cost1, cost2)

        # Restore the original workload and make sure the cost matches the original one
        self.state.restoreOriginalWorkload()
        self.state.collections = backup_collections
        
        self.cm.reset()
        self.state.reset()
        cost3 = self.cm.getCost(d0)
        print "cost3:", cost3
        
        self.assertEqual(cost3, cost0)
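
# A rough sketch of why combining queries lowers the network cost,
# assuming cost is proportional to node round trips per operation
# (illustrative numbers only; the real model weighs far more factors).
def networkCost(num_ops, nodes_per_op):
    return num_ops * nodes_per_op

cost_separate = networkCost(2000, 1)  # parent and child queried separately
cost_combined = networkCost(1000, 1)  # child rides along with the parent query
print cost_separate, ">", cost_combined
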
    def testNetworkCostShouldReduceAfterQueryCombination(self):
        """
            Network cost should be reduced after embedding collections
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])
        cost0 = self.cmn.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])

        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cmn.reset()
        self.cmn.state.reset()
        cost1 = self.cmn.getCost(d1)

        print "cost1 " + str(cost1)

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cmn.reset()
        self.cmn.state.reset()
        cost2 = self.cmn.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
Example #49
    def testNetworkCostShouldReduceAfterQueryCombination(self):
        """
            Network cost should be reduced after embedding collections
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])
        cost0 = self.cmn.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])

        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cmn.reset()
        self.cmn.state.reset()
        cost1 = self.cmn.getCost(d1)

        print "cost1 " + str(cost1)

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cmn.reset()
        self.cmn.state.reset()
        cost2 = self.cmn.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)

    def testGuessIndex_indexInIncorrectOrder(self):
        """
            Design with index (field01, field00)
            1. query uses index (field00)
            result: not using index because that query uses indexes in order
            2. query uses index (field01)
            result: using index (field01, field00) because this is the best match
            3. query uses index (field01, field00)
            result: using index (field01, field00) because they match the best

            Design with index (field00, field01)
            4. query uses index (field01, field00)
            result: using no index because the index order is not correct

            Design with index (field01, field02, field00)
            5. query uses index (field01, field00)
            result: using index (field01, field02, field00) because they match the best
            result: not cover index because the index order in design is not correct
        """
        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field00"])

        # query 1: get query, queries on field00
        op = self.ops[0]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(best_index, None)
        self.assertFalse(covering)

        # query 2: get query, queries on field01
        op = self.ops[1]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field00")
        self.assertFalse(covering)

        # query 3: get query, queries on field01 and field00
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field00")
        self.assertFalse(covering)

        # query 4:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 2)
        self.assertFalse(covering)

        # query 5:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field02", "field00"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 3)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field02")
        self.assertEqual(best_index[2], "field00")
        self.assertFalse(covering)
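
# A sketch of the covering-index behavior these assertions exercise,
# assuming an operation can be covered only when it has a projection
# and every referenced field lives inside the index. This rule is
# distilled from the tests above (which never project, so covering is
# always False) and is an assumption, not the model's exact check.
def isCovering(index, query_fields, projection_fields):
    if not projection_fields:
        return False  # full documents must be fetched anyway
    needed = set(query_fields) | set(projection_fields)
    return needed.issubset(set(index))

print isCovering(["field01", "field00"], ["field01"], [])           # False
print isCovering(["field01", "field00"], ["field01"], ["field00"])  # True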
Example #51
class TestInitialDesigner(TPCCTestCase):
    def setUp(self):
        TPCCTestCase.setUp(self)
        self.config = configutil.makeDefaultConfig()
        self.designer = InitialDesigner(self.collections, self.workload,
                                        self.config)
        self.col_keys = self.designer.generateCollectionHistograms()
        self.design = Design()
        map(self.design.addCollection, self.col_keys.iterkeys())

    ## DEF

    def testCheckForInvalidKeys(self):
        d = self.designer.generate()
        self.assertIsNotNone(d)

        # Make sure that we don't have any invalid keys
        for col_name in d.getCollections():
            for index_keys in d.getIndexes(col_name):
                for key in index_keys:
                    assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
                        "Invalid index key '%s.%s'" % (col_name, key)
                ## FOR
            for key in d.getShardKeys(col_name):
                assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
                    "Invalid shard key '%s.%s'" % (col_name, key)
        ## FOR

    ## DEF

    def testSelectShardingKeys(self):
        # Select one set of keys at random and increase its occurrence
        # in the histogram so that we will pick it
        expected = {}
        for col_name, h in self.col_keys.iteritems():
            keys = random.choice(h.keys())
            h.put(keys, 999999)
            expected[col_name] = keys

        self.designer.__selectShardingKeys__(self.design, self.col_keys)

        # Then check to make sure it picked what we expected it to
        for col_name in self.col_keys.iterkeys():
            shard_keys = self.design.getShardKeys(col_name)
            self.assertIsNotNone(shard_keys)
            self.assertIsInstance(shard_keys, tuple)
            self.assertEqual(expected[col_name], shard_keys)
        #print self.design

    ## DEF
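
# The test above bumps one candidate's count so that it must win. A
# minimal sketch of that selection step, assuming the chooser is just
# an argmax over the per-collection histogram (a simplification of
# __selectShardingKeys__, shown for illustration).
def selectShardKey(histogram):
    best_keys, best_count = None, -1
    for keys, count in histogram.iteritems():
        if count > best_count:
            best_keys, best_count = keys, count
    return best_keys

h = {("field00",): 10, ("field01",): 25}
h[("field01",)] += 999999
print selectShardKey(h)  # ('field01',)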

    def testSelectIndexKeys(self):
        # Select one set of keys at random and increase its occurrence
        # in the histogram so that we will pick it
        expected = {}
        for col_name, h in self.col_keys.iteritems():
            keys = random.choice(h.keys())
            h.put(keys, 999999)
            expected[col_name] = keys

        node_memory = self.config.get(configutil.SECT_CLUSTER, "node_memory")
        self.designer.__selectIndexKeys__(self.design, self.col_keys,
                                          node_memory)
        #print self.design

        # Then check to make sure it picked what we expected it to
        for col_name in self.col_keys.iterkeys():
            index_keys = self.design.getIndexKeys(col_name)
            self.assertIsNotNone(index_keys)
            self.assertIsInstance(index_keys, list)
    ## DEF

    def testGuessIndex_indexChooseTheMostMatch(self):
        """
            Design with index (field01, field00), (field01),
            1. query uses index (field01) without projection field
            result: using index (field01) because they match the most
            2. query used index (field01, field00) without projection field
            result: using index (field01, field00) because they match the most

            If we have a design building indexes on (field01) only
            3. query uses index (field01, field00) without projection field
            result: using index (field01) because they match the most

            If we have a design building indexes on (field01, field03, field00), (field01)
            4. query uses index (field01, field00)
            result: using index (field01) because field01 is shorter
        """
        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field00"])
        d.addIndex("apple", ["field01"])

        # query 1: get query
        op = self.ops[1]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 1)
        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

        # query 2:  get query
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], 'field01')
        self.assertEqual(best_index[1], 'field00')
        self.assertFalse(covering)

        ## query 3:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

        # query 4:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field03", "field00"])
        d.addIndex("apple", ["field01"])
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 1)
        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

    def getManMadeDesign(self, denorm=True):
        # Create the best design manually

        d = Design()
        d.addCollection(tpccConstants.TABLENAME_ITEM)
        d.addCollection(tpccConstants.TABLENAME_WAREHOUSE)
        d.addCollection(tpccConstants.TABLENAME_DISTRICT)
        d.addCollection(tpccConstants.TABLENAME_CUSTOMER)
        d.addCollection(tpccConstants.TABLENAME_STOCK)
        d.addCollection(tpccConstants.TABLENAME_ORDERS)
        d.addCollection(tpccConstants.TABLENAME_NEW_ORDER)
        d.addCollection(tpccConstants.TABLENAME_ORDER_LINE)

        d.addIndex(tpccConstants.TABLENAME_ITEM, ["I_ID"])
        d.addIndex(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"])
        d.addIndex(tpccConstants.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"])
        d.addIndex(tpccConstants.TABLENAME_CUSTOMER, ["C_W_ID", "C_D_ID","C_ID"])
        d.addIndex(tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_C_ID"])
        d.addIndex(tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_ID"])
        d.addIndex(tpccConstants.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"])
        d.addIndex(tpccConstants.TABLENAME_NEW_ORDER, ["NO_W_ID", "NO_D_ID", "NO_O_ID"])
        d.addIndex(tpccConstants.TABLENAME_ORDER_LINE, ["OL_W_ID", "OL_D_ID", "OL_O_ID"])

        d.addShardKey(tpccConstants.TABLENAME_ITEM, ["I_ID"])
        d.addShardKey(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_DISTRICT, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_CUSTOMER, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_ORDERS, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_STOCK, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_NEW_ORDER, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_ORDER_LINE, ["W_ID"])

        return d
Example #54
    def testNetworkCost(self):
        """Check network cost for equality predicate queries"""
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
        self.assertTrue(col_info['interesting'])

        # If we shard the collection on the interesting fields, then
        # each query should only need to touch one node
        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], col_info['interesting'])
        cost0 = self.cm.getCost(d)
        print "cost0: ", cost0

        # If we now shard the collection on just '_id', then every query
        # should have to touch every node. The cost of this design
        # should be greater than the first one
        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], ['_id'])
        self.cm.invalidateCache(d, col_info['name'])
        self.state.reset()
        cost1 = self.cm.getCost(d)
        print "cost1: ", cost1

        self.assertLess(cost0, cost1)
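
# The intuition behind these assertions, as a sketch: the router can
# target a single node only when the query binds every shard-key field,
# otherwise it must broadcast to all nodes. This simplified rule is an
# assumption drawn from the test, not the model's implementation.
def nodesTouched(shard_key, query_fields, num_nodes):
    if set(shard_key).issubset(set(query_fields)):
        return 1          # targeted query
    return num_nodes      # scatter-gather broadcast

print nodesTouched(["field00"], ["field00"], 10)  # 1
print nodesTouched(["_id"], ["field00"], 10)      # 10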