    def testDiskCostOnDifferentIndexes(self):
        """Check how indexes will affect the disk cost"""
        # 1. Put a separate index on each of the two fields
        d = Design()
        d.addCollection(CostModelTestCase.COLLECTION_NAME)
        d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field00"])
        d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field01"])

        self.cm.reset()
        self.cm.state.reset()
        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0

        # 2. Put a single compound index on both fields together
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(CostModelTestCase.COLLECTION_NAME)
        d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field01", "field00"])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1

        self.assertGreater(cost0, cost1)

    def testDiskCostIndexes(self):
        """Check whether disk cost calculations work correctly"""
        # First get the disk cost when there are no indexes
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])

        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0
        # The cost should be exactly equal to one, which means that every operation
        # has to perform a full sequential scan on the collection
        self.assertEqual(cost0, 1.0)

        # Now add all of the indexes. The disk cost should be lower
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], col_info['interesting'])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1
        self.assertGreater(cost0, cost1)
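
The assertion that cost0 is exactly 1.0 reads naturally if the disk cost is normalized against the worst case. A minimal sketch of that idea, assuming a hypothetical model where each operation's cost is the fraction of the collection it must read (the project's actual disk cost component may be more involved):

def normalized_disk_cost(ops):
    # ops: list of (pages_read, total_pages) pairs, one per operation.
    # With no usable index every operation reads all pages, so each ratio
    # is 1.0 and the averaged cost is exactly 1.0.
    total = 0.0
    for pages_read, total_pages in ops:
        total += float(pages_read) / float(total_pages)
    return total / len(ops)

assert normalized_disk_cost([(100, 100), (100, 100)]) == 1.0  # full scans only
assert normalized_disk_cost([(10, 100), (100, 100)]) < 1.0    # one op uses an index
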
Example No. 3
    def testNetworkCost(self):
        """Check network cost for equality predicate queries"""
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
        self.assertTrue(col_info['interesting'])

        # If we shard the collection on the interesting fields, then
        # each query should only need to touch one node
        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], col_info['interesting'])
        cost0 = self.cm.getCost(d)
        print "cost0: ", cost0

        # If we now shard the collection on just '_id', then every query
        # should have to touch every node. The cost of this design
        # should be greater than the first one
        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], ['_id'])
        self.cm.invalidateCache(d, col_info['name'])
        self.state.reset()
        cost1 = self.cm.getCost(d)
        print "cost1: ", cost1

        self.assertLess(cost0, cost1)
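
The intuition behind this test: if the network cost roughly counts how many nodes each operation must contact, normalized by the worst case of broadcasting to every node, then sharding on the queried field keeps each equality lookup on a single node, while sharding on '_id' forces broadcasts. The sketch below is illustrative only, with an assumed node count, and is not the project's network cost component:

ASSUMED_NUM_NODES = 8

def network_cost(nodes_touched_per_op, num_nodes=ASSUMED_NUM_NODES):
    # Average, over all operations, of the fraction of the cluster touched.
    total = sum(float(n) / num_nodes for n in nodes_touched_per_op)
    return total / len(nodes_touched_per_op)

sharded_on_query_field = network_cost([1, 1, 1, 1])      # each op hits one node
sharded_on_id = network_cost([ASSUMED_NUM_NODES] * 4)    # every op broadcasts
assert sharded_on_query_field < sharded_on_id
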
Example No. 4
    def testDiskCostChangesAfterQueryCombination(self):
        """
            Assume we have collections A, B, and C, and we want to embed C into A.
            If we build an index on field00 of A and on field02 of C,
            the cost after query combination should be lower
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])

        cost0 = self.cm.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])

        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d1)

        print "cost1 " + str(cost1)

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cm.reset()
        print "child collection ", self.cm.child_collections
        self.cm.state.reset()
        cost2 = self.cm.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
Example No. 5
    def testNetworkCostDenormalization(self):
        """Check network cost for queries that reference denormalized collections"""
        # Get the "base" design cost when all of the collections
        # are sharded on their "interesting" fields
        d = Design()
        i = 0
        for col_info in self.collections.itervalues():
            d.addCollection(col_info['name'])
            if i == 0:
                d.addShardKey(col_info['name'], col_info['interesting'])
            else:
                d.addShardKey(col_info['name'], ["_id"])

            self.cm.invalidateCache(d, col_info['name'])
            i += 1
        ## FOR
        self.cm.reset()
        self.state.reset()
        cost0 = self.cm.getCost(d)

        print "cost0:", cost0

        # Now get the network cost for when we denormalize the
        # second collection inside of the first one
        # We should have a lower cost because there should now be fewer queries
        d = Design()
        i = 0
        for col_info in self.collections.itervalues():
            self.assertTrue(col_info['interesting'])
            d.addCollection(col_info['name'])
            if i == 0:
                d.addShardKey(col_info['name'], col_info['interesting'])
            else:
                d.addShardKey(col_info['name'], ["_id"])
            self.cm.invalidateCache(d, col_info['name'])
            i += 1

        d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)

        combiner = WorkloadCombiner(self.collections, self.workload)
        combinedWorkload = combiner.process(d)
        self.state.updateWorkload(combinedWorkload)

        self.cm.reset()
        self.state.reset()
        cost1 = self.cm.getCost(d)
        print "cost1:", cost1

        self.assertLess(cost1, cost0)
Example No. 6
    def testGuessIndex_indexChooseWithProjectionField(self):
        """
            If a query's predicate matches one of the design's indexes and its
            projection field is covered by another, longer index, we should choose
            the index that covers both the predicate and the projection
        """
        # If we have a design with index (field00), (field00, field02)
        # 1. query uses field00 but its projection field is {field02: xx}
        # result: we should choose (field00, field02) as the best index

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field02"])
        d.addIndex("apple", ["field00"])

        op = self.ops[0]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field00")
        self.assertEqual(best_index[1], "field02")
        self.assertTrue(covering)
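
For reference, a covering index here means the index contains every field the operation touches, both the predicate fields and the projection fields, so the document itself never has to be fetched. A minimal, hypothetical version of that check (guess_op_info in the cost model is more sophisticated than this):

def is_covering(index_fields, query_fields, projection_fields):
    # An index covers an operation if every referenced field is in the index.
    needed = set(query_fields) | set(projection_fields)
    return needed.issubset(set(index_fields))

assert is_covering(["field00", "field02"], ["field00"], ["field02"])  # covered
assert not is_covering(["field00"], ["field00"], ["field02"])         # projection not covered
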
Example No. 7
    def testQueriesCombination(self):
        """Test if the total number of queries are reduced"""
        original_number_of_queries = 0
        for sess in self.workload:
            for op in sess["operations"]:
                original_number_of_queries += 1

        print "orignal number of queries: " + str(original_number_of_queries)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d.addCollection(col_info['name'])

        d.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d)

        number_of_queries_from_combined_workload = 0
        for sess in combinedWorkload:
            for op in sess["operations"]:
                number_of_queries_from_combined_workload += 1

        print "number of queries after query combination: " + str(
            number_of_queries_from_combined_workload)

        self.assertGreater(original_number_of_queries,
                           number_of_queries_from_combined_workload)
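
The nested counting loops above can be collapsed into a small helper; an equivalent formulation over the same session structure:

def count_operations(workload):
    # Same count as the explicit nested loops: one entry per operation per session.
    return sum(len(sess["operations"]) for sess in workload)
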
Example No. 8
    def testInitializePreloading(self):
        """Check whether preloading the buffer works properly"""

        num_collections = 5
        collections = dict()
        self.design = Design()
        for i in xrange(num_collections):
            col_name = "col%02d" % i
            col_info = catalog.Collection()
            col_info['name'] = col_name
            col_info['doc_count'] = NUM_DOCUMENTS
            col_info['workload_percent'] = 1 / float(num_collections)
            col_info['avg_doc_size'] = 1024
            collections[col_name] = col_info
            self.design.addCollection(col_name)
        ## FOR

        self.buffer = LRUBuffer(collections, BUFFER_SIZE, preload=True)

        try:
            self.buffer.initialize(self.design)
            self.buffer.validate()
        except:
            print self.buffer
            raise
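
A rough sketch of what "preloading" can mean for a buffer like this: fill it with each collection's documents up front, until the configured capacity is exhausted, so later accesses start from a warm state. This is only an illustration of the idea, with hypothetical names, not the actual LRUBuffer implementation:

from collections import OrderedDict

def preload_buffer(collections, buffer_size):
    buf = OrderedDict()   # (collection, doc id) -> document size, in LRU order
    remaining = buffer_size
    for col_name, col_info in collections.items():
        doc_size = col_info['avg_doc_size']
        for doc_id in range(col_info['doc_count']):
            if remaining < doc_size:
                return buf, remaining
            buf[(col_name, doc_id)] = doc_size
            remaining -= doc_size
    return buf, remaining
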
Example No. 9
    def setUp(self):
        TPCCTestCase.setUp(self)
        self.config = configutil.makeDefaultConfig()
        self.designer = InitialDesigner(self.collections, self.workload,
                                        self.config)
        self.col_keys = self.designer.generateCollectionHistograms()
        self.design = Design()
        map(self.design.addCollection, self.col_keys.iterkeys())
Example No. 10
    def testNetworkCostShouldReduceAfterQueryCombination(self):
        """
            Network cost should be reduced after embedding collections
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])
        cost0 = self.cmn.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])

        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cmn.reset()
        self.cmn.state.reset()
        cost1 = self.cmn.getCost(d1)

        print "cost1 " + str(cost1)

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cmn.reset()
        self.cmn.state.reset()
        cost2 = self.cmn.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
Example No. 11
    def testSameDesignExecutedTwice_withemptydesign(self):
        """
            If the same design is executed twice, both runs should produce the same result
        """
        d = Design()
        for col_name in CostModelTestCase.COLLECTION_NAMES:
            d.addCollection(col_name)

        ## FOR
        cost0 = self.cm.overallCost(d)
        cost1 = self.cm.overallCost(d)

        self.assertEqual(cost0, cost1)
Example No. 12
    def testNotCollectionEmbeddingProcessShouldReturnNone(self):
        """
            If the given design has no collection embedding, the combiner should return None right away
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        combinedWorkload = combiner.process(d0)
        self.assertEqual(None, combinedWorkload)
Example No. 13
    def testDenormalizer(self):
        d = Design()
        for col_name in self.col_names:
            d.addCollection(col_name)
        ## FOR
        op_list = self.printOperations()
        col_list = self.printAllCollections()
        d.setDenormalizationParent("koalas", "apples")

        dn = Denormalizer(self.metadata_db, self.dataset_db, d)
        dn.process()

        new_op_list = self.printOperations()
        new_col_list = self.printAllCollections()

        self.assertTrue("koalas" not in new_op_list)
        self.assertTrue("koalas" not in new_col_list)
    def testDiskCostCaching(self):
        """Check whether disk cost calculations work correctly with caching enabled"""
        self.cm.cache_enable = True

        # Give the cost model a full Design with indexes
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], col_info['interesting'])
        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0
        # FIXME self.assertGreater(cost0, 0.0)

        # We should get the same cost back after we execute it a second time
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1
        self.assertEqual(cost0, cost1)

    def testSkewCost(self):
        """Check whether skew cost calculations work correctly"""
        col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
        shard_key = col_info['interesting'][0]

        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], [shard_key])

        # First get the skew cost when the queries hit each node uniformly
        # This is the best-case scenario
        op_ctr = 0
        for sess in self.workload:
            for op in sess['operations']:
                query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
                           {shard_key: op_ctr % CostModelTestCase.NUM_NODES }\
                } ]
                op['collection'] = col_info['name']
                op['query_content'] = query_content
                op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
                op_ctr += 1
            ## FOR (op)
        ## FOR (session)
        cost0 = self.cm.getCost(d)
        self.assertLessEqual(cost0, 1.0)
        #        print "skewCost0:", cost0

        # Then make all of the operations go to a single node
        # This is the worst-case scenario
        query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
                                   {shard_key: 1000l }\
        } ]
        for sess in self.workload:
            for op in sess['operations']:
                op['collection'] = col_info['name']
                op['query_content'] = query_content
                op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
            ## FOR
        self.state.reset()
        self.cm.reset()
        cost1 = self.cm.getCost(d)
        self.assertLessEqual(cost1, 1.0)
        #        print "skewCost1:", cost1

        self.assertGreater(cost1, cost0)
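
A hedged sketch of what a skew metric like this can look like: measure how far the per-node share of operations deviates from a perfectly uniform split, scaled into [0, 1] so that the uniform case is the cheapest. The formula below is illustrative, not the project's skew cost component:

def skew_cost(ops_per_node):
    # 0.0 when operations are spread uniformly across nodes,
    # 1.0 when a single node receives everything.
    total = float(sum(ops_per_node))
    n = len(ops_per_node)
    uniform = total / n
    deviation = sum(abs(c - uniform) for c in ops_per_node)
    worst_case = 2.0 * total * (n - 1) / n   # all operations on one node
    return deviation / worst_case

assert skew_cost([25, 25, 25, 25]) == 0.0   # best case: uniform spread
assert skew_cost([100, 0, 0, 0]) == 1.0     # worst case: a single hot node
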
Example No. 16
    def testEstimateNodesNullValue(self):
        """Check the estimating touched nodes when the sharding key value is null"""

        d = Design()
        for i in xrange(0, len(COLLECTION_NAMES)):
            col_info = self.collections[COLLECTION_NAMES[i]]
            d.addCollection(col_info['name'])
            # This key won't be in the operation's fields, but we should still
            # be able to get back a value
            d.addShardKey(col_info['name'], ['XXXX'])
        ## FOR

        # A query that looks up on a non-sharding key should always be
        # broadcast to every node
        sess = self.metadata_db.Session.fetch_one()
        op = sess['operations'][0]
        touched0 = list(self.estimator.estimateNodes(d, op))
        #        print "touched0:", touched0
        self.assertListEqual(range(NUM_NODES), touched0)

        # But if we insert into that collection with a document that doesn't
        # have the sharding key, it should only go to one node
        op['type'] = constants.OP_TYPE_INSERT
        op['query_content'] = op['resp_content']
        op['predicates'] = []
        #        pprint(op)
        touched1 = list(self.estimator.estimateNodes(d, op))
        #        print "touched1:", touched1
        self.assertEqual(1, len(touched1))

        # And if we insert another one, then we should get the same value back
        op = Session.operationFactory()
        op['collection'] = COLLECTION_NAMES[0]
        op['type'] = constants.OP_TYPE_INSERT
        op['query_id'] = 10000
        op['query_content'] = [{"parkinglot": 1234}]
        op['resp_content'] = [{"ok": 1}]
        op['resp_id'] = 10001
        #        pprint(op)
        touched2 = list(self.estimator.estimateNodes(d, op))
        self.assertEqual(1, len(touched2))
        self.assertListEqual(touched1, touched2)
Example No. 17
    def testEstimateNodesEquality(self):
        """Check the estimating touched nodes for a equality predicate op"""

        d = Design()
        for i in xrange(0, len(COLLECTION_NAMES)):
            col_info = self.collections[COLLECTION_NAMES[i]]
            d.addCollection(col_info['name'])
            # Only put the first field in the interesting list as the sharding key
            # We'll worry about compound sharding keys later.
            d.addShardKey(col_info['name'], col_info['interesting'][:1])
        ## FOR

        sess = self.metadata_db.Session.fetch_one()
        op = sess['operations'][0]
        #        pprint(op)

        # If we execute it twice, we should get back the exact same node ids
        touched0 = list(self.estimator.estimateNodes(d, op))
        touched1 = list(self.estimator.estimateNodes(d, op))
        self.assertListEqual(touched0, touched1)
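
The determinism being tested follows directly if node selection for an equality predicate is a pure function of the shard key value, for example a hash partition. A minimal, hypothetical illustration with an assumed node count (the real estimator may partition differently):

ASSUMED_NUM_NODES = 8

def nodes_for_equality(shard_key_value, num_nodes=ASSUMED_NUM_NODES):
    # A pure function of the key value: the same lookup always lands on
    # the same single node, so repeated estimates are identical.
    return [hash(shard_key_value) % num_nodes]

assert nodes_for_equality("u123") == nodes_for_equality("u123")
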
Example No. 18
    def setUp(self):
        # Create a fake Collection catalog entry
        # WORKLOAD
        self.col_info = catalog.Collection()
        self.col_info['name'] = COLLECTION_NAME
        self.col_info['doc_count'] = NUM_DOCUMENTS
        self.col_info['workload_queries'] = 1000
        self.col_info['workload_percent'] = 1.0

        for f in xrange(NUM_FIELDS + 1):
            # We always need the _id field
            if not f:
                f_name = "_id"
                f_type = catalog.fieldTypeToString(int)
                f_size = catalog.getEstimatedSize(f_type, 10000)
            else:
                f_name = "field%02d" % f
                if f % 2 == 0:
                    f_type = catalog.fieldTypeToString(long)
                    f_size = catalog.getEstimatedSize(f_type, 10000000l)
                else:
                    f_type = catalog.fieldTypeToString(str)
                    f_size = 128

            f = catalog.Collection.fieldFactory(f_name, f_type)
            f['avg_size'] = f_size
            f['query_use_count'] = self.col_info['workload_queries']
            self.col_info['fields'][f_name] = f
            self.col_info['interesting'].append(f_name)
            self.col_info['avg_doc_size'] += f_size
        ## FOR (field)

        self.design = Design()
        self.design.addCollection(self.col_info['name'])
        self.design.addIndex(self.col_info['name'], ["_id"])
        self.design.addIndex(self.col_info['name'],
                             self.col_info['interesting'][1:3])

        self.buffer = LRUBuffer({self.col_info['name']: self.col_info},
                                BUFFER_SIZE)
        self.buffer.initialize(self.design)
Example No. 19
    def testGuessIndex_IndexSizeEstimation(self):
        """
            Check that the estimated index sizes vary with the index definition
        """
        d = Design()
        d.addCollection("apple")

        d.addIndex("apple", ["field00"])
        d.addIndex("apple", ["field01"])
        d.addIndex("apple", ["field00", "field01"])

        # op0 uses index (field00)
        op0 = self.ops[0]

        # op1 uses index (field01)
        op1 = self.ops[1]

        # op2 uses index (field01, field00)
        op2 = self.ops[2]

        # op3 uses index (field00, field01)
        op3 = self.ops[3]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op0)
        self.assertEqual(24 + 8, index_size)

        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op1)
        self.assertEqual(24 + 8, index_size)

        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op2)
        self.assertEqual(24 + 24 + 8, index_size)

        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op3)
        self.assertEqual(24 + 24 + 8, index_size)
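
One plausible reading of the expected sizes, assuming each indexed field contributes a fixed 24 bytes and every index entry carries an 8-byte record pointer: a one-field index then costs 24 + 8 bytes per entry and a two-field index 24 + 24 + 8. This is only an interpretation of the constants asserted above, not a documented formula of the cost model:

ASSUMED_FIELD_ENTRY_BYTES = 24   # assumed per-field key size
ASSUMED_POINTER_BYTES = 8        # assumed per-entry record pointer

def index_entry_size(num_fields):
    return num_fields * ASSUMED_FIELD_ENTRY_BYTES + ASSUMED_POINTER_BYTES

assert index_entry_size(1) == 24 + 8        # (field00) or (field01)
assert index_entry_size(2) == 24 + 24 + 8   # (field00, field01) or (field01, field00)
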
Example No. 20
    def testGuessIndex_IndexSizeEstimation_Denormalization(self):
        """
            If collection A is denormalized into B, then the indexes on collection B should now be larger
            (if and only if the index is built on a field that appears in both collection A and collection B)
        """
        d = Design()
        d.addCollection("apple")
        d.addCollection("microsoft")
        d.addCollection("google")

        d.addIndex("apple", ["field00"])
        d.addIndex("microsoft", ["field00"])
        d.addIndex("google", ["field00"])

        # op4 uses index (field00), but it only goes to collection microsoft
        op4 = self.ops[4]

        # Guess index

        # Without denormalization
        best_index, covering, index_size_0, slot_size = self.cm.guess_op_info(
            d, op4)

        # With one denormalization
        d.setDenormalizationParent("apple", "microsoft")
        self.cm.buildEmbeddingCostDictionary(d)
        best_index, covering, index_size_1, slot_size = self.cm.guess_op_info(
            d, op4)

        self.assertGreater(index_size_1, index_size_0)

        # With chained denormalization
        self.cm.reset()
        d.setDenormalizationParent("google", "apple")
        self.cm.buildEmbeddingCostDictionary(d)
        best_index, covering, index_size_2, slot_size = self.cm.guess_op_info(
            d, op4)

        self.assertGreater(index_size_2, index_size_1)
Example No. 21
    def testEstimateNodesRange(self):
        """Check the estimating touched nodes for a range predicate op"""

        col_info = self.collections[COLLECTION_NAMES[0]]
        shard_key = col_info['interesting'][0]
        col_info['fields'][shard_key]['selectivity'] = 0.5

        d = Design()
        d.addCollection(col_info['name'])
        d.addShardKey(col_info['name'], [shard_key])

        sess = self.metadata_db.Session.fetch_one()
        op = sess['operations'][0]
        op['query_content'] = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query": \
                {shard_key: {constants.REPLACE_KEY_DOLLAR_PREFIX+"gt": 10000l} } \
        } ]
        op['predicates'] = {shard_key: constants.PRED_TYPE_RANGE}

        # The list estimated touched nodes should contain more than one entry
        touched0 = list(self.estimator.estimateNodes(d, op))
        print "touched0:", touched0
        self.assertGreater(len(touched0), 1)

    def getManMadeDesign(self, denorm=True):
        # Manually create the best design

        d = Design()
        d.addCollection(tpccConstants.TABLENAME_ITEM)
        d.addCollection(tpccConstants.TABLENAME_WAREHOUSE)
        d.addCollection(tpccConstants.TABLENAME_DISTRICT)
        d.addCollection(tpccConstants.TABLENAME_CUSTOMER)
        d.addCollection(tpccConstants.TABLENAME_STOCK)
        d.addCollection(tpccConstants.TABLENAME_ORDERS)
        d.addCollection(tpccConstants.TABLENAME_NEW_ORDER)
        d.addCollection(tpccConstants.TABLENAME_ORDER_LINE)

        d.addIndex(tpccConstants.TABLENAME_ITEM, ["I_ID"])
        d.addIndex(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"])
        d.addIndex(tpccConstants.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"])
        d.addIndex(tpccConstants.TABLENAME_CUSTOMER,
                   ["C_W_ID", "C_D_ID", "C_ID"])
        d.addIndex(tpccConstants.TABLENAME_ORDERS,
                   ["O_W_ID", "O_D_ID", "O_C_ID"])
        d.addIndex(tpccConstants.TABLENAME_ORDERS,
                   ["O_W_ID", "O_D_ID", "O_ID"])
        d.addIndex(tpccConstants.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"])
        d.addIndex(tpccConstants.TABLENAME_NEW_ORDER,
                   ["NO_W_ID", "NO_D_ID", "NO_O_ID"])
        d.addIndex(tpccConstants.TABLENAME_ORDER_LINE,
                   ["OL_W_ID", "OL_D_ID", "OL_O_ID"])

        d.addShardKey(tpccConstants.TABLENAME_ITEM, ["I_ID"])
        d.addShardKey(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_DISTRICT, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_CUSTOMER, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_ORDERS, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_STOCK, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_NEW_ORDER, ["W_ID"])
        d.addShardKey(tpccConstants.TABLENAME_ORDER_LINE, ["W_ID"])

        return d
Example No. 23
    def testGuessIndex_consistentAnswer(self):
        """Check that guessIndex always returns the same answer for the same input"""

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])
        d.addIndex("apple", ["field01", "field00"])
        d.addIndex("apple", ["field00"])
        d.addIndex("apple", ["field01"])

        for i in xrange(len(self.ops) - 2):
            op = self.ops[i]
            last_index, last_covering = (None, None)
            for _ in xrange(100):
                best_index, covering, index_size, slot_size = self.cm.guess_op_info(
                    d, op)
                self.assertIsNotNone(best_index)
                self.assertIsNotNone(covering)
                if last_index is not None:
                    self.assertEqual(last_index, best_index)
                    self.assertEqual(last_covering, covering)
                last_index, last_covering = (best_index, covering)
Example No. 24
    def testGuessIndex_indexChooseWithoutProjectionField(self):
        """
            If a query uses all of an index's fields but has no projection field,
            we still do not treat the index as a covering index
        """
        # If we have a design with index (field00, field01)
        # 1. query uses (field00, field01) but there is no projection field
        # result: we should choose (field00, field01) but it is not a covering index

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])

        op = self.ops[3]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(best_index[0], "field00")
        self.assertEqual(best_index[1], "field01")
        self.assertFalse(covering)
Example No. 25
    def testGuessIndex_indexInIncorrectOrder(self):
        """
            Design with index (field01, field00)
            1. query on field00
            result: no index is used because field00 is not a prefix of the index
            2. query on field01
            result: index (field01, field00) is used because it is the best match
            3. query on (field01, field00)
            result: index (field01, field00) is used because it matches best

            Design with index (field00, field01)
            4. query on (field01, field00)
            result: the index is still chosen because the query fields cover its
            prefix, but it is not a covering index

            Design with index (field01, field02, field00)
            5. query on (field01, field00)
            result: index (field01, field02, field00) is used because it matches
            best, but it is not a covering index because the field order in the
            design does not match the query
        """
        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field00"])

        # query 1: get query, queries on field00
        op = self.ops[0]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(best_index, None)
        self.assertFalse(covering)

        # query 2: get query, queries on field01
        op = self.ops[1]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field00")
        self.assertFalse(covering)

        # query 3: get query, queries on field01 and field00
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field00")
        self.assertFalse(covering)

        # query 4:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 2)
        self.assertFalse(covering)

        # query 5:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field02", "field00"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 3)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field02")
        self.assertEqual(best_index[2], "field00")
        self.assertFalse(covering)
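
The behavior exercised above boils down to prefix matching: an index is only usable when the query's fields cover a leading prefix of the index, and among usable indexes a longer match is preferred. A compact, hypothetical version of that rule (the real guess_op_info also weighs index size and covering):

def usable_prefix_length(index_fields, query_fields):
    # Count how many leading index fields appear in the query.
    query = set(query_fields)
    length = 0
    for field in index_fields:
        if field not in query:
            break
        length += 1
    return length

def pick_index(indexes, query_fields):
    # Prefer the index with the longest usable prefix; None if nothing matches.
    best, best_len = None, 0
    for index in indexes:
        plen = usable_prefix_length(index, query_fields)
        if plen > best_len:
            best, best_len = index, plen
    return best

# A query on field00 alone cannot use (field01, field00): the prefix does not match.
assert pick_index([["field01", "field00"]], ["field00"]) is None
# A query on (field01, field00) matches the whole index.
assert pick_index([["field01", "field00"]], ["field01", "field00"]) == ["field01", "field00"]
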
Example No. 26
    def testNetworkCostDenormalization(self):
        """Check network cost for queries that reference denormalized collections"""
        # Get the "base" design cost when all of the collections
        # are sharded on their "interesting" fields
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            if i == 0:
                d0.addShardKey(col_info['name'], col_info['interesting'])
            else:
                d0.addShardKey(col_info['name'], ["_id"])
            self.cm.invalidateCache(d0, col_info['name'])
        ## FOR
        self.cm.reset()
        self.state.reset()
        cost0 = self.cm.getCost(d0)
        
        print "cost0:", cost0
        
        # Now get the network cost for when we denormalize the
        # second collection inside of the first one
        # We should have a lower cost because there should now be fewer queries
        d1 = Design()
        for i in xrange(0, len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            self.assertTrue(col_info['interesting'])
            d1.addCollection(col_info['name'])
            if i == 0:
                d1.addShardKey(col_info['name'], col_info['interesting'])
            else:
                parent = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
                self.assertIsNotNone(parent)
                d1.setDenormalizationParent(col_info['name'], parent['name'])
                self.assertTrue(d1.isDenormalized(col_info['name']), col_info['name'])
                self.assertIsNotNone(d1.getDenormalizationParent(col_info['name']))
            
            self.cm.invalidateCache(d1, col_info['name'])

        combiner = WorkloadCombiner(self.collections, self.workload)
        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)
        
        self.cm.reset()
        self.state.reset()
        cost1 = self.cm.getCost(d1)
        print "cost1:", cost1
       
        self.assertLess(cost1, cost0)

        # The denormalization cost should also be the same as the cost
        # when we remove all of the ops on the second collection
        backup_collections = copy.deepcopy(self.collections)

        for sess in self.state.workload:
            for op in sess["operations"]:
                if op["collection"] <> CostModelTestCase.COLLECTION_NAMES[0]:
                    sess["operations"].remove(op)
            ## FOR (op)
        ## FOR (sess)
        for i in xrange(1, len(CostModelTestCase.COLLECTION_NAMES)):
            del self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            print "deleted name: ", CostModelTestCase.COLLECTION_NAMES[i]

        self.cm.reset()
        self.state.reset()
        cost2 = self.cm.getCost(d1)
        print "cost2:", cost2

        self.assertEqual(cost1, cost2)

        # Restore the original workload and see if the cost remains the same with the original one
        self.state.restoreOriginalWorkload()
        self.state.collections = backup_collections
        
        self.cm.reset()
        self.state.reset()
        cost3 = self.cm.getCost(d0)
        print "cost3:", cost3
        
        self.assertEqual(cost3, cost0)
Example No. 27
    def testGuessIndex_indexChooseTheMostMatch(self):
        """
            Design with index (field01, field00), (field01),
            1. query uses index (field01) without projection field
            result: using index (field01) because they match the most
            2. query uses index (field01, field00) without projection field
            result: using index (field01, field00) because they match the most

            If we have a design building indexes on (field01) only
            3. query uses index (field01, field00) without projection field
            result: using index (field01) because they match the most

            If we have a design building indexes on (field01, field03, field00), (field01)
            4. query uses index (field01, field00)
            result: using index (field01) because field01 is shorter
        """
        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field00"])
        d.addIndex("apple", ["field01"])

        # query 1: get query
        op = self.ops[1]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 1)
        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

        # query 2:  get query
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], 'field01')
        self.assertEqual(best_index[1], 'field00')
        self.assertFalse(covering)

        ## query 3:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

        # query 4:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field03", "field00"])
        d.addIndex("apple", ["field01"])
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(
            d, op)

        self.assertEqual(len(best_index), 1)
        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)