コード例 #1
    def testfindExpectedDesign(self):
        """Check that a design keeps its cost after a serialization round trip.

        The hand-made design is costed, serialized with ``toJSON()``, rebuilt
        through ``Deserializer`` and costed again; both costs must be equal.
        """
        # Instantiate cost model
        cmConfig = {
            'weight_network': 4,
            'weight_disk':    1,
            'weight_skew':    1,
            'nodes':          10,
            'max_memory':     1024,
            'skew_intervals': 10,
            'address_size':   64,
            'window_size':    500,
        }
        cm = CostModel(self.collections, self.workload, cmConfig)

        # Reference design and its cost
        d0 = self.getManMadeDesign()
        print(d0)
        output_design = d0.toJSON()
        cost0 = cm.overallCost(d0)

        # Rebuild the design from its serialized form and cost it again
        ds = Deserializer(output_design)
        d1 = ds.Deserialize()
        print(d1)
        cost1 = cm.overallCost(d1)

        self.assertEqual(cost1, cost0)
コード例 #2
    def testfindExpectedDesign(self):
        """Verify that serializing and deserializing a design preserves its cost.

        Costs the hand-made design, round-trips it through ``toJSON()`` and
        ``Deserializer``, and asserts the rebuilt design has the same cost.
        """
        # Instantiate cost model
        cmConfig = {
            'weight_network': 4,
            'weight_disk': 1,
            'weight_skew': 1,
            'nodes': 10,
            'max_memory': 1024,
            'skew_intervals': 10,
            'address_size': 64,
            'window_size': 500,
        }
        cm = CostModel(self.collections, self.workload, cmConfig)

        # Original design: print it, serialize it, cost it
        d0 = self.getManMadeDesign()
        print(d0)
        output_design = d0.toJSON()
        cost0 = cm.overallCost(d0)

        # Deserialized design: print it and cost it
        ds = Deserializer(output_design)
        d1 = ds.Deserialize()
        print(d1)
        cost1 = cm.overallCost(d1)

        self.assertEqual(cost1, cost0)
コード例 #3
    def outtestfindExpectedDesign(self):
        """Run the LNS designer from the initial design and print the outcome.

        Nothing is asserted; the initial design, its cost, the candidates,
        the best cost and the solution are only printed.

        NOTE(review): the ``outtest`` prefix presumably keeps the test runner
        from collecting this method -- confirm that this is intentional.
        """
        # Instantiate cost model
        cmConfig = {
            'weight_network': 4,
            'weight_disk':    1,
            'weight_skew':    1,
            'nodes':          10,
            'max_memory':     1024,
            'skew_intervals': 10,
            'address_size':   64,
            'window_size':    500,
        }
        cm = CostModel(self.collections, self.workload, cmConfig)

        # The cost of the initial design is handed to the search as its upper bound
        initialDesign = InitialDesigner(self.collections, self.workload, None).generate()
        upper_bound = cm.overallCost(initialDesign)
        # The doubled space reproduces what the former print statements emitted
        print("init solution:  %s" % (initialDesign,))
        print("init solution cost:  %s" % (upper_bound,))

        collectionNames = list(self.collections)
        dc = self.dc.getCandidates(collectionNames)
        print("candidates:  %s" % (dc,))

        # NOTE(review): the call was left unclosed in the code under review and
        # trailing arguments may have been lost -- check the LNSDesigner
        # constructor signature before enabling this test.
        ln = LNSDesigner(
            self.collections,
            self.dc,
            self.workload,
            None,
            cm,
            initialDesign,
            upper_bound,
        )
        solution = ln.solve()
        print("Best cost:  %s" % (ln.bestCost,))
        print("solution:  %s" % (solution,))
コード例 #4
    def testfindExpectedDesign(self):
        """Check that denormalizing ORDER_LINE into ORDERS lowers the cost.

        Costs the hand-made design, then a copy of it whose ORDER_LINE
        collection has ORDERS set as denormalization parent, and asserts the
        copy is strictly cheaper.
        """
        # Instantiate cost model
        cmConfig = {
            'weight_network': 4,
            'weight_disk':    1,
            'weight_skew':    1,
            'nodes':          10,
            'max_memory':     1024,
            'skew_intervals': 10,
            'address_size':   64,
            'window_size':    500,
        }
        cm = CostModel(self.collections, self.workload, cmConfig)

        # Baseline design
        d0 = self.getManMadeDesign()
        cost0 = cm.overallCost(d0)

        # Same design with one extra denormalization
        d1 = d0.copy()
        d1.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)
        cost1 = cm.overallCost(d1)

        self.assertLess(cost1, cost0)
コード例 #5
    def testfindExpectedDesign(self):
        """Compare the cost of the hand-made design with the cost of its copy.

        NOTE(review): as written, ``d1`` is an unmodified copy of ``d0``, so a
        strictly lower cost is not expected unless ``copy()`` itself changes
        the design. The step that alters ``d1`` may be missing -- confirm
        before trusting this assertion.
        """
        # Instantiate cost model
        cmConfig = {
            'weight_network': 4,
            'weight_disk': 1,
            'weight_skew': 1,
            'nodes': 10,
            'max_memory': 1024,
            'skew_intervals': 10,
            'address_size': 64,
            'window_size': 500,
        }
        cm = CostModel(self.collections, self.workload, cmConfig)

        d0 = self.getManMadeDesign()
        cost0 = cm.overallCost(d0)

        d1 = d0.copy()
        cost1 = cm.overallCost(d1)

        self.assertLess(cost1, cost0)
コード例 #6
ファイル: designer.py プロジェクト: greinerb/mongodb-d4
class Designer():

    def __init__(self, config, metadata_db, dataset_db, channel=None):
        """Bind the designer to its configuration and its two databases.

        config      -- parsed configuration (a SafeConfigParser)
        metadata_db -- database holding the collection catalog, the workload
                       sessions and the workload stats
        dataset_db  -- database holding a reconstructed invocation of the
                       data set, kept so the main script can compute what it
                       needs from it
        channel     -- optional channel used for multithreaded runs
        """
        self.config = config
        self.metadata_db = metadata_db
        self.dataset_db = dataset_db

        # No solution is known until a search has been run
        self.initialSolution = self.finalSolution = None

        # The page size is the built-in default; it is not read from the
        # cluster section of the configuration
        self.page_size = constants.DEFAULT_PAGE_SIZE
        self.sample_rate = self.config.getint(configutil.SECT_DESIGNER, 'sample_rate')

        # Optional caps on the number of sessions / operations
        self.sess_limit = self.op_limit = None

        # Used for multithread
        self.channel = channel

        # Filled in later, once the inputs are loaded and the search is set up
        self.search_method = self.designCandidates = None
        self.collections = self.cm = self.workload = None

        self.debug = LOG.isEnabledFor(logging.DEBUG)
    ## DEF

    def setOptionsFromArguments(self, args):
        """Copy command-line arguments onto this Designer as attributes.

        Every key of `args` becomes an instance attribute holding the
        matching value, except "config", "metadata_db" and "dataset_db",
        which are ignored. When debugging is on, each copied pair is logged
        and the logger is switched to DEBUG level afterwards.
        """
        reserved = set(["config", "metadata_db", "dataset_db"])
        attributes = vars(self)
        for name in args:
            if name not in reserved:
                # self.debug is re-read on every pass because an argument
                # named "debug" overwrites it
                if self.debug:
                    LOG.debug("%s => %s" % (name, args[name]))
                attributes[name] = args[name]
        ## FOR
        if self.debug:
            LOG.setLevel(logging.DEBUG)
    ## DEF

    def getCollectionCatalog(self):
        """Return a dict mapping each catalog entry's name to the entry itself.

        The entries are everything `find()` yields from the metadata
        database's collection-schema collection; when two entries share a
        name, the later one wins.
        """
        catalog_entries = self.metadata_db[constants.COLLECTION_SCHEMA].find()
        return {entry.name: entry for entry in catalog_entries}
    ## DEF

    ## -------------------------------------------------------------------------
    ## -------------------------------------------------------------------------

    def processMongoInput(self, fd, no_load=False, no_post_process=False):
        """Configure a MongoSniffConverter from this designer's options.

        NOTE(review): this method looks truncated. The constructor call below
        is never closed, so its arguments are unknown, and `fd`, `no_load`
        and `no_post_process` are not used anywhere in the visible body.
        Restore the missing lines before relying on it.
        """
        import inputs.mongodb

        # MongoDB Trace
        converter = inputs.mongodb.MongoSniffConverter(
        # Copy the processing switches and the session/operation limits from
        # this designer onto the converter
        converter.stop_on_error = self.stop_on_error
        converter.no_mongo_parse = self.no_mongo_parse
        converter.no_mongo_reconstruct = self.no_mongo_reconstruct
        converter.no_mongo_sessionizer = self.no_mongo_sessionizer
        converter.no_mongo_aggregate_fix = self.no_mongo_aggregate_fix
        converter.no_mongo_normalize = self.no_mongo_normalize
        converter.no_mongo_dependencies = self.no_mongo_dependencies
        converter.random_sessionizer = self.random_sessionizer
        converter.mongo_skip = self.mongo_skip
        converter.sess_limit = self.sess_limit
        converter.op_limit = self.op_limit

    ## DEF

    def processMySQLInput(self, no_load=False, no_post_process=False):
        """Build a MySQLConverter from the MySQL config section and configure it.

        The connection settings come from the configuration; the processing
        switches and the session/operation limits are copied from this
        designer onto the converter.

        NOTE(review): `no_load` and `no_post_process` are not used and no
        processing call is visible after the converter is configured -- the
        tail of this method may be missing.
        """
        from inputs.mysql import MySQLConverter

        # MySQL Trace
        section = configutil.SECT_MYSQL
        settings = self.config
        converter = MySQLConverter(
            dbHost=settings.get(section, 'host'),
            dbPort=settings.getint(section, 'port'),
            dbName=settings.get(section, 'name'),
            dbUser=settings.get(section, 'user'),
            dbPass=settings.get(section, 'pass'),
        )

        # Forward the designer's options to the converter under the same names
        forwarded = (
            'no_mysql_schema',
            'no_mysql_workload',
            'no_mysql_dataset',
            'sess_limit',
            'op_limit',
        )
        for option in forwarded:
            setattr(converter, option, getattr(self, option))
        # Process the inputs and then save the results in mongodb
    ## DEF

    def generateDesignCandidates(self, collections, isShardingEnabled=True, isIndexesEnabled=True, isDenormalizationEnabled=True):
        """Build the DesignCandidates object for the given collections.

        For every collection this gathers candidate shard keys, candidate
        index keys and candidate denormalization parents, each kind only when
        the matching flag is enabled, and registers them with
        `addCollection`.

        collections -- dict-like whose values are collection catalog entries
                       with 'name', 'interesting' and 'fields' items

        Returns the populated DesignCandidates.
        """
        dc = DesignCandidates()
        # NOTE(review): the code under review created this set but never
        # filled it, so no denormalization parent could ever qualify. It is
        # assumed here to hold the name of every collection being designed --
        # confirm.
        valid_collection = set(col_info['name'] for col_info in collections.values())

        for col_info in collections.values():

            shardKeys = []
            indexKeys = []
            denorm = []

            interesting = col_info['interesting']
            interesting = self.__remove_heuristicaly_bad_key__(col_info, interesting)
            # Make sure that none of our interesting fields start with
            # the character that we used to convert $ commands
            for key in interesting:
                assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
                    "Unexpected candidate key '%s.%s'" % (col_info["name"], key)

            if constants.SKIP_MONGODB_ID_FIELD and "_id" in interesting:
                # NOTE(review): the code under review only copied the list
                # here; dropping "_id" from the copy is the presumed intent.
                interesting = [key for key in interesting if key != "_id"]

            # deal with shards
            if isShardingEnabled:
                LOG.debug("Sharding is enabled")
                shardKeys = interesting

            # deal with indexes
            if isIndexesEnabled:
                LOG.debug("Indexes is enabled")
                # Index candidates are the ordered field combinations of up
                # to MAX_INDEX_SIZE fields.
                # NOTE(review): the loop body was missing; storing each
                # permutation tuple in indexKeys is the presumed intent.
                largest = min(len(interesting), constants.MAX_INDEX_SIZE)
                for size in range(1, largest + 1):
                    indexKeys.extend(itertools.permutations(interesting, size))
                ## FOR
            if len(indexKeys) > 10:
                LOG.warning("Too many index keys: %s", len(indexKeys))

            # deal with de-normalization
            if isDenormalizationEnabled:
                LOG.debug("Denormalization is enabled")
                # NOTE(review): the branch body was missing; recording each
                # distinct, valid parent collection is the presumed intent.
                for field in col_info['fields'].values():
                    parent = field['parent_col']
                    if parent is not None and parent not in denorm and parent in valid_collection:
                        denorm.append(parent)
                ## FOR
            dc.addCollection(col_info['name'], indexKeys, shardKeys, denorm)
            ## FOR

        return dc

    def __remove_heuristicaly_bad_key__(self, col_info, keys):
        """Filter out candidate keys that look like poor choices.

        A key is dropped when its selectivity is below
        constants.MIN_SELECTIVITY or its cardinality is below 3. When every
        key is dropped, the constants.NUMBER_OF_BACKUP_KEYS keys with the
        highest selectivity are returned instead.

        col_info -- catalog entry whose 'fields' item maps each key to a dict
                    with 'selectivity' and 'cardinality'
        keys     -- candidate key names; the list is not modified

        Returns a new list of the retained keys, in their original order.

        NOTE(review): the rejecting branch had no body in the code under
        review; removing the key from the result is the presumed intent.
        """
        fields = col_info['fields']
        res = []
        key_selectivity = []
        for key in keys:
            selectivity = fields[key]['selectivity']
            key_selectivity.append((selectivity, key))
            # Cardinality is only consulted for keys that pass the selectivity test
            if selectivity < constants.MIN_SELECTIVITY or fields[key]['cardinality'] < 3:
                continue
            res.append(key)
        ## FOR
        if not res:
            # Nothing survived: fall back to the most selective keys
            ranked = sorted(key_selectivity, reverse=True)
            res = [key for _, key in ranked[:constants.NUMBER_OF_BACKUP_KEYS]]

        return res
    ## DEF

    def loadCollections(self):
        """Load the usable collections from the metadata catalog.

        Returns a dict mapping each collection name to its catalog entry.

        Raises Exception when no usable collection is found.

        NOTE(review): the skip condition had no body in the code under
        review; `continue` is restored to match the comment describing it.
        """
        collections = dict()
        for col_info in self.metadata_db.Collection.fetch():
            # Skip any collection that doesn't have any documents in it
            # This is because we won't be able to make any estimates about how
            # big the collection actually is. Collections without interesting
            # fields or without workload queries are skipped as well.
            unusable = (
                not col_info['doc_count']
                or not col_info['avg_doc_size']
                or len(col_info['interesting']) == 0
                or col_info['workload_queries'] == 0
            )
            if unusable:
                continue
            collections[col_info['name']] = col_info
        ## FOR
        if not collections:
            raise Exception("No collections were found in metadata catalog")
        LOG.info("Loaded %d collections from metadata catalog", len(collections))

        return collections
    ## DEF

    def loadWorkload(self, collections):
        """Load the workload sessions that touch the given collections.

        We want to bring down all of the sessions that we are going to use to
        compute the cost of each design. Loading stops once the number of
        operations gathered so far has reached `self.op_limit` (when set).

        collections -- dict-like keyed by collection name

        Returns the list of loaded sessions.

        Raises Exception when no session is loaded.

        NOTE(review): the loop body was incomplete in the code under review;
        stopping at the operation limit and appending each session are the
        presumed intent.
        """
        workload = [ ]
        workloadQuery = {"operations.collection": {"$in": list(collections.keys())}}
        op_ctr = 0
        cursor = self.metadata_db.Session.fetch(workloadQuery)
        if self.sess_limit is not None:
            assert self.sess_limit >= 0
            # NOTE(review): sess_limit is validated but never applied to the
            # cursor in the visible code -- confirm how the limit should be
            # enforced.
        for sess in cursor:
            if self.op_limit is not None and op_ctr >= self.op_limit:
                break
            workload.append(sess)
            op_ctr += len(sess['operations'])
        ## FOR
        if not workload:
            raise Exception("No workload sessions were found in database\n%s" % pformat(workloadQuery))
        LOG.info("Loaded %d sessions with %d operations from workload database", len(workload), op_ctr)
        return workload
    ## DEF

    ## -------------------------------------------------------------------------
    ## -------------------------------------------------------------------------

    # The replay flag and replay_design are used to re-evaluate a design read from a design file.
    # This is very ugly...but we don't have time now...
    def load(self, replay=False, replay_design=None, init=False):
        """Load the search inputs and cost the initial design.

        Loads the collections and the workload, generates the design
        candidates and instantiates the cost model, storing all four on the
        instance. Unless `replay` is set, it then generates the initial
        design and computes its cost, which is the upper bound for the search.

        replay        -- when true, skip the initial design
        replay_design -- not used by the visible code
                         (NOTE(review): the replay branch may be missing)
        init          -- when true, print the initial design as JSON

        Returns (initialCost, initialDesign), or None when `replay` is true.
        """
        isShardingEnabled = self.config.getboolean(configutil.SECT_DESIGNER, 'enable_sharding')
        isIndexesEnabled = self.config.getboolean(configutil.SECT_DESIGNER, 'enable_indexes')
        isDenormalizationEnabled = self.config.getboolean(configutil.SECT_DESIGNER, 'enable_denormalization')

        self.collections = self.loadCollections()
        self.workload = self.loadWorkload(self.collections)
        # Generate all the design candidates
        self.designCandidates = self.generateDesignCandidates(self.collections, isShardingEnabled, isIndexesEnabled, isDenormalizationEnabled)
        # Instantiate cost model
        cmConfig = {
            'weight_network': self.config.getfloat(configutil.SECT_COSTMODEL, 'weight_network'),
            'weight_disk':    self.config.getfloat(configutil.SECT_COSTMODEL, 'weight_disk'),
            'weight_skew':    self.config.getfloat(configutil.SECT_COSTMODEL, 'weight_skew'),
            'nodes':          self.config.getint(configutil.SECT_CLUSTER, 'nodes'),
            'max_memory':     self.config.getint(configutil.SECT_CLUSTER, 'node_memory'),
            'skew_intervals': self.config.getint(configutil.SECT_COSTMODEL, 'time_intervals'),
            'address_size':   self.config.getint(configutil.SECT_COSTMODEL, 'address_size'),
            'window_size':    self.config.getint(configutil.SECT_COSTMODEL, 'window_size'),
        }
        self.cm = CostModel(self.collections, self.workload, cmConfig)

        # Nothing more to compute when replaying a design
        if replay:
            return None

        # Compute initial solution and calculate its cost
        # This will be the upper bound from starting design
        initialDesign = InitialDesigner(self.collections, self.workload, self.config).generate()
        if init:
            print(initialDesign.toJSON())
        initialCost = self.cm.overallCost(initialDesign)
        return initialCost, initialDesign
    ## DEF
    def search(self, initialCost, initialDesign, worker_id):
        """Main search process starts here

        Allocates a lock and builds the LNSDesigner for this worker from the
        loaded collections, design candidates, workload, configuration and
        cost model, storing it in `self.search_method`.

        NOTE(review): the visible body never starts the search nor returns a
        result -- confirm whether the tail of this method is missing.
        """
        lock = thread.allocate_lock()
        self.search_method = LNSDesigner(
            self.collections,
            self.designCandidates,
            self.workload,
            self.config,
            self.cm,
            initialDesign,
            initialCost,
            self.channel,
            lock,
            worker_id,
        )
    ## DEF