Example #1
    def outtestfindExpectedDesign(self):
        """Perform the actual search for a design"""
        # Generate all the design candidates
        # Instantiate cost model
        cmConfig = {
            'weight_network': 4,
            'weight_disk':    1,
            'weight_skew':    1,
            'nodes':          10,
            'max_memory':     1024,
            'skew_intervals': 10,
            'address_size':   64,
            'window_size':    500
        }
        cm = CostModel(self.collections, self.workload, cmConfig)

        initialDesign = InitialDesigner(self.collections, self.workload, None).generate()
        upper_bound = cm.overallCost(initialDesign)
        print "init solution: ", initialDesign
        print "init solution cost: ", upper_bound
        collectionNames = [c for c in self.collections]
        
        dc = self.dc.getCandidates(collectionNames)
        print "candidates: ", dc
        ln = LNSDesigner(self.collections,
                         self.dc,
                         self.workload,
                         None,
                         cm,
                         initialDesign,
                         upper_bound,
                         LNS_RUN_TIME)
        solution = ln.solve()
        print "Best cost: ", ln.bestCost
        print "solution: ", solution
Example #2
    def search(self, initialCost, initialDesign, worker_id):
        """
            Main search process starts here
        """
        lock = thread.allocate_lock()
        self.search_method = LNSDesigner(self.collections, self.designCandidates, self.workload, self.config, self.cm, initialDesign, initialCost, self.channel, lock, worker_id)
        self.search_method.start()
    ## DEF

## CLASS
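
Because search() calls start() here (where Example #1 called solve() directly), LNSDesigner in this variant is presumably a threading.Thread subclass that reports back through the channel. A hedged driver sketch, assuming a Designer instance named designer (see Example #3); join() and the bestCost attribute (seen on the non-threaded instance in Example #1) are assumptions:

# Hedged driver sketch; names and Thread semantics are assumptions.
initialCost, initialDesign = designer.load()
designer.search(initialCost, initialDesign, worker_id=0)
designer.search_method.join()                         # assumes Thread.join()
print "best cost: ", designer.search_method.bestCost  # attribute from Example #1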
Example #3
class Designer():

    def __init__(self, config, metadata_db, dataset_db, channel=None):
        # SafeConfigParser
        self.config = config

        # The metadata database will contain:
        #   (1) Collection catalog
        #   (2) Workload sessions
        #   (3) Workload stats
        self.metadata_db = metadata_db

        # The dataset database will contain a reconstructed
        # copy of the database. We need it so that the main
        # script can compute whatever it needs from the
        # actual data.
        self.dataset_db = dataset_db

        self.initialSolution = None
        self.finalSolution = None

        # self.page_size = self.config.getint(configutil.SECT_CLUSTER, 'page_size')
        self.page_size = constants.DEFAULT_PAGE_SIZE
        self.sample_rate = self.config.getint(configutil.SECT_DESIGNER, 'sample_rate')

        self.sess_limit = None
        self.op_limit = None

        # Used for multithread
        self.channel = channel
        self.search_method = None
        self.designCandidates = None
        self.collections = None
        self.cm = None
        self.workload = None
        
        self.debug = LOG.isEnabledFor(logging.DEBUG)
    ## DEF

    def setOptionsFromArguments(self, args):
        """Set the internal parameters of the Designer based on command-line arguments"""

        # HACK HACK HACK HACK
        skip = set(["config", "metadata_db", "dataset_db"])
        for key in args:
            if key in skip: continue
            if self.debug: LOG.debug("%s => %s" % (key, args[key]))
            self.__dict__[key] = args[key]
        ## FOR
        if self.debug: LOG.setLevel(logging.DEBUG)
    ## DEF

    def getCollectionCatalog(self):
        """Return a dict of collection catalog objects"""
        collectionStats = { }
        for stats in self.metadata_db[constants.COLLECTION_SCHEMA].find():
            collectionStats[stats.name] = stats
        return collectionStats
    ## DEF

    ## -------------------------------------------------------------------------
    ## INPUT PROCESSING
    ## -------------------------------------------------------------------------

    def processMongoInput(self, fd, no_load=False, no_post_process=False):
        import inputs.mongodb

        # MongoDB Trace
        converter = inputs.mongodb.MongoSniffConverter(
            self.metadata_db,
            self.dataset_db,
            fd
        )
        converter.stop_on_error = self.stop_on_error
        converter.no_mongo_parse = self.no_mongo_parse
        converter.no_mongo_reconstruct = self.no_mongo_reconstruct
        converter.no_mongo_sessionizer = self.no_mongo_sessionizer
        converter.no_mongo_aggregate_fix = self.no_mongo_aggregate_fix
        converter.no_mongo_normalize = self.no_mongo_normalize
        converter.no_mongo_dependencies = self.no_mongo_dependencies
        converter.random_sessionizer = self.random_sessionizer
        converter.mongo_skip = self.mongo_skip
        converter.sess_limit = self.sess_limit
        converter.op_limit = self.op_limit

        converter.process(
            no_load=no_load,
            no_post_process=no_post_process,
            page_size=self.page_size,
        )
    ## DEF

    def processMySQLInput(self, no_load=False, no_post_process=False):
        from inputs.mysql import MySQLConverter

        # MySQL Trace
        converter = MySQLConverter(
            self.metadata_db,
            self.dataset_db,
            dbHost=self.config.get(configutil.SECT_MYSQL, 'host'),
            dbPort=self.config.getint(configutil.SECT_MYSQL, 'port'),
            dbName=self.config.get(configutil.SECT_MYSQL, 'name'),
            dbUser=self.config.get(configutil.SECT_MYSQL, 'user'),
            dbPass=self.config.get(configutil.SECT_MYSQL, 'pass'))

        converter.no_mysql_schema = self.no_mysql_schema
        converter.no_mysql_workload = self.no_mysql_workload
        converter.no_mysql_dataset = self.no_mysql_dataset
        converter.sess_limit = self.sess_limit
        converter.op_limit = self.op_limit
        
        # Process the inputs and then save the results in mongodb
        converter.process(
            no_load=no_load,
            no_post_process=no_post_process,
            page_size=self.page_size,
        )
    ## DEF

    def generateDesignCandidates(self, collections, isShardingEnabled=True, isIndexesEnabled=True, isDenormalizationEnabled=True):

        dc = DesignCandidates()
        valid_collection = set()
        for col_info in collections.itervalues():

            shardKeys = []
            indexKeys = []
            denorm = []

            interesting = col_info['interesting']
            valid_collection.add(col_info['name'])
            
            interesting = self.__remove_heuristically_bad_key__(col_info, interesting)
            # Make sure that none of our interesting fields start with
            # the character that we used to convert $ commands
            for key in interesting:
                assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
                    "Unexpected candidate key '%s.%s'" % (col_info["name"], key)

            if constants.SKIP_MONGODB_ID_FIELD and "_id" in interesting:
                interesting = interesting[:]
                interesting.remove("_id")

            # deal with shards
            if isShardingEnabled:
                LOG.debug("Sharding is enabled")
                shardKeys = interesting

            # deal with indexes
            if isIndexesEnabled:
                LOG.debug("Indexes is enabled")
                for o in xrange(1, len(interesting) + 1) :
                    if o > constants.MAX_INDEX_SIZE: break
                    for i in itertools.permutations(interesting, o):
                        indexKeys.append(i)
                    ## FOR
                ## FOR
            if len(indexKeys) > 10:
                LOG.warn("Too many index keys: %s", len(indexKeys))

            # deal with de-normalization
            if isDenormalizationEnabled:
                LOG.debug("Denormalization is enabled")
                for k, v in col_info['fields'].iteritems():
                    if v['parent_col'] is not None and v['parent_col'] not in denorm and v['parent_col'] in valid_collection:
                        denorm.append(v['parent_col'])
            
            dc.addCollection(col_info['name'], indexKeys, shardKeys, denorm)
            ## FOR

        return dc
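
    # NOTE (added comment): with n interesting keys and MAX_INDEX_SIZE = m,
    # the permutation loop in generateDesignCandidates() enumerates
    #     sum over o = 1..min(n, m) of n! / (n - o)!
    # candidate index keys per collection. For example, n = 3 gives
    # 3 + 6 + 6 = 15 candidates, which is why the "Too many index keys"
    # warning above trips so easily.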

    def __remove_heuristically_bad_key__(self, col_info, keys):
        """Filter out keys whose selectivity is below MIN_SELECTIVITY or whose
           cardinality is too small to be useful; if that removes every key,
           fall back to the NUMBER_OF_BACKUP_KEYS most selective ones."""
        res = keys[:]
        key_selectivity = []
        for key in keys:
            key_selectivity.append((col_info['fields'][key]['selectivity'], key))
            # equivalent to the original check: low selectivity, or selectivity
            # above the floor but with fewer than 3 distinct values
            if col_info['fields'][key]['selectivity'] < constants.MIN_SELECTIVITY or \
               col_info['fields'][key]['cardinality'] < 3:
                res.remove(key)
            ## IF
        ## FOR
        if len(res) == 0:
            sorted_res = sorted(key_selectivity, reverse=True)
            sorted_key = [x[1] for x in sorted_res]
            res = sorted_key[:constants.NUMBER_OF_BACKUP_KEYS]

        return res
    ## DEF

    def loadCollections(self):
        collections = dict()
        for col_info in self.metadata_db.Collection.fetch():
            # Skip any collection that has no documents, no interesting
            # fields, or no workload queries, since we wouldn't be able
            # to make any estimates about how big it is or how it is used
            if not col_info['doc_count'] or not col_info['avg_doc_size'] or len(col_info['interesting']) == 0 or col_info['workload_queries'] == 0:
                continue
            collections[col_info['name']] = col_info
        ## FOR
        if not collections:
            raise Exception("No collections were found in metadata catalog")
        LOG.info("Loaded %d collections from metadata catalog" % len(collections))

        return collections
    ## DEF

    def loadWorkload(self, collections):
        # We want to bring down all of the sessions that we are going to use to compute the
        # cost of each design
        workload = [ ]
        workloadQuery = {"operations.collection": {"$in": collections.keys()}}
        op_ctr = 0
        cursor = self.metadata_db.Session.fetch(workloadQuery)
        if self.sess_limit is not None:
            assert self.sess_limit >= 0
            cursor.limit(self.sess_limit)
        for sess in cursor:
            if self.op_limit is not None and op_ctr >= self.op_limit:
                break
            workload.append(sess)
            op_ctr += len(sess['operations'])
        ## FOR
        if not len(workload):
            raise Exception("No workload sessions were found in database\n%s" % pformat(workloadQuery))
        LOG.info("Loaded %d sessions with %d operations from workload database", len(workload), op_ctr)
        return workload
    ## DEF

    ## -------------------------------------------------------------------------
    ## DESIGNER EXECUTION
    ## -------------------------------------------------------------------------

    ## HACK HACK HACK
    # The replay flag and replay_design are used to re-evaluate a design read from a design file.
    # This is very ugly... but we don't have time now...
    def load(self, replay=False, replay_design=None, init=False):
        """Perform the actual search for a design"""
        isShardingEnabled = self.config.getboolean(configutil.SECT_DESIGNER, 'enable_sharding')
        isIndexesEnabled = self.config.getboolean(configutil.SECT_DESIGNER, 'enable_indexes')
        isDenormalizationEnabled = self.config.getboolean(configutil.SECT_DESIGNER, 'enable_denormalization')

        self.collections = self.loadCollections()
        self.workload = self.loadWorkload(self.collections)
        # Generate all the design candidates
        self.designCandidates = self.generateDesignCandidates(self.collections, isShardingEnabled, isIndexesEnabled, isDenormalizationEnabled)
        #LOG.info("candidates: %s\n", self.designCandidates)
        # Instantiate cost model
        cmConfig = {
            'weight_network': self.config.getfloat(configutil.SECT_COSTMODEL, 'weight_network'),
            'weight_disk':    self.config.getfloat(configutil.SECT_COSTMODEL, 'weight_disk'),
            'weight_skew':    self.config.getfloat(configutil.SECT_COSTMODEL, 'weight_skew'),
            'nodes':          self.config.getint(configutil.SECT_CLUSTER, 'nodes'),
            'max_memory':     self.config.getint(configutil.SECT_CLUSTER, 'node_memory'),
            'skew_intervals': self.config.getint(configutil.SECT_COSTMODEL, 'time_intervals'),
            'address_size':   self.config.getint(configutil.SECT_COSTMODEL, 'address_size'),
            'window_size':    self.config.getint(configutil.SECT_COSTMODEL, 'window_size')
        }
        self.cm = CostModel(self.collections, self.workload, cmConfig)
#        if self.debug:
#            state.debug = True
#            costmodel.LOG.setLevel(logging.DEBUG)

        # Compute initial solution and calculate its cost
        # This will be the upper bound from starting design
        
        if not replay:
            initialDesign = InitialDesigner(self.collections, self.workload, self.config).generate()
            
            if init:
                print initialDesign.toJSON()
            #import pycallgraph
            #pycallgraph.start_trace()
            
            initialCost = self.cm.overallCost(initialDesign)
            
            #pycallgraph.make_dot_graph('d4.png')
            
            return initialCost, initialDesign
        else:
            self.cm.overallCost(replay_design)
            return None
    ## DEF
    
    def search(self, initialCost, initialDesign, worker_id):
        """
            Main search process starts here
        """
        lock = thread.allocate_lock()
        self.search_method = LNSDesigner(self.collections, self.designCandidates, self.workload, self.config, self.cm, initialDesign, initialCost, self.channel, lock, worker_id)
        self.search_method.start()
    ## DEF

## CLASS
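
Putting the class together, an end-to-end driver would presumably look like the sketch below. Every concrete value (connection parameters, database names, config and trace file paths, the args dict) is a placeholder for illustration, not something taken from these examples. Note that processMongoInput() reads flags such as stop_on_error off self, so setOptionsFromArguments() has to run first to copy them in.

# Hedged end-to-end sketch; all concrete names below are hypothetical.
import pymongo
from ConfigParser import SafeConfigParser

config = SafeConfigParser()
config.read("designer.config")                    # hypothetical config file

conn = pymongo.Connection("localhost", 27017)     # Python 2-era pymongo API
metadata_db = conn["metadata"]                    # hypothetical database names
dataset_db = conn["dataset"]

# Placeholder CLI options; the full set of converter flags is elided here
args = {"stop_on_error": False, "no_mongo_parse": False}

designer = Designer(config, metadata_db, dataset_db)
designer.setOptionsFromArguments(args)                 # copies flags onto self
designer.processMongoInput(open("mongosniff.trace"))   # hypothetical trace file
initialCost, initialDesign = designer.load()
designer.search(initialCost, initialDesign, worker_id=0)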