Example #1
    def __init__(self, collections, workload, config):
        assert isinstance(collections, dict)
        #        LOG.setLevel(logging.DEBUG)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        self.collections = collections
        self.col_names = [col_name for col_name in collections.iterkeys()]
        self.workload = None  # working workload
        self.originalWorload = workload  # points to the original workload

        self.weight_network = config.get('weight_network', 1.0)
        self.weight_disk = config.get('weight_disk', 1.0)
        self.weight_skew = config.get('weight_skew', 1.0)
        self.num_nodes = config.get('nodes', 1)

        # Convert MB to bytes
        self.max_memory = config['max_memory'] * 1024 * 1024
        self.skew_segments = config['skew_intervals']  # Why? "- 1"
        self.address_size = config['address_size'] / 4

        self.estimator = NodeEstimator(collections, self.num_nodes)

        self.window_size = config['window_size']

        # Build indexes from collections to sessions/operations
        # Note that this won't change dynamically based on denormalization schemes
        # It's up to the cost components to figure things out based on that
        self.restoreOriginalWorkload()

        # We need to know the number of operations in the original workload
        # so that all of our calculations are based on that
        self.orig_op_count = 0
        for sess in self.originalWorload:
            self.orig_op_count += len(sess["operations"])
        ## FOR

        ## ----------------------------------------------
        ## CACHING
        ## ----------------------------------------------
        self.cache_enable = True
        self.cache_miss_ctr = Histogram()
        self.cache_hit_ctr = Histogram()

        # ColName -> CacheHandle
        self.cache_handles = {}
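
The constructor above reads only a handful of keys from config and iterates workload as a list of sessions, each carrying an "operations" list. The sketch below is not part of the listed examples: the key and field names are taken from the lookups in the code, but the values and the session layout shown are purely illustrative.

# Illustrative only: key names mirror the config lookups in __init__ above;
# the values are made up for this sketch.
config = {
    'weight_network': 1.0,   # cost weights, default to 1.0 via config.get()
    'weight_disk': 1.0,
    'weight_skew': 1.0,
    'nodes': 4,              # number of nodes, defaults to 1
    'max_memory': 1024,      # in MB; __init__ converts this to bytes
    'skew_intervals': 10,
    'address_size': 64,
    'window_size': 500,
}

# The workload is a list of sessions; each session's "operations" list holds
# operations that at least name their collection, query_id, and query_hash.
workload = [
    {"operations": [
        {"collection": "users", "query_id": 1, "query_hash": 101},
    ]},
]
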
Example #2
    def __init__(self, collections, workload, config):
        assert isinstance(collections, dict)
        #        LOG.setLevel(logging.DEBUG)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        self.collections = collections
        self.col_names = [col_name for col_name in collections.iterkeys()]
        self.workload = None  # working workload
        self.originalWorload = workload  # points to the original workload

        self.weight_network = config.get("weight_network", 1.0)
        self.weight_disk = config.get("weight_disk", 1.0)
        self.weight_skew = config.get("weight_skew", 1.0)
        self.num_nodes = config.get("nodes", 1)

        # Convert MB to bytes
        self.max_memory = config["max_memory"] * 1024 * 1024
        self.skew_segments = config["skew_intervals"]  # Why? "- 1"
        self.address_size = config["address_size"] / 4

        self.estimator = NodeEstimator(collections, self.num_nodes)

        self.window_size = config["window_size"]

        # Build indexes from collections to sessions/operations
        # Note that this won't change dynamically based on denormalization schemes
        # It's up to the cost components to figure things out based on that
        self.restoreOriginalWorkload()

        # We need to know the number of operations in the original workload
        # so that all of our calculations are based on that
        self.orig_op_count = 0
        for sess in self.originalWorload:
            self.orig_op_count += len(sess["operations"])
        ## FOR

        ## ----------------------------------------------
        ## CACHING
        ## ----------------------------------------------
        self.cache_enable = True
        self.cache_miss_ctr = Histogram()
        self.cache_hit_ctr = Histogram()

        # ColName -> CacheHandle
        self.cache_handles = {}
Example #3
class State:
    """Cost Model State"""

    ## -----------------------------------------------------------------------
    ## INTERNAL CACHE STATE
    ## -----------------------------------------------------------------------

    class Cache:
        """
            Internal cache for a single collection.
            Note that this is different from the LRUBuffer cache stuff. These are
            cached look-ups that the CostModel uses for figuring out what operations do.
        """

        def __init__(self, col_info, num_nodes):

            # The number of pages needed to do a full scan of this collection
            # The worst case for all other operations is if we have to do
            # a full scan that requires us to evict the entire buffer
            # Hence, we multiply the max pages by two
            # self.fullscan_pages = (col_info['max_pages'] * 2)
            self.fullscan_pages = col_info["doc_count"] * 2
            assert self.fullscan_pages > 0, "Zero max_pages for collection '%s'" % col_info["name"]

            # Cache of Best Index Tuples
            # QueryHash -> BestIndex
            self.best_index = {}

            # Cache of Regex Operations
            # QueryHash -> Boolean
            self.op_regex = {}

            # Cache of Touched Node Ids
            # QueryId -> [NodeId]
            self.op_nodeIds = {}

            # Cache of Document Ids
            # QueryId -> Index/Collection DocumentIds
            self.collection_docIds = {}
            self.index_docIds = {}

        ## DEF

        def reset(self):
            self.best_index.clear()
            self.op_regex.clear()
            self.op_nodeIds.clear()
            self.collection_docIds.clear()
            self.index_docIds.clear()
            self.op_count = 0
            self.msg_count = 0
            self.network_reset = True

        ## DEF

        def __str__(self):
            ret = ""
            max_len = max(map(len, self.__dict__.iterkeys())) + 1
            f = "  %-" + str(max_len) + "s %s\n"
            for k, v in self.__dict__.iteritems():
                if isinstance(v, dict):
                    v_str = "[%d entries]" % len(v)
                else:
                    v_str = str(v)
                ret += f % (k + ":", v_str)
            return ret

        ## DEF

    ## CLASS

    def __init__(self, collections, workload, config):
        assert isinstance(collections, dict)
        #        LOG.setLevel(logging.DEBUG)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        self.collections = collections
        self.col_names = [col_name for col_name in collections.iterkeys()]
        self.workload = None  # working workload
        self.originalWorload = workload  # points to the original workload

        self.weight_network = config.get("weight_network", 1.0)
        self.weight_disk = config.get("weight_disk", 1.0)
        self.weight_skew = config.get("weight_skew", 1.0)
        self.max_num_nodes = config.get("nodes", 1)
        # Convert MB to bytes
        self.max_memory = config["max_memory"] * 1024 * 1024
        self.skew_segments = config["skew_intervals"]  # Why? "- 1"
        self.address_size = config["address_size"] / 4

        self.estimator = NodeEstimator(collections, self.max_num_nodes)

        self.window_size = config["window_size"]

        # Build indexes from collections to sessions/operations
        # Note that this won't change dynamically based on denormalization schemes
        # It's up to the cost components to figure things out based on that
        self.restoreOriginalWorkload()

        # We need to know the number of operations in the original workload
        # so that all of our calculations are based on that
        self.orig_op_count = 0
        for sess in self.originalWorload:
            self.orig_op_count += len(sess["operations"])
        ## FOR

        ## ----------------------------------------------
        ## CACHING
        ## ----------------------------------------------
        self.cache_enable = True
        self.cache_miss_ctr = Histogram()
        self.cache_hit_ctr = Histogram()

        # ColName -> CacheHandle
        self.cache_handles = {}

    ## DEF

    def init_xref(self, workload):
        """
            Initialize the cross-reference based on the current working workload
        """
        self.col_sess_xref = dict([(col_name, []) for col_name in self.col_names])
        self.col_op_xref = dict([(col_name, []) for col_name in self.col_names])
        self.__buildCrossReference__(workload)

    ## DEF

    def updateWorkload(self, workload):
        self.workload = workload
        self.init_xref(workload)

    ## DEF

    def restoreOriginalWorkload(self):
        self.workload = self.originalWorload
        self.init_xref(self.workload)

    ## DEF

    def __buildCrossReference__(self, workload):
        for sess in workload:
            cols = set()
            for op in sess["operations"]:
                col_name = op["collection"]
                if col_name in self.col_sess_xref:
                    self.col_op_xref[col_name].append(op)
                    cols.add(col_name)
            ## FOR (op)
            for col_name in cols:
                self.col_sess_xref[col_name].append(sess)
        ## FOR (sess)

    def invalidateCache(self, col_name):
        if col_name in self.cache_handles:
            if self.debug:
                LOG.debug("Invalidating cache for collection '%s'", col_name)
            self.cache_handles[col_name].reset()

    ## DEF

    def getCacheHandleByName(self, col_info):
        """
            Return a cache handle for the given collection name.
            This is the preferred method because it requires fewer hashes
        """
        cache = self.cache_handles.get(col_info["name"], None)
        if cache is None:
            cache = State.Cache(col_info, self.max_num_nodes)
            self.cache_handles[col_info["name"]] = cache
        return cache

    ## DEF

    def getCacheHandle(self, col_info):
        return self.getCacheHandleByName(col_info)

    ## DEF

    def reset(self):
        """
            Reset all of the internal state and cache information
        """
        # Clear out caches for all collections
        self.cache_handles.clear()
        self.estimator.reset()

    def calcNumNodes(self, design, maxCardinality):
        num_nodes = {}
        for col_name in self.collections.keys():
            num_nodes[col_name] = self.max_num_nodes
            if maxCardinality[col_name] is not None and design.hasCollection(col_name):
                cardinality = 1
                shard_keys = design.getShardKeys(col_name)
                if shard_keys is None or len(shard_keys) == 0:
                    continue
                for shard_key in shard_keys:
                    if (not self.collections[col_name]["fields"].has_key(shard_key)) or (
                        not self.collections[col_name]["fields"][shard_key].has_key("cardinality")
                    ):
                        continue
                    field_cardinality = self.collections[col_name]["fields"][shard_key]["cardinality"]
                    if field_cardinality > 0:
                        cardinality *= field_cardinality
                cardinality_ratio = maxCardinality[col_name] / float(cardinality)
                if cardinality_ratio == 1:
                    cardinality_ratio = 0
                elif cardinality_ratio < 2:
                    cardinality_ratio = 1
                else:
                    cardinality_ratio = int(math.ceil(math.log(cardinality_ratio, 2)))
                col_num_nodes = self.max_num_nodes - cardinality_ratio
                if col_num_nodes <= 0:
                    col_num_nodes = 1
                num_nodes[col_name] = col_num_nodes
        return num_nodes

    ## -----------------------------------------------------------------------
    ## UTILITY CODE
    ## -----------------------------------------------------------------------

    def __getIsOpRegex__(self, cache, op):
        isRegex = cache.op_regex.get(op["query_hash"], None)
        if isRegex is None:
            isRegex = workload.isOpRegex(op)
            if self.cache_enable:
                if self.debug:
                    self.cache_miss_ctr.put("op_regex")
                cache.op_regex[op["query_hash"]] = isRegex
        elif self.debug:
            self.cache_hit_ctr.put("op_regex")
        return isRegex

    ## DEF

    def __getNodeIds__(self, cache, design, op, num_nodes=None):
        node_ids = cache.op_nodeIds.get(op["query_id"], None)
        if node_ids is None:
            try:
                node_ids = self.estimator.estimateNodes(design, op, num_nodes)
            except:
                if self.debug:
                    LOG.error("Failed to estimate touched nodes for op #%d\n%s", op["query_id"], pformat(op))
                raise
            if self.cache_enable:
                if self.debug:
                    self.cache_miss_ctr.put("op_nodeIds")
                cache.op_nodeIds[op["query_id"]] = node_ids
            if self.debug:
                LOG.debug("Estimated Touched Nodes for Op #%d: %d", op["query_id"], len(node_ids))
        elif self.debug:
            self.cache_hit_ctr.put("op_nodeIds")
        return node_ids
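
To trace the arithmetic in calcNumNodes above: the product of the shard-key field cardinalities is compared against maxCardinality for the collection, and the node count is reduced by roughly log2 of that ratio, clamped to at least one node. The numbers below are made up; only the formula comes from the method itself.

import math

# Hypothetical inputs, chosen only to walk through calcNumNodes' calculation.
max_num_nodes = 8
max_cardinality = 1000000        # maxCardinality[col_name]
shard_key_cardinality = 1000     # product of the shard-key field cardinalities

cardinality_ratio = max_cardinality / float(shard_key_cardinality)      # 1000.0
if cardinality_ratio == 1:
    cardinality_ratio = 0
elif cardinality_ratio < 2:
    cardinality_ratio = 1
else:
    cardinality_ratio = int(math.ceil(math.log(cardinality_ratio, 2)))  # ceil(log2(1000)) = 10

col_num_nodes = max_num_nodes - cardinality_ratio    # 8 - 10 = -2
if col_num_nodes <= 0:
    col_num_nodes = 1    # a low-cardinality shard key pins the collection to a single node
print(col_num_nodes)     # -> 1
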
Example #4
class State():
    """Cost Model State"""

    ## -----------------------------------------------------------------------
    ## INTERNAL CACHE STATE
    ## -----------------------------------------------------------------------

    class Cache():
        """
            Internal cache for a single collection.
            Note that this is different from the LRUBuffer cache stuff. These are
            cached look-ups that the CostModel uses for figuring out what operations do.
        """
        def __init__(self, col_info, num_nodes):

            # The number of pages needed to do a full scan of this collection
            # The worst case for all other operations is if we have to do
            # a full scan that requires us to evict the entire buffer
            # Hence, we multiply the max pages by two
            # self.fullscan_pages = (col_info['max_pages'] * 2)
            self.fullscan_pages = col_info['doc_count'] * 2
            assert self.fullscan_pages > 0,\
                "Zero max_pages for collection '%s'" % col_info['name']

            # Cache of Best Index Tuples
            # QueryHash -> BestIndex
            self.best_index = {}

            # Cache of Regex Operations
            # QueryHash -> Boolean
            self.op_regex = {}

            # Cache of Touched Node Ids
            # QueryId -> [NodeId]
            self.op_nodeIds = {}

            # Cache of Document Ids
            # QueryId -> Index/Collection DocumentIds
            self.collection_docIds = {}
            self.index_docIds = {}

        ## DEF

        def reset(self):
            self.best_index.clear()
            self.op_regex.clear()
            self.op_nodeIds.clear()
            self.collection_docIds.clear()
            self.index_docIds.clear()
            self.op_count = 0
            self.msg_count = 0
            self.network_reset = True

        ## DEF

        def __str__(self):
            ret = ""
            max_len = max(map(len, self.__dict__.iterkeys())) + 1
            f = "  %-" + str(max_len) + "s %s\n"
            for k, v in self.__dict__.iteritems():
                if isinstance(v, dict):
                    v_str = "[%d entries]" % len(v)
                else:
                    v_str = str(v)
                ret += f % (k + ":", v_str)
            return ret

        ## DEF

    ## CLASS

    def __init__(self, collections, workload, config):
        assert isinstance(collections, dict)
        #        LOG.setLevel(logging.DEBUG)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        self.collections = collections
        self.col_names = [col_name for col_name in collections.iterkeys()]
        self.workload = None  # working workload
        self.originalWorload = workload  # points to the original workload

        self.weight_network = config.get('weight_network', 1.0)
        self.weight_disk = config.get('weight_disk', 1.0)
        self.weight_skew = config.get('weight_skew', 1.0)
        self.num_nodes = config.get('nodes', 1)

        # Convert MB to bytes
        self.max_memory = config['max_memory'] * 1024 * 1024
        self.skew_segments = config['skew_intervals']  # Why? "- 1"
        self.address_size = config['address_size'] / 4

        self.estimator = NodeEstimator(collections, self.num_nodes)

        self.window_size = config['window_size']

        # Build indexes from collections to sessions/operations
        # Note that this won't change dynamically based on denormalization schemes
        # It's up to the cost components to figure things out based on that
        self.restoreOriginalWorkload()

        # We need to know the number of operations in the original workload
        # so that all of our calculations are based on that
        self.orig_op_count = 0
        for sess in self.originalWorload:
            self.orig_op_count += len(sess["operations"])
        ## FOR

        ## ----------------------------------------------
        ## CACHING
        ## ----------------------------------------------
        self.cache_enable = True
        self.cache_miss_ctr = Histogram()
        self.cache_hit_ctr = Histogram()

        # ColName -> CacheHandle
        self.cache_handles = {}

    ## DEF

    def init_xref(self, workload):
        '''
            Initialize the cross-reference based on the current working workload
        '''
        self.col_sess_xref = dict([(col_name, [])
                                   for col_name in self.col_names])
        self.col_op_xref = dict([(col_name, [])
                                 for col_name in self.col_names])
        self.__buildCrossReference__(workload)

    ## DEF

    def updateWorkload(self, workload):
        self.workload = workload
        self.init_xref(workload)

    ## DEF

    def restoreOriginalWorkload(self):
        self.workload = self.originalWorload
        self.init_xref(self.workload)

    ## DEF

    def __buildCrossReference__(self, workload):
        for sess in workload:
            cols = set()
            for op in sess["operations"]:
                col_name = op["collection"]
                if col_name in self.col_sess_xref:
                    self.col_op_xref[col_name].append(op)
                    cols.add(col_name)
            ## FOR (op)
            for col_name in cols:
                self.col_sess_xref[col_name].append(sess)
        ## FOR (sess)

    def invalidateCache(self, col_name):
        if col_name in self.cache_handles:
            if self.debug:
                LOG.debug("Invalidating cache for collection '%s'", col_name)
            self.cache_handles[col_name].reset()

    ## DEF

    def getCacheHandleByName(self, col_info):
        """
            Return a cache handle for the given collection name.
            This is the preferred method because it requires fewer hashes
        """
        cache = self.cache_handles.get(col_info['name'], None)
        if cache is None:
            cache = State.Cache(col_info, self.num_nodes)
            self.cache_handles[col_info['name']] = cache
        return cache

    ## DEF

    def getCacheHandle(self, col_info):
        return self.getCacheHandleByName(col_info)

    ## DEF

    def reset(self):
        """
            Reset all of the internal state and cache information
        """
        # Clear out caches for all collections
        self.cache_handles.clear()
        self.estimator.reset()

    ## -----------------------------------------------------------------------
    ## UTILITY CODE
    ## -----------------------------------------------------------------------

    def __getIsOpRegex__(self, cache, op):
        isRegex = cache.op_regex.get(op["query_hash"], None)
        if isRegex is None:
            isRegex = workload.isOpRegex(op)
            if self.cache_enable:
                if self.debug: self.cache_miss_ctr.put("op_regex")
                cache.op_regex[op["query_hash"]] = isRegex
        elif self.debug:
            self.cache_hit_ctr.put("op_regex")
        return isRegex

    ## DEF

    def __getNodeIds__(self, cache, design, op):
        node_ids = cache.op_nodeIds.get(op['query_id'], None)
        if node_ids is None:
            try:
                node_ids = self.estimator.estimateNodes(design, op)
            except:
                if self.debug:
                    LOG.error(
                        "Failed to estimate touched nodes for op #%d\n%s",
                        op['query_id'], pformat(op))
                raise
            if self.cache_enable:
                if self.debug: self.cache_miss_ctr.put("op_nodeIds")
                cache.op_nodeIds[op['query_id']] = node_ids
            if self.debug:
                LOG.debug("Estimated Touched Nodes for Op #%d: %d",
                          op['query_id'], len(node_ids))
        elif self.debug:
            self.cache_hit_ctr.put("op_nodeIds")
        return node_ids

    ## DEF


## CLASS
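
Both versions of __getIsOpRegex__ and __getNodeIds__ follow the same memoization pattern: key the per-collection cache by the operation's query hash or query id, count hits and misses only in debug mode, and skip the cache write when caching is disabled. A stripped-down, stand-alone sketch of that pattern, using plain dicts and a Counter instead of the project's Cache and Histogram classes, could look like this:

from collections import Counter

class MemoizedLookup(object):
    """Generic stand-in for the cached per-operation lookups shown above."""

    def __init__(self, compute, cache_enable=True, debug=False):
        self.compute = compute        # the expensive function being memoized
        self.cache = {}               # key -> cached result (e.g. query_hash -> bool)
        self.cache_enable = cache_enable
        self.debug = debug
        self.hits = Counter()
        self.misses = Counter()

    def get(self, key, label):
        result = self.cache.get(key, None)
        if result is None:
            result = self.compute(key)
            if self.cache_enable:
                if self.debug:
                    self.misses[label] += 1
                self.cache[key] = result
        elif self.debug:
            self.hits[label] += 1
        return result

# Usage sketch: memoize a hypothetical "is this a regex query?" check keyed by query hash.
is_regex = MemoizedLookup(compute=lambda query_hash: query_hash % 2 == 0, debug=True)
is_regex.get(42, "op_regex")    # miss: computed and cached
is_regex.get(42, "op_regex")    # hit: served from the cache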