    def testPickle(self):
        h = Histogram()
        letters = [x for x in string.letters] + ["-"]

        for i in xrange(0, 100):
            key = ""
            for x in xrange(0, 10):
                key += random.choice(letters)
            assert len(key) > 0

            h.put(key, delta=random.randint(1, 10))
            assert h[key] > 0
        ## FOR

        # Serialize
        import pickle

        p = pickle.dumps(h, -1)
        assert p

        # Deserialize
        clone = pickle.loads(p)
        assert clone

        for key in h.keys():
            self.assertEquals(h[key], clone[key])
        ## FOR
        self.assertEquals(h.getSampleCount(), clone.getSampleCount())
        self.assertEquals(sorted(h.getMinCountKeys()), sorted(clone.getMinCountKeys()))
Example #2
    def testPickle(self):
        h = Histogram()
        letters = [x for x in string.letters] + ["-"]

        for i in xrange(0, 100):
            key = ""
            for x in xrange(0, 10):
                key += random.choice(letters)
            assert len(key) > 0

            h.put(key, delta=random.randint(1, 10))
            assert h[key] > 0
        ## FOR

        # Serialize
        import pickle
        p = pickle.dumps(h, -1)
        assert p

        # Deserialize
        clone = pickle.loads(p)
        assert clone

        for key in h.keys():
            self.assertEquals(h[key], clone[key])
        ## FOR
        self.assertEquals(h.getSampleCount(), clone.getSampleCount())
        self.assertEquals(sorted(h.getMinCountKeys()),
                          sorted(clone.getMinCountKeys()))
Example #3
def computeInStats(query, h=None):
    for k, v in query.iteritems():
        if k == "#in":
            if h is None: h = Histogram()
            h.put(len(v))
        elif isinstance(v, list):
            for inner in v:
                if isinstance(inner, dict):
                    h = computeInStats(inner, h)
        elif isinstance(v, dict):
            h = computeInStats(v, h)
    return h
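
Below is a minimal usage sketch for computeInStats; the nested query shape and field names are hypothetical, and only the Histogram API shown in these examples (put, getSampleCount) is assumed.

# Hypothetical query with two "#in" clauses, one nested inside a list
query = {"status": {"#in": ["A", "B", "C"]},
         "extra": [{"tag": {"#in": [1, 2]}}]}
h = computeInStats(query)
# h now holds one sample for an "#in" list of length 3 and one of length 2,
# so h.getSampleCount() would return 2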
Example #4
def computeInStats(query, h=None):
    for k,v in query.iteritems():
        if k == "#in":
            if h is None: h = Histogram()
            h.put(len(v))
        elif isinstance(v, list):
            for inner in v:
                if isinstance(inner, dict):
                    h = computeInStats(inner, h)
        elif isinstance(v, dict):
            h = computeInStats(v, h)
    return h
Example #5
    def fixInvalidCollections(self):
        searchKey = {
            "operations.collection": constants.INVALID_COLLECTION_MARKER
        }
        for session in self.metadata_db.Session.find(searchKey):
            dirty = False
            for op in session["operations"]:
                if op["collection"] != constants.INVALID_COLLECTION_MARKER:
                    continue

                if self.debug:
                    LOG.debug("Attempting to fix corrupted Operation:\n%s" %
                              pformat(op))

                # For each field referenced in the query, build a histogram of
                # which collections have a field with the same name
                fields = workload.getReferencedFields(op)
                h = Histogram()
                for c in self.metadata_db.Collection.find():
                    for f in c['fields']:
                        if f in fields:
                            h.put(c['name'])
                    ## FOR
                ## FOR

                matches = h.getMaxCountKeys()
                if len(matches) == 0:
                    LOG.warn(
                        "No matching collection was found for corrupted operation\n%s"
                        % pformat(op))
                    continue
                elif len(matches) > 1:
                    LOG.warn(
                        "More than one matching collection was found for corrupted operation %s\n%s"
                        % (matches, pformat(op)))
                    continue
                else:
                    op["collection"] = matches[0]
                    dirty = True
                    self.fix_ctr += 1
                    LOG.info("Fix corrupted collection in operation\n%s" %
                             pformat(op))
                ## IF
            ## FOR (operations)

            if dirty: session.save()
Example #6
    def fixInvalidCollections(self):
        searchKey = {"operations.collection": constants.INVALID_COLLECTION_MARKER}
        for session in self.metadata_db.Session.find(searchKey):
            dirty = False
            for op in session["operations"]:
                if op["collection"] != constants.INVALID_COLLECTION_MARKER:
                    continue

                if self.debug:
                    LOG.debug("Attempting to fix corrupted Operation:\n%s" % pformat(op))

                # For each field referenced in the query, build a histogram of
                # which collections have a field with the same name
                fields = workload.getReferencedFields(op)
                h = Histogram()
                for c in self.metadata_db.Collection.find():
                    for f in c["fields"]:
                        if f in fields:
                            h.put(c["name"])
                    ## FOR
                ## FOR

                matches = h.getMaxCountKeys()
                if len(matches) == 0:
                    LOG.warn("No matching collection was found for corrupted operation\n%s" % pformat(op))
                    continue
                elif len(matches) > 1:
                    LOG.warn(
                        "More than one matching collection was found for corrupted operation %s\n%s"
                        % (matches, pformat(op))
                    )
                    continue
                else:
                    op["collection"] = matches[0]
                    dirty = True
                    self.fix_ctr += 1
                    LOG.info("Fix corrupted collection in operation\n%s" % pformat(op))
                ## IF
            ## FOR (operations)

            if dirty:
                session.save()
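
A stripped-down sketch of the matching heuristic above, using a hypothetical in-memory catalog instead of metadata_db; only the Histogram API used throughout these examples is assumed.

referenced = set(["user_id", "created"])          # fields referenced by the op
catalog = [{"name": "users",  "fields": ["user_id", "name", "created"]},
           {"name": "orders", "fields": ["order_id", "user_id"]}]

h = Histogram()
for col in catalog:
    for f in col["fields"]:
        if f in referenced:
            h.put(col["name"])
matches = h.getMaxCountKeys()                     # ["users"] -> unambiguous fix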
Example #7
class State:
    """Cost Model State"""

    ## -----------------------------------------------------------------------
    ## INTERNAL CACHE STATE
    ## -----------------------------------------------------------------------

    class Cache:
        """
            Internal cache for a single collection.
            Note that this is different than the LRUBuffer cache stuff. These are
            cached look-ups that the CostModel uses for figuring out what operations do.
        """

        def __init__(self, col_info, num_nodes):

            # The number of pages needed to do a full scan of this collection
            # The worst case for all other operations is if we have to do
            # a full scan that requires us to evict the entire buffer
            # Hence, we multiply the max pages by two
            # self.fullscan_pages = (col_info['max_pages'] * 2)
            self.fullscan_pages = col_info["doc_count"] * 2
            assert self.fullscan_pages > 0, "Zero max_pages for collection '%s'" % col_info["name"]

            # Cache of Best Index Tuples
            # QueryHash -> BestIndex
            self.best_index = {}

            # Cache of Regex Operations
            # QueryHash -> Boolean
            self.op_regex = {}

            # Cache of Touched Node Ids
            # QueryId -> [NodeId]
            self.op_nodeIds = {}

            # Cache of Document Ids
            # QueryId -> Index/Collection DocumentIds
            self.collection_docIds = {}
            self.index_docIds = {}

        ## DEF

        def reset(self):
            self.best_index.clear()
            self.op_regex.clear()
            self.op_nodeIds.clear()
            self.collection_docIds.clear()
            self.index_docIds.clear()
            self.op_count = 0
            self.msg_count = 0
            self.network_reset = True

        ## DEF

        def __str__(self):
            ret = ""
            max_len = max(map(len, self.__dict__.iterkeys())) + 1
            f = "  %-" + str(max_len) + "s %s\n"
            for k, v in self.__dict__.iteritems():
                if isinstance(v, dict):
                    v_str = "[%d entries]" % len(v)
                else:
                    v_str = str(v)
                ret += f % (k + ":", v_str)
            return ret

        ## DEF

    ## CLASS

    def __init__(self, collections, workload, config):
        assert isinstance(collections, dict)
        #        LOG.setLevel(logging.DEBUG)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        self.collections = collections
        self.col_names = [col_name for col_name in collections.iterkeys()]
        self.workload = None  # working workload
        self.originalWorload = workload  # points to the original workload

        self.weight_network = config.get("weight_network", 1.0)
        self.weight_disk = config.get("weight_disk", 1.0)
        self.weight_skew = config.get("weight_skew", 1.0)
        self.max_num_nodes = config.get("nodes", 1)
        # Convert MB to bytes
        self.max_memory = config["max_memory"] * 1024 * 1024
        self.skew_segments = config["skew_intervals"]  # Why? "- 1"
        self.address_size = config["address_size"] / 4

        self.estimator = NodeEstimator(collections, self.max_num_nodes)

        self.window_size = config["window_size"]

        # Build indexes from collections to sessions/operations
        # Note that this won't change dynamically based on denormalization schemes
        # It's up to the cost components to figure things out based on that
        self.restoreOriginalWorkload()

        # We need to know the number of operations in the original workload
        # so that all of our calculations are based on that
        self.orig_op_count = 0
        for sess in self.originalWorload:
            self.orig_op_count += len(sess["operations"])
        ## FOR

        ## ----------------------------------------------
        ## CACHING
        ## ----------------------------------------------
        self.cache_enable = True
        self.cache_miss_ctr = Histogram()
        self.cache_hit_ctr = Histogram()

        # ColName -> CacheHandle
        self.cache_handles = {}

    ## DEF

    def init_xref(self, workload):
        """
            initialize the cross reference based on the current working workload
        """
        self.col_sess_xref = dict([(col_name, []) for col_name in self.col_names])
        self.col_op_xref = dict([(col_name, []) for col_name in self.col_names])
        self.__buildCrossReference__(workload)

    ## DEF

    def updateWorkload(self, workload):
        self.workload = workload
        self.init_xref(workload)

    ## DEF

    def restoreOriginalWorkload(self):
        self.workload = self.originalWorload
        self.init_xref(self.workload)

    ## DEF

    def __buildCrossReference__(self, workload):
        for sess in workload:
            cols = set()
            for op in sess["operations"]:
                col_name = op["collection"]
                if col_name in self.col_sess_xref:
                    self.col_op_xref[col_name].append(op)
                    cols.add(col_name)
            ## FOR (op)
            for col_name in cols:
                self.col_sess_xref[col_name].append(sess)
        ## FOR (sess)

    def invalidateCache(self, col_name):
        if col_name in self.cache_handles:
            if self.debug:
                LOG.debug("Invalidating cache for collection '%s'", col_name)
            self.cache_handles[col_name].reset()

    ## DEF

    def getCacheHandleByName(self, col_info):
        """
            Return a cache handle for the given collection name.
            This is the preferred method because it requires fewer hashes
        """
        cache = self.cache_handles.get(col_info["name"], None)
        if cache is None:
            cache = State.Cache(col_info, self.max_num_nodes)
            self.cache_handles[col_info["name"]] = cache
        return cache

    ## DEF

    def getCacheHandle(self, col_info):
        return self.getCacheHandleByName(col_info)

    ## DEF

    def reset(self):
        """
            Reset all of the internal state and cache information
        """
        # Clear out caches for all collections
        self.cache_handles.clear()
        self.estimator.reset()

    def calcNumNodes(self, design, maxCardinality):
        num_nodes = {}
        for col_name in self.collections.keys():
            num_nodes[col_name] = self.max_num_nodes
            if maxCardinality[col_name] is not None and design.hasCollection(col_name):
                cardinality = 1
                shard_keys = design.getShardKeys(col_name)
                if shard_keys is None or len(shard_keys) == 0:
                    continue
                for shard_key in shard_keys:
                    if (not self.collections[col_name]["fields"].has_key(shard_key)) or (
                        not self.collections[col_name]["fields"][shard_key].has_key("cardinality")
                    ):
                        continue
                    field_cardinality = self.collections[col_name]["fields"][shard_key]["cardinality"]
                    if field_cardinality > 0:
                        cardinality *= field_cardinality
                cardinality_ratio = maxCardinality[col_name] / float(cardinality)
                if cardinality_ratio == 1:
                    cardinality_ratio = 0
                elif cardinality_ratio < 2:
                    cardinality_ratio = 1
                else:
                    cardinality_ratio = int(math.ceil(math.log(cardinality_ratio, 2)))
                col_num_nodes = self.max_num_nodes - cardinality_ratio
                if col_num_nodes <= 0:
                    col_num_nodes = 1
                num_nodes[col_name] = col_num_nodes
        return num_nodes
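
    # A worked sketch of the cardinality_ratio arithmetic above (hypothetical
    # numbers, not part of the original class): if maxCardinality[col_name] is
    # 1024 and the shard keys' combined field cardinality is 64, then
    # cardinality_ratio = 1024 / 64.0 = 16.0, the ratio is reduced to
    # int(math.ceil(math.log(16.0, 2))) = 4, and col_num_nodes becomes
    # self.max_num_nodes - 4 (floored at 1).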

    ## -----------------------------------------------------------------------
    ## UTILITY CODE
    ## -----------------------------------------------------------------------

    def __getIsOpRegex__(self, cache, op):
        isRegex = cache.op_regex.get(op["query_hash"], None)
        if isRegex is None:
            isRegex = workload.isOpRegex(op)
            if self.cache_enable:
                if self.debug:
                    self.cache_miss_ctr.put("op_regex")
                cache.op_regex[op["query_hash"]] = isRegex
        elif self.debug:
            self.cache_hit_ctr.put("op_regex")
        return isRegex

    ## DEF

    def __getNodeIds__(self, cache, design, op, num_nodes=None):
        node_ids = cache.op_nodeIds.get(op["query_id"], None)
        if node_ids is None:
            try:
                node_ids = self.estimator.estimateNodes(design, op, num_nodes)
            except:
                if self.debug:
                    LOG.error("Failed to estimate touched nodes for op #%d\n%s", op["query_id"], pformat(op))
                raise
            if self.cache_enable:
                if self.debug:
                    self.cache_miss_ctr.put("op_nodeIds")
                cache.op_nodeIds[op["query_id"]] = node_ids
            if self.debug:
                LOG.debug("Estimated Touched Nodes for Op #%d: %d", op["query_id"], len(node_ids))
        elif self.debug:
            self.cache_hit_ctr.put("op_nodeIds")
        return node_ids
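
The two helpers above follow the same cache-aside pattern: look the value up by query hash/id, compute and store it on a miss, and count hits and misses when debugging. A generic sketch of that pattern with plain dicts (names are hypothetical):

def cached_lookup(cache, key, compute, hits, misses):
    value = cache.get(key, None)
    if value is None:
        value = compute(key)            # expensive path, e.g. estimateNodes()
        cache[key] = value
        misses[key] = misses.get(key, 0) + 1
    else:
        hits[key] = hits.get(key, 0) + 1
    return value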
Example #8
    colls = dict()
    for col_info in metadata_db.Collection.fetch({"workload_queries": {"$gt": 0}}):
        # Skip any collection that doesn't have any documents in it
        if not col_info['doc_count'] or not col_info['avg_doc_size']:
            continue
        colls[col_info['name']] = col_info
    if not colls:
        raise Exception("No collections were found in metadata catalog")

    for sess in metadata_db.Session.fetch():
        for op in sess["operations"]:
            QUERY_COUNTS.put(op["query_hash"])
            if not op["query_hash"] in QUERY_HASH_XREF:
                QUERY_HASH_XREF[op["query_hash"]] = []
            QUERY_HASH_XREF[op["query_hash"]].append(op)
            QUERY_COLLECTION_COUNTS.put(op["collection"])
        ## FOR
    ## FOR

    LOG.info("Toal # of Unique Queries: %d", len(QUERY_COUNTS.values()))
    TOTAL_DB_SIZE = sum(
        [col_info["data_size"] for col_info in colls.itervalues()])
    LOG.debug("Estimated Total Database Size: %d" % TOTAL_DB_SIZE)
    TOTAL_QUERY_COUNT = QUERY_COLLECTION_COUNTS.getSampleCount()
    LOG.debug("Total # of Queries: %d" % TOTAL_QUERY_COUNT)

    # HACK: Fix collections
Example #9
    def hash(self, op):
        """Compute a deterministic signature for the given operation based on its keys"""
        
        fields = None
        updateFields = None
        
        # QUERY
        if op["type"] == constants.OP_TYPE_QUERY:
            # The query field has our where clause
            if not "#query" in op["query_content"][0]:
                msg = "Missing query field in query_content for operation #%d" % op["query_id"]
                if self.debug: LOG.warn(pformat(op))
                raise Exception(msg)

            fields = op["query_content"][0][constants.REPLACE_KEY_DOLLAR_PREFIX + "query"]

        # UPDATE
        elif op["type"] == constants.OP_TYPE_UPDATE:
            # The first element in the content field is the WHERE clause
            fields = op["query_content"][0]
            
            # We use a separate field for the updated columns so that they are
            # hashed separately from the WHERE clause fields
            updateFields = op['query_content'][1]

        # INSERT
        elif op["type"] == constants.OP_TYPE_INSERT:
            # They could be inserting more than one document here,
            # which all may have different fields...
            # So we will need to build a histogram for which keys are referenced
            # and use the ones that appear the most
            # XXX: We'll only consider keys at the first level
            h = Histogram()
            for doc in op["query_content"]:
                assert type(doc) == dict, "Unexpected insert value:\n%s" % pformat(doc)
                for k in doc.keys():
                    h.put(k)
            ## FOR
            if LOG.isEnabledFor(logging.DEBUG):
                LOG.debug("Insert '%s' Keys Histogram:\n%s" % (op["collection"], h))
            maxKeys = h.getMaxCountKeys()
            assert len(maxKeys) > 0, \
                "No keys were found in %d insert documents?" % len(op["query_content"])
            
            fields = { }
            for doc in op["query_content"]:
                for k, v in doc.iteritems():
                    if k in maxKeys:
                        fields[k] = v
                ## FOR
            ## FOR
            
        # DELETE
        elif op["type"] == constants.OP_TYPE_DELETE:
            # The first element in the content field is the WHERE clause
            fields = op["query_content"][0]
        # UNKNOWN!
        else:
            raise Exception("Unexpected query type: %s" % op["type"])
        
        # Extract the list of fields that are used
        try:
            fieldsHash = self.computeFieldsHash(fields)
        except:
            LOG.error("Unexpected error when processing operation %d [fields=%s]" % (op["query_id"], str(fields)))
            raise
        updateHash = self.computeFieldsHash(updateFields) if updateFields else None
        
        t = (op["collection"], op["type"], fieldsHash, updateHash)
        h = long(hash(t))
        LOG.debug("%s %s => HASH:%d" % (fields, t, h))
        self.histogram.put(h)
        return h
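
A self-contained sketch of the signature idea above: operations on the same collection, of the same type, touching the same set of keys collapse to one value. simple_signature is a hypothetical stand-in for the class's computeFieldsHash-based logic.

def simple_signature(collection, op_type, where_keys, update_keys=None):
    fields_hash = hash(tuple(sorted(where_keys)))
    update_hash = hash(tuple(sorted(update_keys))) if update_keys else None
    return hash((collection, op_type, fields_hash, update_hash))

# Key order in the predicate does not change the signature
assert simple_signature("users", "query", ["age", "name"]) == \
       simple_signature("users", "query", ["name", "age"])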
Example #10
class State():
    """Cost Model State"""

    ## -----------------------------------------------------------------------
    ## INTERNAL CACHE STATE
    ## -----------------------------------------------------------------------

    class Cache():
        """
            Internal cache for a single collection.
            Note that this is different than the LRUBuffer cache stuff. These are
            cached look-ups that the CostModel uses for figuring out what operations do.
        """
        def __init__(self, col_info, num_nodes):

            # The number of pages needed to do a full scan of this collection
            # The worst case for all other operations is if we have to do
            # a full scan that requires us to evict the entire buffer
            # Hence, we multiply the max pages by two
            # self.fullscan_pages = (col_info['max_pages'] * 2)
            self.fullscan_pages = col_info['doc_count'] * 2
            assert self.fullscan_pages > 0,\
                "Zero max_pages for collection '%s'" % col_info['name']

            # Cache of Best Index Tuples
            # QueryHash -> BestIndex
            self.best_index = {}

            # Cache of Regex Operations
            # QueryHash -> Boolean
            self.op_regex = {}

            # Cache of Touched Node Ids
            # QueryId -> [NodeId]
            self.op_nodeIds = {}

            # Cache of Document Ids
            # QueryId -> Index/Collection DocumentIds
            self.collection_docIds = {}
            self.index_docIds = {}

        ## DEF

        def reset(self):
            self.best_index.clear()
            self.op_regex.clear()
            self.op_nodeIds.clear()
            self.collection_docIds.clear()
            self.index_docIds.clear()
            self.op_count = 0
            self.msg_count = 0
            self.network_reset = True

        ## DEF

        def __str__(self):
            ret = ""
            max_len = max(map(len, self.__dict__.iterkeys())) + 1
            f = "  %-" + str(max_len) + "s %s\n"
            for k, v in self.__dict__.iteritems():
                if isinstance(v, dict):
                    v_str = "[%d entries]" % len(v)
                else:
                    v_str = str(v)
                ret += f % (k + ":", v_str)
            return ret

        ## DEF

    ## CLASS

    def __init__(self, collections, workload, config):
        assert isinstance(collections, dict)
        #        LOG.setLevel(logging.DEBUG)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        self.collections = collections
        self.col_names = [col_name for col_name in collections.iterkeys()]
        self.workload = None  # working workload
        self.originalWorload = workload  # points to the original workload

        self.weight_network = config.get('weight_network', 1.0)
        self.weight_disk = config.get('weight_disk', 1.0)
        self.weight_skew = config.get('weight_skew', 1.0)
        self.num_nodes = config.get('nodes', 1)

        # Convert MB to bytes
        self.max_memory = config['max_memory'] * 1024 * 1024
        self.skew_segments = config['skew_intervals']  # Why? "- 1"
        self.address_size = config['address_size'] / 4

        self.estimator = NodeEstimator(collections, self.num_nodes)

        self.window_size = config['window_size']

        # Build indexes from collections to sessions/operations
        # Note that this won't change dynamically based on denormalization schemes
        # It's up to the cost components to figure things out based on that
        self.restoreOriginalWorkload()

        # We need to know the number of operations in the original workload
        # so that all of our calculations are based on that
        self.orig_op_count = 0
        for sess in self.originalWorload:
            self.orig_op_count += len(sess["operations"])
        ## FOR

        ## ----------------------------------------------
        ## CACHING
        ## ----------------------------------------------
        self.cache_enable = True
        self.cache_miss_ctr = Histogram()
        self.cache_hit_ctr = Histogram()

        # ColName -> CacheHandle
        self.cache_handles = {}

    ## DEF

    def init_xref(self, workload):
        '''
            initialize the cross reference based on the current working workload
        '''
        self.col_sess_xref = dict([(col_name, [])
                                   for col_name in self.col_names])
        self.col_op_xref = dict([(col_name, [])
                                 for col_name in self.col_names])
        self.__buildCrossReference__(workload)

    ## DEF

    def updateWorkload(self, workload):
        self.workload = workload
        self.init_xref(workload)

    ## DEF

    def restoreOriginalWorkload(self):
        self.workload = self.originalWorload
        self.init_xref(self.workload)

    ## DEF

    def __buildCrossReference__(self, workload):
        for sess in workload:
            cols = set()
            for op in sess["operations"]:
                col_name = op["collection"]
                if col_name in self.col_sess_xref:
                    self.col_op_xref[col_name].append(op)
                    cols.add(col_name)
            ## FOR (op)
            for col_name in cols:
                self.col_sess_xref[col_name].append(sess)
        ## FOR (sess)

    def invalidateCache(self, col_name):
        if col_name in self.cache_handles:
            if self.debug:
                LOG.debug("Invalidating cache for collection '%s'", col_name)
            self.cache_handles[col_name].reset()

    ## DEF

    def getCacheHandleByName(self, col_info):
        """
            Return a cache handle for the given collection name.
            This is the preferred method because it requires fewer hashes
        """
        cache = self.cache_handles.get(col_info['name'], None)
        if cache is None:
            cache = State.Cache(col_info, self.num_nodes)
            self.cache_handles[col_info['name']] = cache
        return cache

    ## DEF

    def getCacheHandle(self, col_info):
        return self.getCacheHandleByName(col_info)

    ## DEF

    def reset(self):
        """
            Reset all of the internal state and cache information
        """
        # Clear out caches for all collections
        self.cache_handles.clear()
        self.estimator.reset()

    ## -----------------------------------------------------------------------
    ## UTILITY CODE
    ## -----------------------------------------------------------------------

    def __getIsOpRegex__(self, cache, op):
        isRegex = cache.op_regex.get(op["query_hash"], None)
        if isRegex is None:
            isRegex = workload.isOpRegex(op)
            if self.cache_enable:
                if self.debug: self.cache_miss_ctr.put("op_regex")
                cache.op_regex[op["query_hash"]] = isRegex
        elif self.debug:
            self.cache_hit_ctr.put("op_regex")
        return isRegex

    ## DEF

    def __getNodeIds__(self, cache, design, op):
        node_ids = cache.op_nodeIds.get(op['query_id'], None)
        if node_ids is None:
            try:
                node_ids = self.estimator.estimateNodes(design, op)
            except:
                if self.debug:
                    LOG.error(
                        "Failed to estimate touched nodes for op #%d\n%s",
                        op['query_id'], pformat(op))
                raise
            if self.cache_enable:
                if self.debug: self.cache_miss_ctr.put("op_nodeIds")
                cache.op_nodeIds[op['query_id']] = node_ids
            if self.debug:
                LOG.debug("Estimated Touched Nodes for Op #%d: %d",
                          op['query_id'], len(node_ids))
        elif self.debug:
            self.cache_hit_ctr.put("op_nodeIds")
        return node_ids

    ## DEF


## CLASS
Example #11
class Results:
    
    def __init__(self, config=None):
        self.start = None
        self.stop = None
        self.txn_id = 0
        self.opCount = 0
        self.completed = [ ] # (txnName, timestamp, latencies)
        self.txn_counters = Histogram()
        self.txn_times = { }
        self.running = { }
        self.config = config
        
    def startBenchmark(self):
        """Mark the benchmark as having been started"""
        assert self.start == None
        LOG.debug("Starting benchmark statistics collection")
        self.start = time.time()
        return self.start
        
    def stopBenchmark(self):
        """Mark the benchmark as having been stopped"""
        assert self.start != None
        assert self.stop == None
        LOG.debug("Stopping benchmark statistics collection")
        self.stop = time.time()
        
    def startTransaction(self, txn):
        self.txn_id += 1
        id = self.txn_id
        self.running[id] = (txn, time.time())
        return id
        
    def abortTransaction(self, id):
        """Abort a transaction and discard its times"""
        assert id in self.running
        txn_name, txn_start = self.running[id]
        del self.running[id]
        
    def stopTransaction(self, id, opCount, latencies=[]):
        """Record that the benchmark completed an invocation of the given transaction"""
        assert id in self.running
        
        timestamp = time.time()
        
        txn_name, txn_start = self.running[id]
        del self.running[id]
        self.completed.append((txn_name, timestamp, latencies))
        
        duration = timestamp - txn_start
        total_time = self.txn_times.get(txn_name, 0)
        self.txn_times[txn_name] = total_time + duration
        
        # OpCount
        if opCount is not None:
            self.opCount += opCount
        else: 
            LOG.debug("ithappens")
            
        
        # Txn Counter Histogram
        self.txn_counters.put(txn_name)
        assert self.txn_counters[txn_name] > 0
        
        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Completed %s in %f sec" % (txn_name, duration))
    ## DEF

    @staticmethod
    def show_table(title, headers, table, line_width):
        cols_width = [len(header) for header in headers]
        for row in table:
            row_width = 0
            for i in range(len(headers)):
                if len(row[i]) > cols_width[i]:
                    cols_width[i] = len(row[i])
                row_width += cols_width[i]
            row_width += 4 * (len(headers) - 1)
            if row_width > line_width:
                line_width = row_width
        output = ("%s\n" % ("=" * line_width))
        output += ("%s\n" % title)
        output += ("%s\n" % ("-" * line_width))
        for i in range(len(headers)):
            header = headers[i]
            output += ("%s%s" % (header, " " * (cols_width[i] - len(header))))
            if i != len(headers) - 1:
                output += " " * 4
        output += "\n"
        for row in table:
            for i in range(len(headers)):
                cell = row[i]
                output += ("%s%s" % (cell, " " * (cols_width[i] - len(cell))))
                if i != len(headers) - 1:
                    output += " " * 4
            output += "\n"
        output += ("%s\n" % ("-" * line_width))
        return output, line_width

    def show_latencies(self, line_width):
        latencies = []
        output = ""
        for txn_stats in self.completed:
            latencies.extend(txn_stats[2])
        if len(latencies) > 0:
            latencies = sorted(latencies, key=itemgetter(0))
            percents = [0.1, 0.2, 0.5, 0.8, 0.9, 0.999]
            latency_table = []
            slowest_ops = []
            for percent in percents:
                index = int(math.floor(percent * len(latencies)))
                percent_str = "%0.1f%%" % (percent * 100)
                millis_sec_str = "%0.4f" % (latencies[index][0])
                latency_table.append((percent_str, millis_sec_str))
            latency_headers = ["Queries(%)", "Latency(ms)"]
            output, line_width = \
                Results.show_table("Latency Report", latency_headers, latency_table, line_width)
            if self.config is not None and self.config["default"]["slow_ops_num"] > 0:
                num_ops = self.config["default"]["slow_ops_num"]
                slowest_ops_headers = ["#", "Latency(ms)", "Session Id", "Operation Id", "Type", "Collection", "Predicates"]
                for i in range(num_ops):
                    if i < len(latencies):
                        slowest_ops.append([
                            "%d" % i,
                            "%0.4f" % (latencies[len(latencies) - i - 1][0]),
                            str(latencies[len(latencies) - i - 1][1]),
                            str(latencies[len(latencies) - i - 1][2]),
                            latencies[len(latencies) - i - 1][3],
                            latencies[len(latencies) - i - 1][4],
                            json.dumps(latencies[len(latencies) - i - 1][5])
                        ])
                slowest_ops_output, line_width = \
                    Results.show_table("Top %d Slowest Operations" % num_ops, slowest_ops_headers, slowest_ops, line_width)
                output += ("\n%s" % slowest_ops_output)
        return output

    def append(self, r):  
        self.opCount += r.opCount
        for txn_name in r.txn_counters.keys():
            self.txn_counters.put(txn_name, delta=r.txn_counters[txn_name])
            
            orig_time = self.txn_times.get(txn_name, 0)
            self.txn_times[txn_name] = orig_time + r.txn_times[txn_name]
            
            #LOG.info("resOps="+str(r.opCount))
            #LOG.debug("%s [cnt=%d, time=%d]" % (txn_name, self.txn_counters[txn_name], self.txn_times[txn_name]))
        ## HACK
        if type(r.completed) == list:
            self.completed.extend(r.completed)
        if not self.start:
            self.start = r.start
        else:
            self.start = min(self.start, r.start)
        if not self.stop:
            self.stop = r.stop
        else:
            self.stop = max(self.stop, r.stop)
    ## DEF
            
    def __str__(self):
        return self.show()
        
    def show(self, load_time = None):
        if self.start == None:
            msg = "Attempting to get benchmark results before it was started"
            LOG.warn(msg)
            raise Exception(msg)
        if self.stop == None:
            duration = time.time() - self.start
        else:
            duration = self.stop - self.start
        
        col_width = 18
        total_width = (col_width*4)+2
        f = "\n  " + (("%-" + str(col_width) + "s")*4)
        line = "-"*total_width

        ret = u"" + "="*total_width + "\n"
        if load_time != None:
            ret += "Data Loading Time: %d seconds\n\n" % (load_time)
        
        ret += "Execution Results after %d seconds\n%s" % (duration, line)
        ret += f % ("", "Executed", u"Total Time (ms)", "Rate") 
        total_time = duration
        total_cnt = self.txn_counters.getSampleCount()
        #total_running_time = 0
        
        for txn in sorted(self.txn_counters.keys()):
            txn_time = self.txn_times[txn]
            txn_cnt = "%6d - %4.1f%%" % (self.txn_counters[txn], (self.txn_counters[txn] / float(total_cnt))*100)
            rate = u"%.02f txn/s" % ((self.txn_counters[txn] / total_time))
            #total_running_time +=txn_time
            #rate = u"%.02f op/s" % ((self.txn_counters[txn] / total_time))
            rate = u"%.02f op/s" % ((self.opCount / total_time))
            ret += f % (txn, txn_cnt, str(txn_time * 1000), rate)
            
            #LOG.info("totalOps="+str(self.totalOps))
            # total_time += txn_time
        ret += "\n" + ("-"*total_width)
        
        rate = 0
        if total_time > 0:
            rate = total_cnt / float(total_time)
            # TXN RATE rate = total_cnt / float(total_time)
        #total_rate = "%.02f txn/s" % rate
        total_rate = "%.02f op/s" % rate
        #total_rate = str(rate)
        ret += f % ("TOTAL", str(total_cnt), str(total_time*1000), total_rate)

        return ("%s\n%s" % (ret, self.show_latencies(total_width))).encode('utf-8')
Example #12
class Results:
    def __init__(self):
        self.start = None
        self.stop = None
        self.txn_id = 0
        self.opCount = 0
        self.completed = []  # (txnName, timestamp)
        self.txn_counters = Histogram()
        self.txn_times = {}
        self.running = {}

    def startBenchmark(self):
        """Mark the benchmark as having been started"""
        assert self.start == None
        LOG.debug("Starting benchmark statistics collection")
        self.start = time.time()
        return self.start

    def stopBenchmark(self):
        """Mark the benchmark as having been stopped"""
        assert self.start != None
        assert self.stop == None
        LOG.debug("Stopping benchmark statistics collection")
        self.stop = time.time()

    def startTransaction(self, txn):
        self.txn_id += 1
        id = self.txn_id
        self.running[id] = (txn, time.time())
        return id

    def abortTransaction(self, id):
        """Abort a transaction and discard its times"""
        assert id in self.running
        txn_name, txn_start = self.running[id]
        del self.running[id]

    def stopTransaction(self, id, opCount):
        """Record that the benchmark completed an invocation of the given transaction"""
        assert id in self.running

        timestamp = time.time()

        txn_name, txn_start = self.running[id]
        del self.running[id]
        self.completed.append((txn_name, timestamp))

        duration = timestamp - txn_start
        total_time = self.txn_times.get(txn_name, 0)
        self.txn_times[txn_name] = total_time + duration

        # OpCount
        if opCount is not None:
            self.opCount += opCount
        else:
            LOG.debug("ithappens")

        # Txn Counter Histogram
        self.txn_counters.put(txn_name)
        assert self.txn_counters[txn_name] > 0

        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Completed %s in %f sec" % (txn_name, duration))

    ## DEF

    def append(self, r):
        self.opCount += r.opCount
        for txn_name in r.txn_counters.keys():
            self.txn_counters.put(txn_name, delta=r.txn_counters[txn_name])

            orig_time = self.txn_times.get(txn_name, 0)
            self.txn_times[txn_name] = orig_time + r.txn_times[txn_name]

            #LOG.info("resOps="+str(r.opCount))
            #LOG.debug("%s [cnt=%d, time=%d]" % (txn_name, self.txn_counters[txn_name], self.txn_times[txn_name]))
        ## HACK
        if type(r.completed) == list:
            self.completed.extend(r.completed)
        if not self.start:
            self.start = r.start
        else:
            self.start = min(self.start, r.start)
        if not self.stop:
            self.stop = r.stop
        else:
            self.stop = max(self.stop, r.stop)

    ## DEF

    def __str__(self):
        return self.show()

    def show(self, load_time=None):
        if self.start == None:
            msg = "Attempting to get benchmark results before it was started"
            LOG.warn(msg)
            raise Exception(msg)
        if self.stop == None:
            duration = time.time() - self.start
        else:
            duration = self.stop - self.start

        col_width = 18
        total_width = (col_width * 4) + 2
        f = "\n  " + (("%-" + str(col_width) + "s") * 4)
        line = "-" * total_width

        ret = u"" + "=" * total_width + "\n"
        if load_time != None:
            ret += "Data Loading Time: %d seconds\n\n" % (load_time)

        ret += "Execution Results after %d seconds\n%s" % (duration, line)
        ret += f % ("", "Executed", u"Total Time (ms)", "Rate")
        total_time = duration
        total_cnt = self.txn_counters.getSampleCount()
        #total_running_time = 0

        for txn in sorted(self.txn_counters.keys()):
            txn_time = self.txn_times[txn]
            txn_cnt = "%6d - %4.1f%%" % (
                self.txn_counters[txn],
                (self.txn_counters[txn] / float(total_cnt)) * 100)
            rate = u"%.02f txn/s" % ((self.txn_counters[txn] / total_time))
            #total_running_time +=txn_time
            #rate = u"%.02f op/s" % ((self.txn_counters[txn] / total_time))
            #rate = u"%.02f op/s" % ((self.opCount / total_time))
            ret += f % (txn, txn_cnt, str(txn_time * 1000), rate)

            #LOG.info("totalOps="+str(self.totalOps))
            # total_time += txn_time
        ret += "\n" + ("-" * total_width)

        rate = 0
        if total_time > 0:
            rate = total_cnt / float(total_time)
            # TXN RATE rate = total_cnt / float(total_time)
        #total_rate = "%.02f txn/s" % rate
        total_rate = "%.02f op/s" % rate
        #total_rate = str(rate)
        ret += f % ("TOTAL", str(total_cnt), str(
            total_time * 1000), total_rate)

        return (ret.encode('utf-8'))
Example #13
class Results:
    
    def __init__(self):
        self.start = None
        self.stop = None
        self.txn_id = 0
        self.opCount = 0
        self.completed = [ ] # (txnName, timestamp)
        self.txn_counters = Histogram()
        self.txn_times = { }
        self.running = { }
        
    def startBenchmark(self):
        """Mark the benchmark as having been started"""
        assert self.start == None
        LOG.debug("Starting benchmark statistics collection")
        self.start = time.time()
        return self.start
        
    def stopBenchmark(self):
        """Mark the benchmark as having been stopped"""
        assert self.start != None
        assert self.stop == None
        LOG.debug("Stopping benchmark statistics collection")
        self.stop = time.time()
        
    def startTransaction(self, txn):
        self.txn_id += 1
        id = self.txn_id
        self.running[id] = (txn, time.time())
        return id
        
    def abortTransaction(self, id):
        """Abort a transaction and discard its times"""
        assert id in self.running
        txn_name, txn_start = self.running[id]
        del self.running[id]
        
    def stopTransaction(self, id, opCount):
        """Record that the benchmark completed an invocation of the given transaction"""
        assert id in self.running
        
        timestamp = time.time()
        
        txn_name, txn_start = self.running[id]
        del self.running[id]
        self.completed.append((txn_name, timestamp))
        
        duration = timestamp - txn_start
        total_time = self.txn_times.get(txn_name, 0)
        self.txn_times[txn_name] = total_time + duration
        
        # OpCount
        if opCount is not None:
            self.opCount += opCount
        else: 
            LOG.debug("ithappens")
            
        
        # Txn Counter Histogram
        self.txn_counters.put(txn_name)
        assert self.txn_counters[txn_name] > 0
        
        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Completed %s in %f sec" % (txn_name, duration))
    ## DEF
        
    def append(self, r):  
        self.opCount += r.opCount
        for txn_name in r.txn_counters.keys():
            self.txn_counters.put(txn_name, delta=r.txn_counters[txn_name])
            
            orig_time = self.txn_times.get(txn_name, 0)
            self.txn_times[txn_name] = orig_time + r.txn_times[txn_name]
            
            #LOG.info("resOps="+str(r.opCount))
            #LOG.debug("%s [cnt=%d, time=%d]" % (txn_name, self.txn_counters[txn_name], self.txn_times[txn_name]))
        ## HACK
        if type(r.completed) == list:
            self.completed.extend(r.completed)
        if not self.start:
            self.start = r.start
        else:
            self.start = min(self.start, r.start)
        if not self.stop:
            self.stop = r.stop
        else:
            self.stop = max(self.stop, r.stop)
    ## DEF
            
    def __str__(self):
        return self.show()
        
    def show(self, load_time = None):
        if self.start == None:
            msg = "Attempting to get benchmark results before it was started"
            LOG.warn(msg)
            raise Exception(msg)
        if self.stop == None:
            duration = time.time() - self.start
        else:
            duration = self.stop - self.start
        
        col_width = 18
        total_width = (col_width*4)+2
        f = "\n  " + (("%-" + str(col_width) + "s")*4)
        line = "-"*total_width

        ret = u"" + "="*total_width + "\n"
        if load_time != None:
            ret += "Data Loading Time: %d seconds\n\n" % (load_time)
        
        ret += "Execution Results after %d seconds\n%s" % (duration, line)
        ret += f % ("", "Executed", u"Total Time (ms)", "Rate") 
        total_time = duration
        total_cnt = self.txn_counters.getSampleCount()
        #total_running_time = 0
        
        for txn in sorted(self.txn_counters.keys()):
            txn_time = self.txn_times[txn]
            txn_cnt = "%6d - %4.1f%%" % (self.txn_counters[txn], (self.txn_counters[txn] / float(total_cnt))*100)
            rate = u"%.02f txn/s" % ((self.txn_counters[txn] / total_time))
            #total_running_time +=txn_time
            #rate = u"%.02f op/s" % ((self.txn_counters[txn] / total_time))
            #rate = u"%.02f op/s" % ((self.opCount / total_time))
            ret += f % (txn, txn_cnt, str(txn_time * 1000), rate)
            
            #LOG.info("totalOps="+str(self.totalOps))
            # total_time += txn_time
        ret += "\n" + ("-"*total_width)
        
        rate = 0
        if total_time > 0:
            rate = total_cnt / float(total_time)
            # TXN RATE rate = total_cnt / float(total_time)
        #total_rate = "%.02f txn/s" % rate
        total_rate = "%.02f op/s" % rate
        #total_rate = str(rate)
        ret += f % ("TOTAL", str(total_cnt), str(total_time*1000), total_rate)

        return (ret.encode('utf-8'))
Example #14
    ## ----------------------------------------------
    metadata_db = conn[config.get(configutil.SECT_MONGODB, 'metadata_db')]
    dataset_db = conn[config.get(configutil.SECT_MONGODB, 'dataset_db')]

    colls = dict()
    for col_info in metadata_db.Collection.fetch({"workload_queries": {"$gt": 0}}):
        # Skip any collection that doesn't have any documents in it
        if not col_info['doc_count'] or not col_info['avg_doc_size']:
            continue
        colls[col_info['name']] = col_info
    if not colls:
        raise Exception("No collections were found in metadata catalog")
    
    for sess in metadata_db.Session.fetch():
        for op in sess["operations"]:
            QUERY_COUNTS.put(op["query_hash"])
            if not op["query_hash"] in QUERY_HASH_XREF:
                QUERY_HASH_XREF[op["query_hash"]] = [ ]
            QUERY_HASH_XREF[op["query_hash"]].append(op)
            QUERY_COLLECTION_COUNTS.put(op["collection"])
        ## FOR
    ## FOR

    LOG.info("Toal # of Unique Queries: %d", len(QUERY_COUNTS.values()))
    TOTAL_DB_SIZE = sum([col_info["data_size"] for col_info in colls.itervalues()])
    LOG.debug("Estimated Total Database Size: %d" % TOTAL_DB_SIZE)
    TOTAL_QUERY_COUNT = QUERY_COLLECTION_COUNTS.getSampleCount()
    LOG.debug("Total # of Queries: %d" % TOTAL_QUERY_COUNT)
    
    # HACK: Fix collections
    for col_name, col_info in colls.iteritems():