def splitWorkload(self):
        """Divide the workload up into segments for skew analysis"""

        # Determine the overall time range spanned by all sessions
        start_time = None
        end_time = None
        for sess in self.state.workload:
            if start_time is None or sess['start_time'] < start_time:
                start_time = sess['start_time']
            if end_time is None or sess['end_time'] > end_time:
                end_time = sess['end_time']
        ## FOR

        assert start_time is not None,\
            "Failed to find start time in %d sessions" % len(self.state.workload)
        assert end_time is not None,\
            "Failed to find end time in %d sessions" % len(self.state.workload)

        if self.debug:
            LOG.debug("Workload Segments - START:%d / END:%d", start_time, end_time)

        # Bucket every session into its time segment
        self.workload_segments = [[] for _ in xrange(self.state.skew_segments)]
        segment_h = Histogram()
        for sess in self.state.workload:
            idx = self.getSessionSegment(sess, start_time, end_time)
            segment_h.put(idx)
            assert 0 <= idx < self.state.skew_segments,\
                "Invalid workload segment '%d' for Session #%d\n%s" % (idx, sess['session_id'], segment_h)
            self.workload_segments[idx].append(sess)
        ## FOR
Exemple #2
0
def print_stats(host, port, w_db, w_col):
    print ""
    LOG.info("..:: MongoDesigner Workload Info ::..")
    print ""

    #start connection and set global variables...
    connection = initDB(host, port, w_db, w_col)
    
    LOG.info("="*50)
    
    session_cnt = workload_db[workload_col].find().count()
    LOG.info("Number of sessions: %d", session_cnt)
    LOG.info("Number of operations per session:")
    
    maxOpCnt = 0
    minOpCnt = sys.maxint
    vals = []
    
    typeCnts = Histogram()
    for session in workload_db[workload_col].find():
        for op in session['operations']:
            typeCnts.put(op['type'])
            
        op_cnt = len(session['operations'])
        minOpCnt = min(op_cnt, minOpCnt)
        maxOpCnt = max(op_cnt, maxOpCnt)
        vals.append(op_cnt)
    ## FOR
    avgOpCnt = None
    if vals:
        avgOpCnt = "%.2f" % float(sum(vals)) / float(len(vals))
        
    LOG.info("%10s: %d" % ("min", minOpCnt))
    LOG.info("%10s: %d" % ("max", maxOpCnt))
    LOG.info("%10s: %s" % ("avg", avgOpCnt))
    
    LOG.info("Number of operations by type:")
    for opType in typeCnts.values():
        LOG.info("%10s: %d" % (opType, typeCnts[opType]))
    ## FOR
    
    LOG.info("="*50)
    
    return
Exemple #3
0
def print_stats(host, port, w_db, w_col):
    print ""
    LOG.info("..:: MongoDesigner Workload Info ::..")
    print ""

    #start connection and set global variables...
    connection = initDB(host, port, w_db, w_col)

    LOG.info("=" * 50)

    session_cnt = workload_db[workload_col].find().count()
    LOG.info("Number of sessions: %d", session_cnt)
    LOG.info("Number of operations per session:")

    maxOpCnt = 0
    minOpCnt = sys.maxint
    vals = []

    typeCnts = Histogram()
    for session in workload_db[workload_col].find():
        for op in session['operations']:
            typeCnts.put(op['type'])

        op_cnt = len(session['operations'])
        minOpCnt = min(op_cnt, minOpCnt)
        maxOpCnt = max(op_cnt, maxOpCnt)
        vals.append(op_cnt)
    ## FOR
    avgOpCnt = None
    if vals:
        avgOpCnt = "%.2f" % float(sum(vals)) / float(len(vals))

    LOG.info("%10s: %d" % ("min", minOpCnt))
    LOG.info("%10s: %d" % ("max", maxOpCnt))
    LOG.info("%10s: %s" % ("avg", avgOpCnt))

    LOG.info("Number of operations by type:")
    for opType in typeCnts.values():
        LOG.info("%10s: %d" % (opType, typeCnts[opType]))
    ## FOR

    LOG.info("=" * 50)

    return
Exemple #4
0
    def calculateSessions(self):
        """Identify query-hash pairs that mark session boundaries.

        Runs the quartile/outlier method twice: first over the time gaps
        between consecutive operation pairs (keeping pairs above
        Q3 + 1.5*IQR), then over the occurrence counts of those outlier
        pairs (keeping counts >= the upper quartile, or randomly-forced
        picks when ``self.randomize`` is set).  The surviving
        (hash0, hash1) pairs are stored in ``self.sessionBoundaries``.
        """
        # Calculate outliers using the quartile method
        # http://en.wikipedia.org/wiki/Quartile#Computing_methods
        if self.debug:
            LOG.debug(
                "Calculating time difference for operations in %d sessions" %
                len(self.sessOps))

        # Get the full list of all the time differences
        # NOTE(review): the loop below unpacks clientOps entries as
        # (op0, op1, opDiff), so x[-1] is the time gap — confirm against
        # the producer of self.sessOps.
        allDiffs = []
        for clientOps in self.sessOps.values():
            allDiffs += [x[-1] for x in clientOps]
        allDiffs = sorted(allDiffs)
        numDiffs = len(allDiffs)  # currently unused beyond bookkeeping

        #print "\n".join(map(str, allDiffs))

        # Lower + Upper Quartiles
        lowerQuartile, upperQuartile = mathutil.quartiles(allDiffs)

        if lowerQuartile is None or upperQuartile is None:
            LOG.warn("Null quartiles! Can't continue!")
            return
        # Interquartile Range
        iqr = (upperQuartile - lowerQuartile) * 1.5

        if self.debug:
            LOG.debug("Calculating stats for %d op pairs" % len(allDiffs))
            LOG.debug("  Lower Quartile: %s" % lowerQuartile)
            LOG.debug("  Upper Quartile: %s" % upperQuartile)
            LOG.debug("  IQR: %s" % iqr)

        # Go through operations for each client and identify the
        # pairs of operations that are above the IQR in the upperQuartile
        opHist = Histogram()
        prevOpHist = Histogram()  # populated but not read back in this method
        nextOpHist = Histogram()  # populated but not read back in this method
        threshold = upperQuartile + iqr
        for sessId, clientOps in self.sessOps.iteritems():
            for op0, op1, opDiff in clientOps:
                if opDiff >= threshold:
                    prevOpHist.put(op0["query_hash"])
                    nextOpHist.put(op1["query_hash"])
                    opHist.put((op0["query_hash"], op1["query_hash"]))
            ## FOR
        ## FOR
        if self.debug:
            LOG.debug("Outlier Op Hashes:\n%s" % opHist)

        # I guess at this point we can just compute the outliers
        # again for the pairs of operations that have a time difference
        # outlier. We won't use the IQR. We'll just take the upper quartile
        # because that seems to give us the right answer
        outlierCounts = sorted(opHist.getCounts())
        lowerQuartile, upperQuartile = mathutil.quartiles(outlierCounts)

        if self.debug:
            LOG.debug("Calculating stats for %d count outliers" %
                      len(outlierCounts))
            LOG.debug("  Lower Quartile: %s" % lowerQuartile)
            LOG.debug("  Upper Quartile: %s" % upperQuartile)

        self.sessionBoundaries.clear()

        # If we're doing this randomly, we want each session to have roughly
        # the same number of operations as RANDOMIZE_TARGET
        if self.randomize:
            num_outliers = len(outlierCounts)
            force = 1 if int(num_outliers * 0.10) == 1 else random.randint(
                1, int(num_outliers * 0.10))
            LOG.warn(
                "Forcing %d random outliers out of %d to be chosen from workload",
                force, num_outliers)
        else:
            force = 0
        # 'force' acts as a countdown: each forced pick consumes one credit,
        # so at most 'force' extra boundary counts are admitted below Q3.
        for cnt in outlierCounts:
            if cnt >= upperQuartile or (self.randomize and force > 0):
                self.sessionBoundaries |= set(opHist.getValuesForCount(cnt))
                force -= 1
        ## FOR
        LOG.debug("Found %d outlier hashes" % len(self.sessionBoundaries))
Exemple #5
0
    def calculateSessions(self):
        """Identify query-hash pairs that mark session boundaries.

        Runs the quartile/outlier method twice: first over the time gaps
        between consecutive operation pairs (keeping pairs above
        Q3 + 1.5*IQR), then over the occurrence counts of those outlier
        pairs (keeping counts >= the upper quartile, or randomly-forced
        picks when ``self.randomize`` is set).  The surviving
        (hash0, hash1) pairs are stored in ``self.sessionBoundaries``.
        """
        # Calculate outliers using the quartile method
        # http://en.wikipedia.org/wiki/Quartile#Computing_methods
        if self.debug:
            LOG.debug("Calculating time difference for operations in %d sessions" % len(self.sessOps))
        
        # Get the full list of all the time differences
        # NOTE(review): the loop below unpacks clientOps entries as
        # (op0, op1, opDiff), so x[-1] is the time gap — confirm against
        # the producer of self.sessOps.
        allDiffs = [ ]
        for clientOps in self.sessOps.values():
            allDiffs += [x[-1] for x in clientOps]
        allDiffs = sorted(allDiffs)
        numDiffs = len(allDiffs)  # currently unused beyond bookkeeping

        #print "\n".join(map(str, allDiffs))
        
        # Lower + Upper Quartiles
        lowerQuartile, upperQuartile = mathutil.quartiles(allDiffs)
        
        if lowerQuartile is None or upperQuartile is None:
            LOG.warn("Null quartiles! Can't continue!")
            return
        # Interquartile Range
        iqr = (upperQuartile - lowerQuartile) * 1.5
        
        if self.debug:
            LOG.debug("Calculating stats for %d op pairs" % len(allDiffs))
            LOG.debug("  Lower Quartile: %s" % lowerQuartile)
            LOG.debug("  Upper Quartile: %s" % upperQuartile)
            LOG.debug("  IQR: %s" % iqr)
        
        # Go through operations for each client and identify the
        # pairs of operations that are above the IQR in the upperQuartile
        opHist = Histogram()
        prevOpHist = Histogram()  # populated but not read back in this method
        nextOpHist = Histogram()  # populated but not read back in this method
        threshold = upperQuartile + iqr
        for sessId, clientOps in self.sessOps.iteritems():
            for op0, op1, opDiff in clientOps:
                if opDiff >= threshold:
                    prevOpHist.put(op0["query_hash"])
                    nextOpHist.put(op1["query_hash"])
                    opHist.put((op0["query_hash"], op1["query_hash"]))
            ## FOR
        ## FOR
        if self.debug:
            LOG.debug("Outlier Op Hashes:\n%s" % opHist)
        
        # I guess at this point we can just compute the outliers
        # again for the pairs of operations that have a time difference
        # outlier. We won't use the IQR. We'll just take the upper quartile
        # because that seems to give us the right answer
        outlierCounts = sorted(opHist.getCounts())
        lowerQuartile, upperQuartile = mathutil.quartiles(outlierCounts)
        
        if self.debug:
            LOG.debug("Calculating stats for %d count outliers" % len(outlierCounts))
            LOG.debug("  Lower Quartile: %s" % lowerQuartile)
            LOG.debug("  Upper Quartile: %s" % upperQuartile)
        
        self.sessionBoundaries.clear()
        
        # If we're doing this randomly, we want each session to have roughly 
        # the same number of operations as RANDOMIZE_TARGET
        if self.randomize:
            num_outliers = len(outlierCounts)
            force = 1 if int(num_outliers*0.10) == 1 else random.randint(1, int(num_outliers*0.10))
            LOG.warn("Forcing %d random outliers out of %d to be chosen from workload", force, num_outliers)
        else:
            force = 0
        # 'force' acts as a countdown: each forced pick consumes one credit,
        # so at most 'force' extra boundary counts are admitted below Q3.
        for cnt in outlierCounts:
            if cnt >= upperQuartile or (self.randomize and force > 0):
                self.sessionBoundaries |= set(opHist.getValuesForCount(cnt))
                force -= 1
        ## FOR
        LOG.debug("Found %d outlier hashes" % len(self.sessionBoundaries))
    def sessionizeWorkload(self):
        """Split the Sessions based on the gap between operation times.

        Feeds every session's operations into a Sessionizer, computes the
        split boundaries, then replaces each original session in the
        metadata database with the new, smaller sessions.  Logs before/after
        session counts and average operations per session.
        """
        LOG.info("Sessionizing sample workload")
        
        s = Sessionizer(self.metadata_db, randomize=self.random_sessionizer)
        
        # We first feed in all of the operations in for each session
        nextSessId = -1
        origTotal = 0
        origHistogram = Histogram()
        sessions = [ ]
        for sess in self.metadata_db.Session.fetch():
            s.process(sess['session_id'], sess['operations'])
            nextSessId = max(nextSessId, sess['session_id'])
            origHistogram.put(len(sess['operations']))
            origTotal += len(sess['operations'])
            sessions.append(sess)
        ## FOR
        nextSessId += 1
        
        avg_ops = 0 if origHistogram.getSampleCount() == 0 else (origTotal / float(origHistogram.getSampleCount()))
        LOG.info("BEFORE Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f\n" +
                 "  Next Session Id: %d", \
                 origHistogram.getSampleCount(), \
                 avg_ops, nextSessId)
        
        # Then split them into separate sessions
        s.calculateSessions()
        newTotal = 0
        newHistogram = Histogram()

        # We have to do this because otherwise we will start to process
        # the new sessions that we just inserted... I know...
        for sess in sessions:
            newSessions = s.sessionize(sess, nextSessId)
            nextSessId += len(newSessions)
            
            # And then add all of our new sessions
            # Count the number of operations so that can see the change
            if self.debug:
                LOG.debug("Split Session %d [%d ops] into %d separate sessions", \
                          sess['session_id'], len(sess['operations']), len(newSessions))
            totalOps = 0
            for newSess in newSessions:
                try:
                    newSess.save()
                except:
                    LOG.error("Unexpected error when saving new Session\n%s", pformat(newSess))
                    raise
                newOpCtr = len(newSess['operations'])
                totalOps += newOpCtr
                newHistogram.put(newOpCtr)
                if self.debug:
                    LOG.debug("Session %d -> %d Ops" % (newSess['session_id'], newOpCtr))
            # Make sure that all of our operations end up in a session
            assert len(sess['operations']) == totalOps, \
                "Expected %d operations, but new sessions only had %d" % (len(sess['operations']), totalOps)
            newTotal += totalOps
            
            # Mark the original session as deletable
            # deletable.append(sess)
            sess.delete()
        ## FOR
        # BUG FIX: the AFTER average was previously recomputed from
        # origTotal/origHistogram, so it just repeated the BEFORE value.
        # Use the post-split counters so the log reflects the new sessions.
        avg_ops = 0 if newHistogram.getSampleCount() == 0 else (newTotal / float(newHistogram.getSampleCount()))
        LOG.info("AFTER Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f", \
                 newHistogram.getSampleCount(), \
                 avg_ops)
        if self.debug:
            LOG.debug("Ops per Session\n%s" % newHistogram)
            
        return
    ## DEF
## CLASS
    def sessionizeWorkload(self):
        """Split the Sessions based on the gap between operation times.

        Feeds every session's operations into a Sessionizer, computes the
        split boundaries, then replaces each original session in the
        metadata database with the new, smaller sessions.  Logs before/after
        session counts and average operations per session.
        """
        LOG.info("Sessionizing sample workload")

        s = Sessionizer(self.metadata_db, randomize=self.random_sessionizer)

        # We first feed in all of the operations in for each session
        nextSessId = -1
        origTotal = 0
        origHistogram = Histogram()
        sessions = []
        for sess in self.metadata_db.Session.fetch():
            s.process(sess['session_id'], sess['operations'])
            nextSessId = max(nextSessId, sess['session_id'])
            origHistogram.put(len(sess['operations']))
            origTotal += len(sess['operations'])
            sessions.append(sess)
        ## FOR
        nextSessId += 1

        avg_ops = 0 if origHistogram.getSampleCount() == 0 else (
            origTotal / float(origHistogram.getSampleCount()))
        LOG.info("BEFORE Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f\n" +
                 "  Next Session Id: %d", \
                 origHistogram.getSampleCount(), \
                 avg_ops, nextSessId)

        # Then split them into separate sessions
        s.calculateSessions()
        newTotal = 0
        newHistogram = Histogram()

        # We have to do this because otherwise we will start to process
        # the new sessions that we just inserted... I know...
        for sess in sessions:
            newSessions = s.sessionize(sess, nextSessId)
            nextSessId += len(newSessions)

            # And then add all of our new sessions
            # Count the number of operations so that can see the change
            if self.debug:
                LOG.debug("Split Session %d [%d ops] into %d separate sessions", \
                          sess['session_id'], len(sess['operations']), len(newSessions))
            totalOps = 0
            for newSess in newSessions:
                try:
                    newSess.save()
                except:
                    LOG.error("Unexpected error when saving new Session\n%s",
                              pformat(newSess))
                    raise
                newOpCtr = len(newSess['operations'])
                totalOps += newOpCtr
                newHistogram.put(newOpCtr)
                if self.debug:
                    LOG.debug("Session %d -> %d Ops" %
                              (newSess['session_id'], newOpCtr))
            # Make sure that all of our operations end up in a session
            assert len(sess['operations']) == totalOps, \
                "Expected %d operations, but new sessions only had %d" % (len(sess['operations']), totalOps)
            newTotal += totalOps

            # Mark the original session as deletable
            # deletable.append(sess)
            sess.delete()
        ## FOR
        # BUG FIX: the AFTER average was previously recomputed from
        # origTotal/origHistogram, so it just repeated the BEFORE value.
        # Use the post-split counters so the log reflects the new sessions.
        avg_ops = 0 if newHistogram.getSampleCount() == 0 else (
            newTotal / float(newHistogram.getSampleCount()))
        LOG.info("AFTER Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f", \
                 newHistogram.getSampleCount(), \
                 avg_ops)
        if self.debug:
            LOG.debug("Ops per Session\n%s" % newHistogram)

        return

    ## DEF


## CLASS