def splitWorkload(self):
    """Divide the workload up into segments for skew analysis"""
    start_time = None
    end_time = None
    for i in xrange(len(self.state.workload)):
        if start_time is None or start_time > self.state.workload[i]['start_time']:
            start_time = self.state.workload[i]['start_time']
        if end_time is None or end_time < self.state.workload[i]['end_time']:
            end_time = self.state.workload[i]['end_time']
    ## FOR
    assert start_time is not None, \
        "Failed to find start time in %d sessions" % len(self.state.workload)
    assert end_time is not None, \
        "Failed to find end time in %d sessions" % len(self.state.workload)
    if self.debug:
        LOG.debug("Workload Segments - START:%d / END:%d", start_time, end_time)

    self.workload_segments = [ [] for i in xrange(0, self.state.skew_segments) ]
    segment_h = Histogram()
    for sess in self.state.workload:
        idx = self.getSessionSegment(sess, start_time, end_time)
        segment_h.put(idx)
        assert 0 <= idx < self.state.skew_segments, \
            "Invalid workload segment '%d' for Session #%d\n%s" % (idx, sess['session_id'], segment_h)
        self.workload_segments[idx].append(sess)
    ## FOR
## DEF
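## NOTE: getSessionSegment() is referenced above but not defined in this
## file. The standalone sketch below shows one plausible way to map a
## session to a segment index, assuming sessions expose a 'start_time'
## field and the caller supplies the workload's global [start_time,
## end_time] range. The name and signature are hypothetical, not the
## project's actual API.
def getSessionSegmentSketch(sess, start_time, end_time, num_segments):
    """Map a session to a segment index in [0, num_segments)."""
    duration = end_time - start_time
    if duration <= 0:
        return 0
    offset = sess['start_time'] - start_time
    idx = int((offset / float(duration)) * num_segments)
    # Clamp so that the very last timestamp lands in the final segment
    return min(idx, num_segments - 1)
## DEF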
def print_stats(host, port, w_db, w_col):
    print ""
    LOG.info("..:: MongoDesigner Workload Info ::..")
    print ""

    # Establish the connection; initDB() also sets the global
    # workload_db / workload_col handles used below
    connection = initDB(host, port, w_db, w_col)

    LOG.info("=" * 50)
    session_cnt = workload_db[workload_col].find().count()
    LOG.info("Number of sessions: %d", session_cnt)
    LOG.info("Number of operations per session:")

    maxOpCnt = 0
    minOpCnt = sys.maxint
    vals = [ ]
    typeCnts = Histogram()
    for session in workload_db[workload_col].find():
        for op in session['operations']:
            typeCnts.put(op['type'])
        op_cnt = len(session['operations'])
        minOpCnt = min(op_cnt, minOpCnt)
        maxOpCnt = max(op_cnt, maxOpCnt)
        vals.append(op_cnt)
    ## FOR

    avgOpCnt = None
    if vals:
        # Average ops per session, formatted to two decimals. The division
        # must be parenthesized because '%' binds tighter than '/'
        avgOpCnt = "%.2f" % (float(sum(vals)) / float(len(vals)))
    LOG.info("%10s: %d" % ("min", minOpCnt))
    LOG.info("%10s: %d" % ("max", maxOpCnt))
    LOG.info("%10s: %s" % ("avg", avgOpCnt))

    LOG.info("Number of operations by type:")
    for opType in typeCnts.values():
        LOG.info("%10s: %d" % (opType, typeCnts[opType]))
    ## FOR
    LOG.info("=" * 50)
    return
## DEF
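## Histogram is the project's own utility class. Below is a minimal sketch
## of just the subset of its interface this file relies on (put / values /
## getSampleCount / getCounts / getValuesForCount / __getitem__), under the
## assumption that it is a thin wrapper around a value -> count dict and
## that values() returns the distinct values observed (the keys), which is
## how print_stats() above uses it. Illustrative only, not the real class.
class HistogramSketch(object):
    def __init__(self):
        self.data = { }

    def put(self, value, delta=1):
        """Increment the count for the given value."""
        self.data[value] = self.data.get(value, 0) + delta

    def values(self):
        """Return the distinct values seen so far."""
        return self.data.keys()

    def getSampleCount(self):
        """Total number of samples across all values."""
        return sum(self.data.values())

    def getCounts(self):
        """Return the distinct per-value counts."""
        return list(set(self.data.values()))

    def getValuesForCount(self, cnt):
        """Return every value that occurred exactly cnt times."""
        return [v for v, c in self.data.items() if c == cnt]

    def __getitem__(self, value):
        return self.data.get(value, 0)
## CLASS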
def calculateSessions(self):
    # Calculate outliers using the quartile method
    # http://en.wikipedia.org/wiki/Quartile#Computing_methods
    if self.debug:
        LOG.debug("Calculating time difference for operations in %d sessions" % len(self.sessOps))

    # Get the full list of all the time differences
    allDiffs = [ ]
    for clientOps in self.sessOps.values():
        allDiffs += [x[-1] for x in clientOps]
    allDiffs = sorted(allDiffs)
    numDiffs = len(allDiffs)
    #print "\n".join(map(str, allDiffs))

    # Lower + Upper Quartiles
    lowerQuartile, upperQuartile = mathutil.quartiles(allDiffs)
    if lowerQuartile is None or upperQuartile is None:
        LOG.warn("Null quartiles! Can't continue!")
        return
    # Interquartile Range
    iqr = (upperQuartile - lowerQuartile) * 1.5
    if self.debug:
        LOG.debug("Calculating stats for %d op pairs" % numDiffs)
        LOG.debug("  Lower Quartile: %s" % lowerQuartile)
        LOG.debug("  Upper Quartile: %s" % upperQuartile)
        LOG.debug("  IQR: %s" % iqr)

    # Go through the operations for each client and identify the
    # pairs of operations whose time difference is above the upper
    # quartile plus the IQR
    opHist = Histogram()
    prevOpHist = Histogram()
    nextOpHist = Histogram()
    threshold = upperQuartile + iqr
    for sessId, clientOps in self.sessOps.iteritems():
        for op0, op1, opDiff in clientOps:
            if opDiff >= threshold:
                prevOpHist.put(op0["query_hash"])
                nextOpHist.put(op1["query_hash"])
                opHist.put((op0["query_hash"], op1["query_hash"]))
        ## FOR
    ## FOR
    if self.debug:
        LOG.debug("Outlier Op Hashes:\n%s" % opHist)

    # Now compute the outliers again, this time over the counts of the
    # operation pairs whose time difference was an outlier. Here we skip
    # the IQR and cut at the upper quartile alone, since that gives the
    # right boundary hashes in practice.
    outlierCounts = sorted(opHist.getCounts())
    lowerQuartile, upperQuartile = mathutil.quartiles(outlierCounts)
    if self.debug:
        LOG.debug("Calculating stats for %d count outliers" % len(outlierCounts))
        LOG.debug("  Lower Quartile: %s" % lowerQuartile)
        LOG.debug("  Upper Quartile: %s" % upperQuartile)

    self.sessionBoundaries.clear()

    # If we're doing this randomly, we want each session to have roughly
    # the same number of operations as RANDOMIZE_TARGET
    if self.randomize:
        num_outliers = len(outlierCounts)
        # Pick up to 10% of the outliers at random, but never fewer than
        # one (randint would fail if the upper bound dropped to zero)
        upper_bound = max(1, int(num_outliers * 0.10))
        force = 1 if upper_bound == 1 else random.randint(1, upper_bound)
        LOG.warn("Forcing %d random outliers out of %d to be chosen from workload", force, num_outliers)
    else:
        force = 0

    for cnt in outlierCounts:
        if cnt >= upperQuartile or (self.randomize and force > 0):
            self.sessionBoundaries |= set(opHist.getValuesForCount(cnt))
            force -= 1
    ## FOR
    LOG.debug("Found %d outlier hashes" % len(self.sessionBoundaries))
## DEF
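## mathutil.quartiles() is not shown in this file. A minimal sketch of one
## standard quartile method from the Wikipedia article cited above (split
## the sorted data at the median, excluding the median itself when the
## count is odd, then take the median of each half). This assumes the
## input list is already sorted, as calculateSessions() guarantees.
def quartilesSketch(sortedVals):
    """Return (lowerQuartile, upperQuartile) of a sorted list, or (None, None)."""
    n = len(sortedVals)
    if n < 2:
        return (None, None)

    def median(vals):
        m = len(vals)
        mid = m // 2
        if m % 2 == 1:
            return vals[mid]
        return (vals[mid - 1] + vals[mid]) / 2.0

    half = n // 2
    lower = sortedVals[:half]            # values below the median
    upper = sortedVals[half + (n % 2):]  # values above the median
    return (median(lower), median(upper))
## DEF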
def sessionizeWorkload(self):
    """Split the Sessions based on the gap between operation times"""
    LOG.info("Sessionizing sample workload")
    s = Sessionizer(self.metadata_db, randomize=self.random_sessionizer)

    # First feed in all of the operations for each session
    nextSessId = -1
    origTotal = 0
    origHistogram = Histogram()
    sessions = [ ]
    for sess in self.metadata_db.Session.fetch():
        s.process(sess['session_id'], sess['operations'])
        nextSessId = max(nextSessId, sess['session_id'])
        origHistogram.put(len(sess['operations']))
        origTotal += len(sess['operations'])
        sessions.append(sess)
    ## FOR
    nextSessId += 1

    avg_ops = 0 if origHistogram.getSampleCount() == 0 else (origTotal / float(origHistogram.getSampleCount()))
    LOG.info("BEFORE Sessionization\n" +
             "  # of Sessions: %d\n" +
             "  Avg Ops per Session: %.2f\n" +
             "  Next Session Id: %d", \
             origHistogram.getSampleCount(), \
             avg_ops, nextSessId)

    # Then split them into separate sessions
    s.calculateSessions()
    newTotal = 0
    newHistogram = Histogram()

    # Iterate over a snapshot of the original sessions so that we don't
    # start processing the new sessions that we insert below
    for sess in sessions:
        newSessions = s.sessionize(sess, nextSessId)
        nextSessId += len(newSessions)

        # Add all of our new sessions and count their operations
        # so that we can see the change
        if self.debug:
            LOG.debug("Split Session %d [%d ops] into %d separate sessions", \
                      sess['session_id'], len(sess['operations']), len(newSessions))
        totalOps = 0
        for newSess in newSessions:
            try:
                newSess.save()
            except:
                LOG.error("Unexpected error when saving new Session\n%s", pformat(newSess))
                raise
            newOpCtr = len(newSess['operations'])
            totalOps += newOpCtr
            newHistogram.put(newOpCtr)
            if self.debug:
                LOG.debug("Session %d -> %d Ops" % (newSess['session_id'], newOpCtr))
        ## FOR

        # Make sure that all of our operations end up in a session
        assert len(sess['operations']) == totalOps, \
            "Expected %d operations, but new sessions only had %d" % (len(sess['operations']), totalOps)
        newTotal += totalOps

        # Remove the original session now that it has been split
        sess.delete()
    ## FOR

    # Report the post-split stats from the new histogram and total,
    # not the pre-split ones
    avg_ops = 0 if newHistogram.getSampleCount() == 0 else (newTotal / float(newHistogram.getSampleCount()))
    LOG.info("AFTER Sessionization\n" +
             "  # of Sessions: %d\n" +
             "  Avg Ops per Session: %.2f", \
             newHistogram.getSampleCount(), \
             avg_ops)
    if self.debug:
        LOG.debug("Ops per Session\n%s" % newHistogram)
    return
## DEF
## CLASS
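## Sessionizer.sessionize() is not shown here. A minimal sketch of the
## splitting step, assuming sessionBoundaries holds the (query_hash,
## query_hash) pairs collected in calculateSessions() that mark where one
## session ends and the next begins, and that a session is a dict with a
## 'session_id' and a list of 'operations'. The helper name and the
## plain-dict sessions are illustrative only.
def sessionizeSketch(sess, nextSessId, sessionBoundaries):
    """Split one session into new sessions at the boundary op pairs."""
    newSessions = [ ]
    currentOps = [ ]
    ops = sess['operations']
    for i, op in enumerate(ops):
        currentOps.append(op)
        # Cut the session when this op and the next one form a boundary pair
        if i + 1 < len(ops):
            pair = (op['query_hash'], ops[i + 1]['query_hash'])
            if pair in sessionBoundaries:
                newSessions.append({'session_id': nextSessId, 'operations': currentOps})
                nextSessId += 1
                currentOps = [ ]
    ## FOR
    # Whatever remains becomes the final session
    if currentOps:
        newSessions.append({'session_id': nextSessId, 'operations': currentOps})
    return newSessions
## DEF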