def splitWorkload(self):
    """Divide the workload up into segments for skew analysis"""
    # Find the overall time range covered by the workload by scanning the
    # start/end timestamps of every session.
    # (Idiom fixes vs. original: direct iteration instead of xrange-index
    # loop, and `is not None` instead of `not ... is None`.)
    start_time = None
    end_time = None
    for sess in self.state.workload:
        if start_time is None or start_time > sess['start_time']:
            start_time = sess['start_time']
        if end_time is None or end_time < sess['end_time']:
            end_time = sess['end_time']
    ## FOR
    assert start_time is not None, \
        "Failed to find start time in %d sessions" % len(self.state.workload)
    assert end_time is not None, \
        "Failed to find end time in %d sessions" % len(self.state.workload)
    if self.debug:
        LOG.debug("Workload Segments - START:%d / END:%d", start_time, end_time)

    # Bucket every session into one of skew_segments equal time intervals.
    # segment_h is only used for the diagnostic message in the assertion.
    self.workload_segments = [[] for _ in xrange(self.state.skew_segments)]
    segment_h = Histogram()
    for sess in self.state.workload:
        idx = self.getSessionSegment(sess, start_time, end_time)
        segment_h.put(idx)
        assert 0 <= idx < self.state.skew_segments, \
            "Invalid workload segment '%d' for Session #%d\n%s" % (idx, sess['session_id'], segment_h)
        self.workload_segments[idx].append(sess)
    ## FOR
def __init__(self, state):
    """Set up this cost component and pre-segment the workload."""
    AbstractCostComponent.__init__(self, state)
    self.debug = LOG.isEnabledFor(logging.DEBUG)

    # Histogram tracking the number of times each node was accessed
    self.nodeCounts = Histogram()
    # Per-interval session lists, filled in by splitWorkload()
    self.workload_segments = []

    # Split the workload into separate time intervals up front so that
    # the skew calculation can reuse the segmentation.
    self.splitWorkload()
def plot(self):
    """Plot the per-rank load as a bar chart with the ideal balanced load
    drawn as a reference line, and save the figure as a PDF."""
    reneg_bins = self.renegotiate()
    self.ranks_data = flatten(self.ranks_data)

    # Reference histogram, rebalanced down to the final bin count
    ref_hist = Histogram(data=self.ranks_produced_flattened,
                         nbins=self.num_pivots_sent)
    ref_hist.rebalance(self.num_bins_final)

    # Actual observed load, bucketed by the renegotiated bin edges
    cur_hist = Histogram(data=self.ranks_data,
                         bin_edges=reneg_bins.bin_edges)

    fig, ax = plt.subplots()
    ax.bar(range(32), cur_hist.hist)

    # Horizontal marker at the perfectly balanced per-rank load
    # (32 ranks assumed here — matches the bar chart above)
    mean_load = len(self.ranks_data) / 32
    ax.plot([-1, 32], [mean_load, mean_load], color='orange', linewidth=1)
    ax.text(21, mean_load * 1.05, 'Ideal (balanced) load', color='#c04e01')

    ax.set_xlabel("Rank ID")
    ax.set_ylabel("Load")
    plt.tight_layout()
    plt.savefig("../vis/ASCR/naive_lb_2.pdf")
def generateCollectionHistograms(self):
    """Build, per collection, a histogram over every combination of the
    fields referenced by that collection's operations."""
    col_keys = dict((col_name, Histogram()) for col_name in self.collections)
    for sess in self.workload:
        for op in sess["operations"]:
            # Skip internal command operations
            if op["collection"].find("$cmd") != -1:
                continue
            if not op["collection"] in col_keys:
                LOG.warn("Missing: " + op["collection"])
                continue
            fields = workload.getReferencedFields(op)
            h = col_keys[op["collection"]]
            # Record every non-empty combination of the referenced fields
            for size in xrange(1, len(fields) + 1):
                for combo in itertools.combinations(fields, size):
                    h.put(combo)
            ## FOR
        ## FOR (op)
    ## FOR (sess)
    return (col_keys)
def print_stats(host, port, w_db, w_col): print "" LOG.info("..:: MongoDesigner Workload Info ::..") print "" #start connection and set global variables... connection = initDB(host, port, w_db, w_col) LOG.info("=" * 50) session_cnt = workload_db[workload_col].find().count() LOG.info("Number of sessions: %d", session_cnt) LOG.info("Number of operations per session:") maxOpCnt = 0 minOpCnt = sys.maxint vals = [] typeCnts = Histogram() for session in workload_db[workload_col].find(): for op in session['operations']: typeCnts.put(op['type']) op_cnt = len(session['operations']) minOpCnt = min(op_cnt, minOpCnt) maxOpCnt = max(op_cnt, maxOpCnt) vals.append(op_cnt) ## FOR avgOpCnt = None if vals: avgOpCnt = "%.2f" % float(sum(vals)) / float(len(vals)) LOG.info("%10s: %d" % ("min", minOpCnt)) LOG.info("%10s: %d" % ("max", maxOpCnt)) LOG.info("%10s: %s" % ("avg", avgOpCnt)) LOG.info("Number of operations by type:") for opType in typeCnts.values(): LOG.info("%10s: %d" % (opType, typeCnts[opType])) ## FOR LOG.info("=" * 50) return
def gen_hist(percent_particles):
    """Build the module-level 'accurate_hist' from a prefix of all_data.

    percent_particles: fraction (0..1) of all_data used for the histogram.
    """
    total = len(all_data)
    prefix_len = int(total * percent_particles)
    print("Generating hist for %s, %s" % (total * percent_particles, percent_particles))

    # Fine-grained 512-bin histogram over the selected data prefix
    subset = all_data[:prefix_len]
    data_hist = Histogram(subset, 512)

    # Rebalance down to 32 bins and publish the result via the global
    global accurate_hist
    accurate_hist, mass_per_bin = data_hist._rebalance(32)

    # Release the large intermediate histogram
    del data_hist
def calculateSessions(self):
    """Identify session-boundary operation pairs via quartile-based outliers.

    First pass: find operation pairs whose time gap is an outlier
    (above upperQuartile + 1.5*IQR) over all clients' op-pair gaps.
    Second pass: among those pairs, keep the hashes whose occurrence
    count is at or above the upper quartile (optionally forcing a few
    random extras when self.randomize is set) and store them in
    self.sessionBoundaries.
    """
    # Calculate outliers using the quartile method
    # http://en.wikipedia.org/wiki/Quartile#Computing_methods
    if self.debug:
        LOG.debug("Calculating time difference for operations in %d sessions" % len(self.sessOps))

    # Get the full list of all the time differences
    # NOTE(review): each clientOps entry looks like (op0, op1, diff) with the
    # time difference last — hence x[-1]. Confirm against the producer.
    allDiffs = []
    for clientOps in self.sessOps.values():
        allDiffs += [x[-1] for x in clientOps]
    allDiffs = sorted(allDiffs)
    numDiffs = len(allDiffs)  # NOTE(review): numDiffs is never used below
    #print "\n".join(map(str, allDiffs))

    # Lower + Upper Quartiles
    lowerQuartile, upperQuartile = mathutil.quartiles(allDiffs)
    if lowerQuartile is None or upperQuartile is None:
        # Not enough data to compute quartiles; bail out without touching
        # self.sessionBoundaries
        LOG.warn("Null quartiles! Can't continue!")
        return

    # Interquartile Range (pre-scaled by the standard 1.5 outlier factor)
    iqr = (upperQuartile - lowerQuartile) * 1.5
    if self.debug:
        LOG.debug("Calculating stats for %d op pairs" % len(allDiffs))
        LOG.debug(" Lower Quartile: %s" % lowerQuartile)
        LOG.debug(" Upper Quartile: %s" % upperQuartile)
        LOG.debug(" IQR: %s" % iqr)

    # Go through operations for each client and identify the
    # pairs of operations that are above the IQR in the upperQuartile
    opHist = Histogram()        # counts of (hash0, hash1) outlier pairs
    prevOpHist = Histogram()    # counts of the first op's hash in a pair
    nextOpHist = Histogram()    # counts of the second op's hash in a pair
    threshold = upperQuartile + iqr
    for sessId, clientOps in self.sessOps.iteritems():
        for op0, op1, opDiff in clientOps:
            if opDiff >= threshold:
                prevOpHist.put(op0["query_hash"])
                nextOpHist.put(op1["query_hash"])
                opHist.put((op0["query_hash"], op1["query_hash"]))
        ## FOR
    ## FOR
    if self.debug:
        LOG.debug("Outlier Op Hashes:\n%s" % opHist)

    # I guess at this point we can just compute the outliers
    # again for the pairs of operations that have a time difference
    # outlier. We won't use the IQR. We'll just take the upper quartile
    # because that seems to give us the right answer
    outlierCounts = sorted(opHist.getCounts())
    lowerQuartile, upperQuartile = mathutil.quartiles(outlierCounts)
    if self.debug:
        LOG.debug("Calculating stats for %d count outliers" % len(outlierCounts))
        LOG.debug(" Lower Quartile: %s" % lowerQuartile)
        LOG.debug(" Upper Quartile: %s" % upperQuartile)

    self.sessionBoundaries.clear()

    # If we're doing this randomly, we want each session to have roughly
    # the same number of operations as RANDOMIZE_TARGET
    if self.randomize:
        num_outliers = len(outlierCounts)
        # Force up to 10% of the outliers to be selected regardless of count
        force = 1 if int(num_outliers * 0.10) == 1 else random.randint(1, int(num_outliers * 0.10))
        LOG.warn("Forcing %d random outliers out of %d to be chosen from workload", force, num_outliers)
    else:
        force = 0

    # Keep every hash pair whose count reaches the upper quartile; when
    # randomizing, also admit entries while the 'force' budget lasts.
    # NOTE(review): force is decremented on every admitted count, including
    # ones admitted via the upperQuartile test — confirm that is intended.
    for cnt in outlierCounts:
        if cnt >= upperQuartile or (self.randomize and force > 0):
            self.sessionBoundaries |= set(opHist.getValuesForCount(cnt))
            force -= 1
    ## FOR
    LOG.debug("Found %d outlier hashes" % len(self.sessionBoundaries))
def sessionizeWorkload(self):
    """
    Split the Sessions based on the gap between operation times
    """
    LOG.info("Sessionizing sample workload")
    s = Sessionizer(self.metadata_db, randomize=self.random_sessionizer)

    # We first feed in all of the operations in for each session
    nextSessId = -1
    origTotal = 0
    origHistogram = Histogram()
    sessions = []
    for sess in self.metadata_db.Session.fetch():
        s.process(sess['session_id'], sess['operations'])
        nextSessId = max(nextSessId, sess['session_id'])
        origHistogram.put(len(sess['operations']))
        origTotal += len(sess['operations'])
        sessions.append(sess)
    ## FOR
    nextSessId += 1

    avg_ops = 0 if origHistogram.getSampleCount() == 0 else (
        origTotal / float(origHistogram.getSampleCount()))
    LOG.info("BEFORE Sessionization\n" +
             " # of Sessions: %d\n" +
             " Avg Ops per Session: %.2f\n" +
             " Next Session Id: %d", \
             origHistogram.getSampleCount(), \
             avg_ops, nextSessId)

    # Then split them into separate sessions
    s.calculateSessions()
    newTotal = 0
    newHistogram = Histogram()

    # We have to do this because otherwise we will start to process
    # the new sessions that we just inserted... I know...
    for sess in sessions:
        newSessions = s.sessionize(sess, nextSessId)
        nextSessId += len(newSessions)

        # And then add all of our new sessions
        # Count the number of operations so that can see the change
        if self.debug:
            LOG.debug("Split Session %d [%d ops] into %d separate sessions", \
                      sess['session_id'], len(sess['operations']), len(newSessions))
        totalOps = 0
        for newSess in newSessions:
            try:
                newSess.save()
            except:
                # Broad on purpose: log the session that failed, then re-raise
                LOG.error("Unexpected error when saving new Session\n%s", pformat(newSess))
                raise
            newOpCtr = len(newSess['operations'])
            totalOps += newOpCtr
            newHistogram.put(newOpCtr)
            if self.debug:
                LOG.debug("Session %d -> %d Ops" % (newSess['session_id'], newOpCtr))
        ## FOR

        # Make sure that all of our operations end up in a session
        assert len(sess['operations']) == totalOps, \
            "Expected %d operations, but new sessions only had %d" % (len(sess['operations']), totalOps)
        newTotal += totalOps

        # Mark the original session as deletable
        # deletable.append(sess)
        sess.delete()
    ## FOR

    # BUG FIX: the original recomputed this average from origTotal and
    # origHistogram, so the "AFTER" message logged the PRE-split average.
    # Use the post-split totals instead.
    avg_ops = 0 if newHistogram.getSampleCount() == 0 else (
        newTotal / float(newHistogram.getSampleCount()))
    LOG.info("AFTER Sessionization\n" +
             " # of Sessions: %d\n" +
             " Avg Ops per Session: %.2f", \
             newHistogram.getSampleCount(), \
             avg_ops)
    if self.debug:
        LOG.debug("Ops per Session\n%s" % newHistogram)
    return
## DEF
## CLASS