import logging
import math
from pprint import pformat

# Project-local imports (module paths assumed from usage)
from costmodel import AbstractCostComponent
from util import Histogram

LOG = logging.getLogger(__name__)


class SkewCostComponent(AbstractCostComponent):

    def __init__(self, state):
        AbstractCostComponent.__init__(self, state)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        # Keep track of how many times that we accessed each node
        self.nodeCounts = Histogram()
        self.workload_segments = [ ]

        # Pre-split the workload into separate intervals
        self.splitWorkload()
    ## DEF

    def getCostImpl(self, design):
        """Calculate the network cost for each segment for skew analysis"""

        # If there is only one node, then the cost is always zero
        if self.state.num_nodes == 1:
            LOG.info("Computed Skew Cost: %f", 0.0)
            return 0.0

        op_counts = [ 0 ] * self.state.skew_segments
        segment_skew = [ 0 ] * self.state.skew_segments
        for i in xrange(len(self.workload_segments)):
            # TODO: We should cache this so that we don't have to call it twice
            segment_skew[i], op_counts[i] = self.calculateSkew(design, self.workload_segments[i])

        # Weight each segment's skew by the number of operations that it contains
        weighted_skew = sum(segment_skew[i] * op_counts[i] for i in xrange(len(self.workload_segments)))
        total_ops = sum(op_counts)
        if total_ops == 0:
            # No operations could be estimated, so there is no skew to measure
            LOG.info("Computed Skew Cost: %f", 0.0)
            return 0.0
        cost = weighted_skew / float(total_ops)
        LOG.info("Computed Skew Cost: %f", cost)
        return cost
    ## DEF

    def calculateSkew(self, design, segment):
        """
            Calculate the cluster skew factor for the given workload segment
            See Alg.#3 from Pavlo et al. 2012:
            http://hstore.cs.brown.edu/papers/hstore-partitioning.pdf
        """
        if self.debug:
            LOG.debug("Computing skew cost for %d sessions over %d segments", \
                      len(segment), self.state.skew_segments)

        self.nodeCounts.clear()

        # Iterate over each session and get the list of nodes
        # that we estimate that each of its operations will need to touch
        num_ops = 0
        err_ops = 0
        for sess in segment:
            for op in sess['operations']:
                # Skip anything that doesn't have a design configuration
                if not design.hasCollection(op['collection']):
                    if self.debug:
                        LOG.debug("Not in design: SKIP - %s Op #%d on %s", \
                                  op['type'], op['query_id'], op['collection'])
                    continue
                if design.isRelaxed(op['collection']):
                    if self.debug:
                        LOG.debug("Relaxed: SKIP - %s Op #%d on %s", \
                                  op['type'], op['query_id'], op['collection'])
                    continue
                col_info = self.state.collections[op['collection']]
                cache = self.state.getCacheHandle(col_info)

                # This just returns an estimate of which nodes we expect
                # the op to touch. We don't know exactly which ones they will
                # be because auto-sharding could put shards anywhere...
                try:
                    node_ids = self.state.__getNodeIds__(cache, design, op)
                    for node_id in node_ids:
                        self.nodeCounts.put(node_id)
                    num_ops += 1
                except Exception:
                    if self.debug:
                        LOG.warn("Failed to estimate touched nodes for op\n%s", pformat(op))
                    err_ops += 1
                    continue
            ## FOR (op)
        ## FOR (sess)
        if self.debug:
            LOG.info("Total ops %s, errors %s", num_ops, err_ops)
            LOG.debug("Node Count Histogram:\n%s", self.nodeCounts)

        total = self.nodeCounts.getSampleCount()
        if not total:
            return (0.0, num_ops)

        # 'best' is the ideal fraction of accesses per node (perfectly uniform)
        best = 1 / float(self.state.num_nodes)
        skew = 0.0
        for i in xrange(self.state.num_nodes):
            ratio = self.nodeCounts.get(i, 0) / float(total)
            if ratio < best:
                # Reflect under-loaded nodes above the ideal share so that they
                # are penalized symmetrically with over-loaded ones
                ratio = best + ((1 - ratio/best) * (1 - best))
            skew += math.log(ratio / best)
        return skew / (math.log(1 / best) * self.state.num_nodes), num_ops
    ## DEF

    ## -----------------------------------------------------------------------
    ## WORKLOAD SEGMENTATION
    ## -----------------------------------------------------------------------

    def splitWorkload(self):
        """Divide the workload up into segments for skew analysis"""

        start_time = None
        end_time = None
        for i in xrange(len(self.state.workload)):
            if start_time is None or start_time > self.state.workload[i]['start_time']:
                start_time = self.state.workload[i]['start_time']
            if end_time is None or end_time < self.state.workload[i]['end_time']:
                end_time = self.state.workload[i]['end_time']
        assert start_time is not None, \
            "Failed to find start time in %d sessions" % len(self.state.workload)
        assert end_time is not None, \
            "Failed to find end time in %d sessions" % len(self.state.workload)

        if self.debug:
            LOG.debug("Workload Segments - START:%d / END:%d", start_time, end_time)
        self.workload_segments = [ [] for i in xrange(self.state.skew_segments) ]
        segment_h = Histogram()
        for sess in self.state.workload:
            idx = self.getSessionSegment(sess, start_time, end_time)
            segment_h.put(idx)
            assert 0 <= idx < self.state.skew_segments, \
                "Invalid workload segment '%d' for Session #%d\n%s" % \
                (idx, sess['session_id'], segment_h)
            self.workload_segments[idx].append(sess)
        ## FOR
    ## DEF

    def getSessionSegment(self, sess, start_time, end_time):
        """Return the segment offset that the given Session should be assigned to"""
        timestamp = sess['start_time']
        # Nudge sessions that start exactly at the end of the window back one
        # tick so that they fall into the last valid segment
        if timestamp == end_time:
            timestamp -= 1
        ratio = (timestamp - start_time) / float(end_time - start_time)
        return min(self.state.skew_segments - 1, int(self.state.skew_segments * ratio)) # HACK
    ## DEF
## CLASS
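## ----------------------------------------------------------------------------
## EXAMPLE: the skew factor in isolation
## A minimal sketch (not part of the original module) that mirrors the
## arithmetic at the end of calculateSkew() above, using a plain dict of
## per-node access counts instead of the project's Histogram. The function
## name and sample counts below are illustrative only.
## ----------------------------------------------------------------------------
def _example_skew_factor(node_counts, num_nodes):
    total = float(sum(node_counts.get(i, 0) for i in xrange(num_nodes)))
    if not total:
        return 0.0
    best = 1 / float(num_nodes)  # ideal per-node share of accesses
    skew = 0.0
    for i in xrange(num_nodes):
        ratio = node_counts.get(i, 0) / total
        if ratio < best:
            # Same reflection trick as calculateSkew(): under-loaded nodes are
            # mapped above the ideal share so they count as imbalance too
            ratio = best + ((1 - ratio / best) * (1 - best))
        skew += math.log(ratio / best)
    return skew / (math.log(1 / best) * num_nodes)

# A perfectly balanced workload scores 0.0; all traffic on one node scores 1.0:
#   _example_skew_factor({0: 25, 1: 25, 2: 25, 3: 25}, 4)  --> 0.0
#   _example_skew_factor({0: 100}, 4)                      --> 1.0
#
# Similarly, for getSessionSegment() above: with start_time=0, end_time=100,
# and skew_segments=10, a session starting at t=37 maps to int(10 * 37/100) =
# segment 3; a session starting exactly at t=100 is first nudged back to 99 so
# that it lands in segment 9 rather than the out-of-range segment 10.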
    def sessionizeWorkload(self):
        """Split the Sessions based on the gap between operation times"""
        LOG.info("Sessionizing sample workload")
        s = Sessionizer(self.metadata_db, randomize=self.random_sessionizer)

        # We first feed in all of the operations for each session
        nextSessId = -1
        origTotal = 0
        origHistogram = Histogram()
        sessions = [ ]
        for sess in self.metadata_db.Session.fetch():
            s.process(sess['session_id'], sess['operations'])
            nextSessId = max(nextSessId, sess['session_id'])
            origHistogram.put(len(sess['operations']))
            origTotal += len(sess['operations'])
            sessions.append(sess)
        ## FOR
        nextSessId += 1
        avg_ops = 0 if origHistogram.getSampleCount() == 0 else \
                  (origTotal / float(origHistogram.getSampleCount()))
        LOG.info("BEFORE Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f\n" +
                 "  Next Session Id: %d", \
                 origHistogram.getSampleCount(), \
                 avg_ops, nextSessId)

        # Then split them into separate sessions
        s.calculateSessions()
        newTotal = 0
        newHistogram = Histogram()

        # We have to do this because otherwise we will start to process
        # the new sessions that we just inserted... I know...
        for sess in sessions:
            newSessions = s.sessionize(sess, nextSessId)
            nextSessId += len(newSessions)

            # And then add all of our new sessions
            # Count the number of operations so that we can see the change
            if self.debug:
                LOG.debug("Split Session %d [%d ops] into %d separate sessions", \
                          sess['session_id'], len(sess['operations']), len(newSessions))
            totalOps = 0
            for newSess in newSessions:
                try:
                    newSess.save()
                except Exception:
                    LOG.error("Unexpected error when saving new Session\n%s", pformat(newSess))
                    raise
                newOpCtr = len(newSess['operations'])
                totalOps += newOpCtr
                newHistogram.put(newOpCtr)
                if self.debug:
                    LOG.debug("Session %d -> %d Ops", newSess['session_id'], newOpCtr)
            ## FOR (newSess)

            # Make sure that all of our operations end up in a session
            assert len(sess['operations']) == totalOps, \
                "Expected %d operations, but new sessions only had %d" % \
                (len(sess['operations']), totalOps)
            newTotal += totalOps

            # Mark the original session as deletable
            # deletable.append(sess)
            sess.delete()
        ## FOR

        # Report statistics for the newly split sessions
        avg_ops = 0 if newHistogram.getSampleCount() == 0 else \
                  (newTotal / float(newHistogram.getSampleCount()))
        LOG.info("AFTER Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f", \
                 newHistogram.getSampleCount(), \
                 avg_ops)
        if self.debug:
            LOG.debug("Ops per Session\n%s", newHistogram)
        return
    ## DEF
## CLASS
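## ----------------------------------------------------------------------------
## EXAMPLE: gap-based splitting in isolation
## A minimal sketch (not the project's Sessionizer) of the idea behind
## sessionizeWorkload(): one session's operations are broken into separate
## sessions wherever the idle gap between consecutive operations exceeds a
## threshold. The function name, the 'query_time' field, and the threshold
## are illustrative assumptions, not the real Sessionizer API.
## ----------------------------------------------------------------------------
def _example_split_by_gap(operations, max_gap):
    """operations: dicts with a numeric 'query_time', assumed sorted by time"""
    sessions = [ ]
    current = [ ]
    last_time = None
    for op in operations:
        if last_time is not None and (op['query_time'] - last_time) > max_gap:
            # Gap is too large: close out the current session and start fresh
            sessions.append(current)
            current = [ ]
        current.append(op)
        last_time = op['query_time']
    if current:
        sessions.append(current)
    return sessions

# Ops at t=1,2,3 and then t=60,61 split into two sessions with max_gap=10:
#   _example_split_by_gap([{'query_time': t} for t in (1, 2, 3, 60, 61)], 10)
#   --> [[ops at t=1,2,3], [ops at t=60,61]]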