def splitWorkload(self):
        """Divide the workload up into segments for skew analysis"""

        start_time = None
        end_time = None
        for i in xrange(len(self.state.workload)):
            if start_time is None or start_time > self.state.workload[i]['start_time']:
                start_time = self.state.workload[i]['start_time']
            if end_time is None or end_time < self.state.workload[i]['end_time']:
                end_time = self.state.workload[i]['end_time']
                
        assert start_time is not None,\
            "Failed to find start time in %d sessions" % len(self.state.workload)
        assert end_time is not None,\
            "Failed to find end time in %d sessions" % len(self.state.workload)

        if self.debug:
            LOG.debug("Workload Segments - START:%d / END:%d", start_time, end_time)
        self.workload_segments = [ [] for i in xrange(0, self.state.skew_segments) ]
        segment_h = Histogram()
        for sess in self.state.workload:
            idx = self.getSessionSegment(sess, start_time, end_time)
            segment_h.put(idx)
            assert idx >= 0 and idx < self.state.skew_segments,\
                "Invalid workload segment '%d' for Session #%d\n%s" % (idx, sess['session_id'], segment_h)
            self.workload_segments[idx].append(sess)
    def __init__(self, state):
        AbstractCostComponent.__init__(self, state)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        # Keep track of how many times we accessed each node
        self.nodeCounts = Histogram()
        self.workload_segments = [ ]

        # Pre-split the workload into separate intervals
        self.splitWorkload()
Example #3
def print_stats(host, port, w_db, w_col):
    print ""
    LOG.info("..:: MongoDesigner Workload Info ::..")
    print ""

    # Open the connection; initDB() also sets the workload_db / workload_col globals used below
    connection = initDB(host, port, w_db, w_col)
    
    LOG.info("="*50)
    
    session_cnt = workload_db[workload_col].find().count()
    LOG.info("Number of sessions: %d", session_cnt)
    LOG.info("Number of operations per session:")
    
    maxOpCnt = 0
    minOpCnt = sys.maxint
    vals = []
    
    typeCnts = Histogram()
    for session in workload_db[workload_col].find():
        for op in session['operations']:
            typeCnts.put(op['type'])
            
        op_cnt = len(session['operations'])
        minOpCnt = min(op_cnt, minOpCnt)
        maxOpCnt = max(op_cnt, maxOpCnt)
        vals.append(op_cnt)
    ## FOR
    avgOpCnt = None
    if vals:
        avgOpCnt = "%.2f" % (float(sum(vals)) / float(len(vals)))
        
    LOG.info("%10s: %d" % ("min", minOpCnt))
    LOG.info("%10s: %d" % ("max", maxOpCnt))
    LOG.info("%10s: %s" % ("avg", avgOpCnt))
    
    LOG.info("Number of operations by type:")
    for opType in typeCnts.values():
        LOG.info("%10s: %d" % (opType, typeCnts[opType]))
    ## FOR
    
    LOG.info("="*50)
    
    return
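
A note on the Histogram helper used throughout these examples: two different classes share that name here. Examples #5 and #6 use a numeric binned histogram (with bin_edges, hist, and rebalance), while the workload examples use a counter-style histogram. Neither class is shown on this page, so the stand-in below is reconstructed purely from how the counter-style one is called; treat it as a sketch of the assumed interface, not the project's actual implementation.

from collections import defaultdict

class Histogram(object):
    """Counter-style histogram sketch, reconstructed from usage on this page."""

    def __init__(self):
        self.counts = defaultdict(int)

    def put(self, value, delta=1):
        # Increment the counter for this value
        self.counts[value] += delta

    def get(self, value, default=None):
        # Count for a value, or the default if the value was never put()
        return self.counts.get(value, default)

    def values(self):
        # The examples iterate the *distinct keys* via values() (see Example #3)
        return self.counts.keys()

    def getCounts(self):
        # The raw counter values (used for the quartile analysis below)
        return self.counts.values()

    def getValuesForCount(self, cnt):
        # All keys whose counter equals cnt
        return [v for v, c in self.counts.items() if c == cnt]

    def getSampleCount(self):
        # Total weight across all put() calls
        return sum(self.counts.values())

    def clear(self):
        self.counts.clear()

    def __getitem__(self, value):
        return self.counts[value]

    def __str__(self):
        return "\n".join("%s => %d" % (v, c) for v, c in sorted(self.counts.items()))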
Example #5
def gen_hist(percent_particles):
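    # NOTE: all_data is assumed to be a module-level sample array defined elsewhere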
    all_data_len = len(all_data)

    data_len_1p = int(all_data_len * percent_particles)

    print("Generating hist for %s, %s" %
          (all_data_len * percent_particles, percent_particles))

    sub_data = all_data[:data_len_1p]

    data_hist = Histogram(sub_data, 512)
    #  print("Sub data: ", len(sub_data))
    #  print("Data Hist: ", data_hist.bin_edges[-1],
    #  data_hist.hist[-1], sum(data_hist.hist))

    global accurate_hist
    accurate_hist, mass_per_bin = data_hist._rebalance(32)

    #  print("Accurate Hist: ", accurate_hist[-1],
    #  mass_per_bin, sum(data_hist.hist))
    #  print(data_hist.bin_edges, data_hist.hist)
    #  print(accurate_hist)
    del data_hist
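
The _rebalance(32) call is private to the project's binned Histogram class; it is assumed here to recompute bin edges so that each of the 32 output bins holds roughly equal mass. A rough standalone sketch of that idea:

def equal_mass_edges(data, nbins):
    # Pick bin edges at evenly spaced ranks in the sorted sample so
    # that every bin holds roughly the same number of points
    data = sorted(data)
    n = len(data)
    return [data[min(int(round(i * n / float(nbins))), n - 1)]
            for i in xrange(nbins + 1)]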
Example #6
    def plot(self):
        reneg_bins = self.renegotiate()
        self.ranks_data = flatten(self.ranks_data)

        ref_hist = Histogram(data=self.ranks_produced_flattened,
                             nbins=self.num_pivots_sent)
        ref_hist.rebalance(self.num_bins_final)

        cur_hist = Histogram(data=self.ranks_data,
                             bin_edges=reneg_bins.bin_edges)

        fig, ax = plt.subplots()
        plot1 = ax.bar(range(32), cur_hist.hist)

        mean_load = len(self.ranks_data) / 32
        ax.plot([-1, 32], [mean_load, mean_load], color='orange', linewidth=1)
        ax.text(21, mean_load * 1.05, 'Ideal (balanced) load', color='#c04e01')

        ax.set_xlabel("Rank ID")
        ax.set_ylabel("Load")

        plt.tight_layout()
        plt.savefig("../vis/ASCR/naive_lb_2.pdf")
Example #7
    def generateCollectionHistograms(self):
        col_keys = dict([(col_name, Histogram())
                         for col_name in self.collections])
        for sess in self.workload:
            for op in sess["operations"]:
                if op["collection"].find("$cmd") != -1:
                    continue
                if op["collection"] not in col_keys:
                    LOG.warn("Missing: " + op["collection"])
                    continue
                fields = workload.getReferencedFields(op)
                h = col_keys[op["collection"]]
                for i in xrange(1, len(fields) + 1):
                    map(h.put, itertools.combinations(fields, i))
            ## FOR (op)
        ## FOR (sess)
        return col_keys
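
The inner loop enumerates every non-empty combination of the fields referenced by an operation, so the histogram counts each candidate (possibly compound) index key. For example, with fields ['a', 'b'] it records ('a',), ('b',), and ('a', 'b'):

import itertools

fields = ['a', 'b']
for i in xrange(1, len(fields) + 1):
    for combo in itertools.combinations(fields, i):
        print combo
# ('a',)
# ('b',)
# ('a', 'b')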
Example #9
    def calculateSessions(self):
        # Calculate outliers using the quartile method
        # http://en.wikipedia.org/wiki/Quartile#Computing_methods
        if self.debug:
            LOG.debug("Calculating time difference for operations in %d sessions" % len(self.sessOps))
        
        # Get the full list of all the time differences
        allDiffs = [ ]
        for clientOps in self.sessOps.values():
            allDiffs += [x[-1] for x in clientOps]
        allDiffs = sorted(allDiffs)
        numDiffs = len(allDiffs)

        #print "\n".join(map(str, allDiffs))
        
        # Lower + Upper Quartiles
        lowerQuartile, upperQuartile = mathutil.quartiles(allDiffs)
        
        if lowerQuartile is None or upperQuartile is None:
            LOG.warn("Null quartiles! Can't continue!")
            return
        # Interquartile Range
        iqr = (upperQuartile - lowerQuartile) * 1.5
        
        if self.debug:
            LOG.debug("Calculating stats for %d op pairs" % len(allDiffs))
            LOG.debug("  Lower Quartile: %s" % lowerQuartile)
            LOG.debug("  Upper Quartile: %s" % upperQuartile)
            LOG.debug("  IQR: %s" % iqr)
        
        # Go through the operations for each client and identify the pairs
        # whose time gap is above the outlier threshold (upper quartile + 1.5 * IQR)
        opHist = Histogram()
        prevOpHist = Histogram()
        nextOpHist = Histogram()
        threshold = upperQuartile + iqr
        for sessId, clientOps in self.sessOps.iteritems():
            for op0, op1, opDiff in clientOps:
                if opDiff >= threshold:
                    prevOpHist.put(op0["query_hash"])
                    nextOpHist.put(op1["query_hash"])
                    opHist.put((op0["query_hash"], op1["query_hash"]))
            ## FOR
        ## FOR
        if self.debug:
            LOG.debug("Outlier Op Hashes:\n%s" % opHist)
        
        # Now compute outliers again, this time over the counts of the
        # operation pairs whose time difference was an outlier. We skip the
        # IQR here and cut at the upper quartile alone, since that
        # empirically gives the right session boundaries
        outlierCounts = sorted(opHist.getCounts())
        lowerQuartile, upperQuartile = mathutil.quartiles(outlierCounts)
        
        if self.debug:
            LOG.debug("Calculating stats for %d count outliers" % len(outlierCounts))
            LOG.debug("  Lower Quartile: %s" % lowerQuartile)
            LOG.debug("  Upper Quartile: %s" % upperQuartile)
        
        self.sessionBoundaries.clear()
        
        # If we're doing this randomly, we want each session to have roughly 
        # the same number of operations as RANDOMIZE_TARGET
        if self.randomize:
            num_outliers = len(outlierCounts)
            force = 1 if int(num_outliers*0.10) == 1 else random.randint(1, int(num_outliers*0.10))
            LOG.warn("Forcing %d random outliers out of %d to be chosen from workload", force, num_outliers)
        else:
            force = 0
        for cnt in outlierCounts:
            if cnt >= upperQuartile or (self.randomize and force > 0):
                self.sessionBoundaries |= set(opHist.getValuesForCount(cnt))
                force -= 1
        ## FOR
        LOG.debug("Found %d outlier hashes" % len(self.sessionBoundaries))
    def sessionizeWorkload(self):
        """
            Split the Sessions based on the gap between operation times
        """
        LOG.info("Sessionizing sample workload")
        
        s = Sessionizer(self.metadata_db, randomize=self.random_sessionizer)
        
        # First, feed in all of the operations for each session
        nextSessId = -1
        origTotal = 0
        origHistogram = Histogram()
        sessions = [ ]
        for sess in self.metadata_db.Session.fetch():
            s.process(sess['session_id'], sess['operations'])
            nextSessId = max(nextSessId, sess['session_id'])
            origHistogram.put(len(sess['operations']))
            origTotal += len(sess['operations'])
            sessions.append(sess)
        ## FOR
        nextSessId += 1
        
        avg_ops = 0 if origHistogram.getSampleCount() == 0 else (origTotal / float(origHistogram.getSampleCount()))
        LOG.info("BEFORE Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f\n" +
                 "  Next Session Id: %d", \
                 origHistogram.getSampleCount(), \
                 avg_ops, nextSessId)
        
        # Then split them into separate sessions
        s.calculateSessions()
        newTotal = 0
        newHistogram = Histogram()

        # Iterate over a snapshot of the original sessions; otherwise we
        # would start processing the new sessions that we just inserted
        for sess in sessions:
            newSessions = s.sessionize(sess, nextSessId)
            nextSessId += len(newSessions)
            
            # And then add all of our new sessions
            # Count the number of operations so that we can see the change
            if self.debug:
                LOG.debug("Split Session %d [%d ops] into %d separate sessions", \
                          sess['session_id'], len(sess['operations']), len(newSessions))
            totalOps = 0
            for newSess in newSessions:
                try:
                    newSess.save()
                except:
                    LOG.error("Unexpected error when saving new Session\n%s", pformat(newSess))
                    raise
                newOpCtr = len(newSess['operations'])
                totalOps += newOpCtr
                newHistogram.put(newOpCtr)
                if self.debug:
                    LOG.debug("Session %d -> %d Ops" % (newSess['session_id'], newOpCtr))
            # Make sure that all of our operations end up in a session
            assert len(sess['operations']) == totalOps, \
                "Expected %d operations, but new sessions only had %d" % (len(sess['operations']), totalOps)
            newTotal += totalOps
            
            # Delete the original session now that it has been split
            sess.delete()
        ## FOR
        avg_ops = 0 if newHistogram.getSampleCount() == 0 else (newTotal / float(newHistogram.getSampleCount()))
        LOG.info("AFTER Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f", \
                 newHistogram.getSampleCount(), \
                 avg_ops)
        if self.debug:
            LOG.debug("Ops per Session\n%s" % newHistogram)
            
        return
    ## DEF
## CLASS
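
Taken together, calculateSessions and sessionizeWorkload implement gap-based sessionization: collect the time gaps between consecutive operations, flag gaps above the quartile outlier threshold, and cut sessions at those points. The toy version below is self-contained; since mathutil.quartiles is project code not shown on this page, a simple median-split quartile is assumed in its place.

def quartiles(vals):
    # Median-split quartiles over a sorted list (an assumption; the
    # project's mathutil.quartiles may handle ties differently)
    n = len(vals)
    if n < 4:
        return (None, None)
    def median(xs):
        m = len(xs)
        return xs[m // 2] if m % 2 else (xs[m // 2 - 1] + xs[m // 2]) / 2.0
    return (median(vals[:n // 2]), median(vals[(n + 1) // 2:]))

def split_at_gaps(timestamps):
    # Cut one operation stream into sessions wherever the idle gap
    # is an upper outlier (>= upper quartile + 1.5 * IQR)
    gaps = sorted(b - a for a, b in zip(timestamps, timestamps[1:]))
    lower, upper = quartiles(gaps)
    if lower is None:
        return [timestamps]
    threshold = upper + (upper - lower) * 1.5
    sessions, current = [], [timestamps[0]]
    for prev, ts in zip(timestamps, timestamps[1:]):
        if ts - prev >= threshold:
            sessions.append(current)
            current = []
        current.append(ts)
    sessions.append(current)
    return sessions

print split_at_gaps([0, 1, 3, 4, 7, 9, 10, 12, 15, 16, 18, 68])
# [[0, 1, 3, 4, 7, 9, 10, 12, 15, 16, 18], [68]]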
class SkewCostComponent(AbstractCostComponent):

    def __init__(self, state):
        AbstractCostComponent.__init__(self, state)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        # Keep track of how many times we accessed each node
        self.nodeCounts = Histogram()
        self.workload_segments = [ ]

        # Pre-split the workload into separate intervals
        self.splitWorkload()
    ## DEF

    def getCostImpl(self, design):
        """Calculate the network cost for each segment for skew analysis"""

        # If there is only one node, then the cost is always zero
        if self.state.num_nodes == 1:
            LOG.info("Computed Skew Cost: %f", 0.0)
            return 0.0

        op_counts = [ 0 ] * self.state.skew_segments
        segment_skew = [ 0 ] * self.state.skew_segments
        for i in range(0, len(self.workload_segments)):
            # TODO: We should cache this so that we don't have to call it twice
            segment_skew[i], op_counts[i] = self.calculateSkew(design, self.workload_segments[i])

        weighted_skew = sum([segment_skew[i] * op_counts[i] for i in xrange(len(self.workload_segments))])
        cost = weighted_skew / float(sum(op_counts))
        LOG.info("Computed Skew Cost: %f", cost)
        return cost
    ## DEF

    def calculateSkew(self, design, segment):
        """
            Calculate the cluster skew factor for the given workload segment
            See Alg.#3 from Pavlo et al. 2012:
            http://hstore.cs.brown.edu/papers/hstore-partitioning.pdf
        """
        if self.debug:
            LOG.debug("Computing skew cost for %d sessions over %d segments", \
                      len(segment), self.state.skew_segments)

        self.nodeCounts.clear()
                      
        # Iterate over each session and get the list of nodes
        # that we estimate that each of its operations will need to touch
        num_ops = 0
        err_ops = 0
        for sess in segment:
            for op in sess['operations']:
                # Skip anything that doesn't have a design configuration
                if not design.hasCollection(op['collection']):
                    if self.debug: LOG.debug("Not in design: SKIP - %s Op #%d on %s", op['type'], op['query_id'], op['collection'])
                    continue
                if design.isRelaxed(op['collection']):
                    if self.debug: LOG.debug("Relaxed: SKIP - %s Op #%d on %s", op['type'], op['query_id'], op['collection'])
                    continue
                col_info = self.state.collections[op['collection']]
                cache = self.state.getCacheHandle(col_info)

                # This just returns an estimate of which nodes we expect
                # the op to touch. We don't know exactly which ones they
                # will be because auto-sharding could put shards anywhere
                try: 
                    node_ids = self.state.__getNodeIds__(cache, design, op)
                    map(self.nodeCounts.put, node_ids)
                    num_ops += 1
                except:
                    if self.debug:
                        LOG.warn("Failed to estimate touched nodes for op\n%s" % pformat(op))
                    err_ops += 1
                    continue
            ## FOR (op)
        ## FOR (sess)
        if self.debug: LOG.debug("Total ops %s, errors %s", num_ops, err_ops)
        if self.debug: LOG.debug("Node Count Histogram:\n%s", self.nodeCounts)
        total = self.nodeCounts.getSampleCount()
        if not total:
            return (0.0, num_ops)

        best = 1 / float(self.state.num_nodes)
        skew = 0.0
        for i in xrange(self.state.num_nodes):
            ratio = self.nodeCounts.get(i, 0) / float(total)
            if ratio < best:
                ratio = best + ((1 - ratio/best) * (1 - best))
            skew += math.log(ratio / best)
        return skew / (math.log(1 / best) * self.state.num_nodes), num_ops
    ## DEF

    ## -----------------------------------------------------------------------
    ## WORKLOAD SEGMENTATION
    ## -----------------------------------------------------------------------

    def splitWorkload(self):
        """Divide the workload up into segments for skew analysis"""

        start_time = None
        end_time = None
        for i in xrange(len(self.state.workload)):
            if start_time is None or start_time > self.state.workload[i]['start_time']:
                start_time = self.state.workload[i]['start_time']
            if end_time is None or end_time < self.state.workload[i]['end_time']:
                end_time = self.state.workload[i]['end_time']
                
        assert start_time is not None,\
            "Failed to find start time in %d sessions" % len(self.state.workload)
        assert end_time is not None,\
            "Failed to find end time in %d sessions" % len(self.state.workload)

        if self.debug:
            LOG.debug("Workload Segments - START:%d / END:%d", start_time, end_time)
        self.workload_segments = [ [] for i in xrange(0, self.state.skew_segments) ]
        segment_h = Histogram()
        for sess in self.state.workload:
            idx = self.getSessionSegment(sess, start_time, end_time)
            segment_h.put(idx)
            assert idx >= 0 and idx < self.state.skew_segments,\
                "Invalid workload segment '%d' for Session #%d\n%s" % (idx, sess['session_id'], segment_h)
            self.workload_segments[idx].append(sess)
        ## FOR
    ## DEF

    def getSessionSegment(self, sess, start_time, end_time):
        """Return the segment offset that the given Session should be assigned to"""
        timestamp = sess['start_time']
        if timestamp == end_time: timestamp -= 1
        ratio = (timestamp - start_time) / float(end_time - start_time)
        return min(self.state.skew_segments-1, int(self.state.skew_segments * ratio)) # HACK
    ## DEF
## CLASS
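
To make the skew metric concrete: with num_nodes = 4 the ideal share per node is best = 0.25, under-loaded nodes are folded back symmetrically onto the over-loaded side, and the sum of log(ratio / best) is normalized by log(1 / best) * num_nodes so that a perfectly balanced segment scores 0.0 and a fully concentrated one scores 1.0. A standalone restatement of the same formula, with two hand-checkable cases:

import math

def skew_factor(node_counts, num_nodes):
    # Same normalization as calculateSkew above: 0.0 = balanced,
    # 1.0 = every access lands on a single node
    total = float(sum(node_counts))
    if not total:
        return 0.0
    best = 1 / float(num_nodes)
    skew = 0.0
    for i in xrange(num_nodes):
        ratio = (node_counts[i] if i < len(node_counts) else 0) / total
        if ratio < best:
            # Fold under-loaded nodes onto the over-loaded side
            ratio = best + ((1 - ratio / best) * (1 - best))
        skew += math.log(ratio / best)
    return skew / (math.log(1 / best) * num_nodes)

print skew_factor([25, 25, 25, 25], 4)   # 0.0 -> perfectly balanced
print skew_factor([100, 0, 0, 0], 4)     # 1.0 -> fully skewed

getSessionSegment maps a session linearly onto the segment range in the same spirit: with start_time = 0, end_time = 100, and skew_segments = 10, a session starting at t = 37 lands in segment int(10 * 0.37) = 3.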