def splitWorkload(self):
        """Divide the workload up into segments for skew analysis"""

        start_time = None
        end_time = None
        for i in xrange(len(self.state.workload)):
            if start_time is None or start_time > self.state.workload[i]['start_time']:
                start_time = self.state.workload[i]['start_time']
            if end_time is None or end_time < self.state.workload[i]['end_time']:
                end_time = self.state.workload[i]['end_time']
                
        assert start_time is not None, \
            "Failed to find start time in %d sessions" % len(self.state.workload)
        assert end_time is not None, \
            "Failed to find end time in %d sessions" % len(self.state.workload)

        if self.debug:
            LOG.debug("Workload Segments - START:%d / END:%d", start_time, end_time)
        self.workload_segments = [ [] for i in xrange(0, self.state.skew_segments) ]
        segment_h = Histogram()
        for sess in self.state.workload:
            idx = self.getSessionSegment(sess, start_time, end_time)
            segment_h.put(idx)
            assert 0 <= idx < self.state.skew_segments, \
                "Invalid workload segment '%d' for Session #%d\n%s" % (idx, sess['session_id'], segment_h)
            self.workload_segments[idx].append(sess)

    def __init__(self, state):
        AbstractCostComponent.__init__(self, state)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        # Keep track of how many times that we accessed each node
        self.nodeCounts = Histogram()
        self.workload_segments = [ ]

        # Pre-split the workload into separate intervals
        self.splitWorkload()
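
The segment-index helper getSessionSegment is called above but not shown. A minimal sketch of what it plausibly does, assuming sessions carry a 'start_time' field and that the segments evenly partition [start_time, end_time] (a hypothetical reconstruction, not the project's actual code):

    def getSessionSegment(self, sess, start_time, end_time):
        """Map a session's start time to a segment index (hypothetical sketch)."""
        if end_time == start_time:
            return 0
        ratio = (sess['start_time'] - start_time) / float(end_time - start_time)
        # Clamp so a session starting exactly at end_time lands in the last segment
        return min(int(ratio * self.state.skew_segments), self.state.skew_segments - 1)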
Example #3
    def plot(self):
        reneg_bins = self.renegotiate()
        self.ranks_data = flatten(self.ranks_data)

        ref_hist = Histogram(data=self.ranks_produced_flattened,
                             nbins=self.num_pivots_sent)
        ref_hist.rebalance(self.num_bins_final)

        cur_hist = Histogram(data=self.ranks_data,
                             bin_edges=reneg_bins.bin_edges)

        fig, ax = plt.subplots()
        ax.bar(range(32), cur_hist.hist)  # one bar per rank (32 ranks hardcoded)

        mean_load = len(self.ranks_data) / 32.0  # ideal per-rank load
        ax.plot([-1, 32], [mean_load, mean_load], color='orange', linewidth=1)
        ax.text(21, mean_load * 1.05, 'Ideal (balanced) load', color='#c04e01')

        ax.set_xlabel("Rank ID")
        ax.set_ylabel("Load")

        plt.tight_layout()
        plt.savefig("../vis/ASCR/naive_lb_2.pdf")
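
The rebalance()/bin_edges calls above re-bin a histogram so that each final bin carries roughly equal mass. The Histogram class here is project-specific, but the underlying idea can be sketched with numpy quantiles (illustrative only; equal_mass_edges is not part of the project's API):

    import numpy as np

    def equal_mass_edges(data, num_bins):
        """Bin edges that split `data` into bins of roughly equal mass."""
        return np.quantile(data, np.linspace(0.0, 1.0, num_bins + 1))

    # Usage: histogram the data against the balanced edges
    data = np.random.gumbel(size=10000)
    counts, _ = np.histogram(data, bins=equal_mass_edges(data, 32))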
Example #4
    def generateCollectionHistograms(self):
        col_keys = dict([(col_name, Histogram())
                         for col_name in self.collections])
        for sess in self.workload:
            for op in sess["operations"]:
                if "$cmd" in op["collection"]:
                    continue
                if op["collection"] not in col_keys:
                    LOG.warn("Missing: " + op["collection"])
                    continue
                fields = workload.getReferencedFields(op)
                h = col_keys[op["collection"]]
                # Count every non-empty combination of the referenced fields
                for i in xrange(1, len(fields) + 1):
                    for combo in itertools.combinations(fields, i):
                        h.put(combo)
            ## FOR (op)
        ## FOR (sess)
        return col_keys
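
For reference, the inner loop above counts every non-empty subset of the referenced fields, so an operation touching ('a', 'b') increments the counts for ('a',), ('b',), and ('a', 'b'). A standalone illustration of the enumeration (independent of the Histogram class):

    import itertools

    fields = ('a', 'b')
    combos = []
    for i in range(1, len(fields) + 1):
        combos.extend(itertools.combinations(fields, i))
    print(combos)   # [('a',), ('b',), ('a', 'b')]

The original snippet used map(h.put, ...) for its side effects, which only works because Python 2's map() is eager; the explicit loop above is the portable form.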
Example #5
def print_stats(host, port, w_db, w_col):
    print ""
    LOG.info("..:: MongoDesigner Workload Info ::..")
    print ""

    #start connection and set global variables...
    connection = initDB(host, port, w_db, w_col)

    LOG.info("=" * 50)

    session_cnt = workload_db[workload_col].find().count()
    LOG.info("Number of sessions: %d", session_cnt)
    LOG.info("Number of operations per session:")

    maxOpCnt = 0
    minOpCnt = sys.maxint
    vals = []

    typeCnts = Histogram()
    for session in workload_db[workload_col].find():
        for op in session['operations']:
            typeCnts.put(op['type'])

        op_cnt = len(session['operations'])
        minOpCnt = min(op_cnt, minOpCnt)
        maxOpCnt = max(op_cnt, maxOpCnt)
        vals.append(op_cnt)
    ## FOR
    avgOpCnt = None
    if vals:
        # Parenthesize the division: '%' binds tighter than '/'
        avgOpCnt = "%.2f" % (float(sum(vals)) / float(len(vals)))

    LOG.info("%10s: %d" % ("min", minOpCnt))
    LOG.info("%10s: %d" % ("max", maxOpCnt))
    LOG.info("%10s: %s" % ("avg", avgOpCnt))

    LOG.info("Number of operations by type:")
    for opType in typeCnts.values():
        LOG.info("%10s: %d" % (opType, typeCnts[opType]))
    ## FOR

    LOG.info("=" * 50)

    return
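
The Histogram used for typeCnts behaves like a counter: put() increments a key, values() yields the distinct keys, and indexing returns a key's count. The standard library's collections.Counter offers the same semantics, sketched here for comparison (an assumption about the Histogram API, inferred from how it is used above):

    from collections import Counter

    typeCnts = Counter()
    for opType in ('$query', '$insert', '$query'):
        typeCnts[opType] += 1            # equivalent of Histogram.put(opType)
    for opType in typeCnts:
        print("%10s: %d" % (opType, typeCnts[opType]))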
Example #6
def gen_hist(percent_particles):
    all_data_len = len(all_data)

    data_len_1p = int(all_data_len * percent_particles)

    print("Generating hist for %s, %s" %
          (all_data_len * percent_particles, percent_particles))

    sub_data = all_data[:data_len_1p]

    data_hist = Histogram(sub_data, 512)
    #  print("Sub data: ", len(sub_data))
    #  print("Data Hist: ", data_hist.bin_edges[-1],
    #  data_hist.hist[-1], sum(data_hist.hist))

    global accurate_hist
    accurate_hist, mass_per_bin = data_hist._rebalance(32)

    #  print("Accuurate Hist: ", accurate_hist[-1],
    #  mass_per_bin, sum(data_hist.hist))
    #  print(data_hist.bin_edges, data_hist.hist)
    #  print(accurate_hist)
    del data_hist
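
A usage sketch: gen_hist reads the module-level all_data and writes the module-level accurate_hist, so a caller might sweep sample fractions to see how quickly the rebalanced histogram converges (the fractions below are illustrative, not from the original script):

    # Assumed setup: all_data is a module-level sequence of particle values
    for fraction in (0.01, 0.05, 0.25, 1.00):
        gen_hist(fraction)
        # accurate_hist now holds the 32-bin rebalanced histogram for this sample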
Example #7
    def calculateSessions(self):
        # Calculate outliers using the quartile method
        # http://en.wikipedia.org/wiki/Quartile#Computing_methods
        if self.debug:
            LOG.debug(
                "Calculating time difference for operations in %d sessions" %
                len(self.sessOps))

        # Get the full list of all the time differences
        allDiffs = []
        for clientOps in self.sessOps.values():
            allDiffs += [x[-1] for x in clientOps]
        allDiffs = sorted(allDiffs)
        numDiffs = len(allDiffs)

        #print "\n".join(map(str, allDiffs))

        # Lower + Upper Quartiles
        lowerQuartile, upperQuartile = mathutil.quartiles(allDiffs)

        if lowerQuartile is None or upperQuartile is None:
            LOG.warn("Null quartiles! Can't continue!")
            return
        # Interquartile Range
        iqr = (upperQuartile - lowerQuartile) * 1.5

        if self.debug:
            LOG.debug("Calculating stats for %d op pairs" % len(allDiffs))
            LOG.debug("  Lower Quartile: %s" % lowerQuartile)
            LOG.debug("  Upper Quartile: %s" % upperQuartile)
            LOG.debug("  IQR: %s" % iqr)

        # Go through operations for each client and identify the
        # pairs of operations that are above the IQR in the upperQuartile
        opHist = Histogram()
        prevOpHist = Histogram()
        nextOpHist = Histogram()
        threshold = upperQuartile + iqr
        for sessId, clientOps in self.sessOps.iteritems():
            for op0, op1, opDiff in clientOps:
                if opDiff >= threshold:
                    prevOpHist.put(op0["query_hash"])
                    nextOpHist.put(op1["query_hash"])
                    opHist.put((op0["query_hash"], op1["query_hash"]))
            ## FOR
        ## FOR
        if self.debug:
            LOG.debug("Outlier Op Hashes:\n%s" % opHist)

        # Now compute outliers again, this time over the counts of the
        # operation pairs whose time difference was an outlier. We skip the
        # IQR here and take the upper quartile directly, since that seems
        # to give the right answer.
        outlierCounts = sorted(opHist.getCounts())
        lowerQuartile, upperQuartile = mathutil.quartiles(outlierCounts)

        if self.debug:
            LOG.debug("Calculating stats for %d count outliers" %
                      len(outlierCounts))
            LOG.debug("  Lower Quartile: %s" % lowerQuartile)
            LOG.debug("  Upper Quartile: %s" % upperQuartile)

        self.sessionBoundaries.clear()

        # If we're doing this randomly, we want each session to have roughly
        # the same number of operations as RANDOMIZE_TARGET
        if self.randomize:
            num_outliers = len(outlierCounts)
            # Guard against randint(1, 0) when fewer than ~10 outliers exist
            tenth = max(int(num_outliers * 0.10), 1)
            force = 1 if tenth == 1 else random.randint(1, tenth)
            LOG.warn(
                "Forcing %d random outliers out of %d to be chosen from workload",
                force, num_outliers)
        else:
            force = 0
        for cnt in outlierCounts:
            if cnt >= upperQuartile or (self.randomize and force > 0):
                self.sessionBoundaries |= set(opHist.getValuesForCount(cnt))
                force -= 1
        ## FOR
        LOG.debug("Found %d outlier hashes" % len(self.sessionBoundaries))

    def sessionizeWorkload(self):
        """
            Split the Sessions based on the gap between operation times
        """
        LOG.info("Sessionizing sample workload")

        s = Sessionizer(self.metadata_db, randomize=self.random_sessionizer)

        # We first feed in all of the operations in for each session
        nextSessId = -1
        origTotal = 0
        origHistogram = Histogram()
        sessions = []
        for sess in self.metadata_db.Session.fetch():
            s.process(sess['session_id'], sess['operations'])
            nextSessId = max(nextSessId, sess['session_id'])
            origHistogram.put(len(sess['operations']))
            origTotal += len(sess['operations'])
            sessions.append(sess)
        ## FOR
        nextSessId += 1

        avg_ops = 0 if origHistogram.getSampleCount() == 0 else (
            origTotal / float(origHistogram.getSampleCount()))
        LOG.info("BEFORE Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f\n" +
                 "  Next Session Id: %d", \
                 origHistogram.getSampleCount(), \
                 avg_ops, nextSessId)

        # Then split them into separate sessions
        s.calculateSessions()
        newTotal = 0
        newHistogram = Histogram()

        # We have to do this because otherwise we will start to process
        # the new sessions that we just inserted... I know...
        for sess in sessions:
            newSessions = s.sessionize(sess, nextSessId)
            nextSessId += len(newSessions)

            # And then add all of our new sessions
            # Count the number of operations so that can see the change
            if self.debug:
                LOG.debug("Split Session %d [%d ops] into %d separate sessions", \
                          sess['session_id'], len(sess['operations']), len(newSessions))
            totalOps = 0
            for newSess in newSessions:
                try:
                    newSess.save()
                except Exception:
                    LOG.error("Unexpected error when saving new Session\n%s",
                              pformat(newSess))
                    raise
                newOpCtr = len(newSess['operations'])
                totalOps += newOpCtr
                newHistogram.put(newOpCtr)
                if self.debug:
                    LOG.debug("Session %d -> %d Ops" %
                              (newSess['session_id'], newOpCtr))
            # Make sure that all of our operations end up in a session
            assert len(sess['operations']) == totalOps, \
                "Expected %d operations, but new sessions only had %d" % (len(sess['operations']), totalOps)
            newTotal += totalOps

            # Mark the original session as deletable
            # deletable.append(sess)
            sess.delete()
        ## FOR
        avg_ops = 0 if newHistogram.getSampleCount() == 0 else (
            newTotal / float(newHistogram.getSampleCount()))
        LOG.info("AFTER Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f", \
                 newHistogram.getSampleCount(), \
                 avg_ops)
        if self.debug:
            LOG.debug("Ops per Session\n%s" % newHistogram)

        return

    ## DEF


## CLASS
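
The mathutil.quartiles helper is not shown. Following the quartile computing methods linked in calculateSessions (Tukey's method: the quartiles are the medians of the lower and upper halves of the sorted data), a minimal sketch of what it plausibly computes (a reconstruction under that assumption, not the project's actual code):

    def quartiles(sorted_vals):
        """Lower and upper quartiles via Tukey's method (hypothetical sketch)."""
        n = len(sorted_vals)
        if n < 2:
            return (None, None)
        half = n // 2
        lower = sorted_vals[:half]
        # Exclude the median itself from the upper half when n is odd
        upper = sorted_vals[half + 1:] if n % 2 else sorted_vals[half:]
        return (median(lower), median(upper))

    def median(vals):
        mid = len(vals) // 2
        return vals[mid] if len(vals) % 2 else (vals[mid - 1] + vals[mid]) / 2.0

calculateSessions then flags an operation pair as a session boundary when its time gap exceeds upperQuartile + 1.5 * (upperQuartile - lowerQuartile), i.e. the standard Q3 + 1.5*IQR outlier rule.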