class SkewCostComponent(AbstractCostComponent):

    def __init__(self, state):
        AbstractCostComponent.__init__(self, state)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        # Keep track of how many times we accessed each node
        self.nodeCounts = Histogram()
        self.workload_segments = [ ]

        # Pre-split the workload into separate intervals
        self.splitWorkload()
    ## DEF

    def getCostImpl(self, design):
        """Calculate the network cost for each segment for skew analysis"""

        # If there is only one node, then the cost is always zero
        if self.state.num_nodes == 1:
            LOG.info("Computed Skew Cost: %f", 0.0)
            return 0.0

        op_counts = [ 0 ] * self.state.skew_segments
        segment_skew = [ 0 ] * self.state.skew_segments
        for i in xrange(len(self.workload_segments)):
            # TODO: We should cache this so that we don't have to call it twice
            segment_skew[i], op_counts[i] = self.calculateSkew(design, self.workload_segments[i])

        weighted_skew = sum([segment_skew[i] * op_counts[i] for i in xrange(len(self.workload_segments))])
        # Guard against an empty workload to avoid a divide-by-zero
        cost = weighted_skew / float(sum(op_counts)) if sum(op_counts) > 0 else 0.0
        LOG.info("Computed Skew Cost: %f", cost)
        return cost
    ## DEF
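
    # Worked example (hypothetical numbers): if the workload was split into
    # two segments with skew factors 0.2 and 0.6 containing 100 and 300
    # operations respectively, the weighted cost is
    #   (0.2*100 + 0.6*300) / (100 + 300) = 200 / 400 = 0.5
    # so busier segments dominate the final estimate.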

    def calculateSkew(self, design, segment):
        """
            Calculate the cluster skew factor for the given workload segment
            See Alg.#3 from Pavlo et al. 2012:
            http://hstore.cs.brown.edu/papers/hstore-partitioning.pdf
        """
        if self.debug:
            LOG.debug("Computing skew cost for %d sessions over %d segments", \
                      len(segment), self.state.skew_segments)

        self.nodeCounts.clear()
                      
        # Iterate over each session and get the list of nodes that we
        # estimate each of its operations will need to touch
        num_ops = 0
        err_ops = 0
        for sess in segment:
            for op in sess['operations']:
                # Skip anything that doesn't have a design configuration
                if not design.hasCollection(op['collection']):
                    if self.debug: LOG.debug("Not in design: SKIP - %s Op #%d on %s", op['type'], op['query_id'], op['collection'])
                    continue
                if design.isRelaxed(op['collection']):
                    if self.debug: LOG.debug("Relaxed: SKIP - %s Op #%d on %s", op['type'], op['query_id'], op['collection'])
                    continue
                col_info = self.state.collections[op['collection']]
                cache = self.state.getCacheHandle(col_info)

                # This just returns an estimate of which nodes we expect
                # the op to touch. We don't know exactly which ones they will
                # be because auto-sharding could put shards anywhere...
                try:
                    node_ids = self.state.__getNodeIds__(cache, design, op)
                    map(self.nodeCounts.put, node_ids)
                    num_ops += 1
                except Exception:
                    if self.debug:
                        LOG.warn("Failed to estimate touched nodes for op\n%s", pformat(op))
                    err_ops += 1
                    continue
            ## FOR (op)
        ## FOR (sess)
        if self.debug: LOG.debug("Total ops %d, errors %d", num_ops, err_ops)
        if self.debug: LOG.debug("Node Count Histogram:\n%s", self.nodeCounts)
        total = self.nodeCounts.getSampleCount()
        if not total:
            return (0.0, num_ops)

        best = 1 / float(self.state.num_nodes)
        skew = 0.0
        for i in xrange(self.state.num_nodes):
            ratio = self.nodeCounts.get(i, 0) / float(total)
            if ratio < best:
                ratio = best + ((1 - ratio/best) * (1 - best))
            skew += math.log(ratio / best)
        return (skew / (math.log(1 / best) * self.state.num_nodes), num_ops)
    ## DEF
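
    # Sanity check of the normalization above with num_nodes=4 (hypothetical
    # counts):
    #   - Balanced counts [25, 25, 25, 25]: every ratio equals best=0.25,
    #     each term is log(1) = 0, so skew = 0.0
    #   - All ops on one node [100, 0, 0, 0]: the hot node contributes
    #     log(1.0/0.25) = log(4), and each empty node's ratio is remapped to
    #     0.25 + (1 - 0)*(1 - 0.25) = 1.0, also contributing log(4).
    #     The sum 4*log(4) divided by log(1/0.25)*4 gives skew = 1.0
    # The result is therefore normalized to the range [0, 1].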

    ## -----------------------------------------------------------------------
    ## WORKLOAD SEGMENTATION
    ## -----------------------------------------------------------------------

    def splitWorkload(self):
        """Divide the workload up into segments for skew analysis"""

        start_time = None
        end_time = None
        for i in xrange(len(self.state.workload)):
            if start_time is None or start_time > self.state.workload[i]['start_time']:
                start_time = self.state.workload[i]['start_time']
            if end_time is None or end_time < self.state.workload[i]['end_time']:
                end_time = self.state.workload[i]['end_time']
                
        assert start_time is not None, \
            "Failed to find start time in %d sessions" % len(self.state.workload)
        assert end_time is not None, \
            "Failed to find end time in %d sessions" % len(self.state.workload)

        if self.debug:
            LOG.debug("Workload Segments - START:%d / END:%d", start_time, end_time)
        self.workload_segments = [ [] for i in xrange(0, self.state.skew_segments) ]
        segment_h = Histogram()
        for sess in self.state.workload:
            idx = self.getSessionSegment(sess, start_time, end_time)
            segment_h.put(idx)
            assert idx >= 0 and idx < self.state.skew_segments,\
                "Invalid workload segment '%d' for Session #%d\n%s" % (idx, sess['session_id'], segment_h)
            self.workload_segments[idx].append(sess)
        ## FOR
    ## DEF

    def getSessionSegment(self, sess, start_time, end_time):
        """Return the segment offset that the given Session should be assigned to"""
        timestamp = sess['start_time']
        if timestamp == end_time: timestamp -= 1
        ratio = (timestamp - start_time) / float(end_time - start_time)
        return min(self.state.skew_segments-1, int(self.state.skew_segments * ratio)) # HACK
    ## DEF
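
    # Worked example (hypothetical numbers): with start_time=0, end_time=100,
    # and skew_segments=10, a session starting at t=25 maps to
    # int(10 * 25/100) = segment 2. A session starting exactly at end_time is
    # nudged back to t=99 so that it lands in segment 9 rather than the
    # out-of-range index 10, which is also what the min() clamp guards against.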
## CLASS
    def sessionizeWorkload(self):
        """
            Split the Sessions based on the gap between operation times
        """
        LOG.info("Sessionizing sample workload")
        
        s = Sessionizer(self.metadata_db, randomize=self.random_sessionizer)
        
        # We first feed in all of the operations for each session
        nextSessId = -1
        origTotal = 0
        origHistogram = Histogram()
        sessions = [ ]
        for sess in self.metadata_db.Session.fetch():
            s.process(sess['session_id'], sess['operations'])
            nextSessId = max(nextSessId, sess['session_id'])
            origHistogram.put(len(sess['operations']))
            origTotal += len(sess['operations'])
            sessions.append(sess)
        ## FOR
        nextSessId += 1
        
        avg_ops = 0 if origHistogram.getSampleCount() == 0 else (origTotal / float(origHistogram.getSampleCount()))
        LOG.info("BEFORE Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f\n" +
                 "  Next Session Id: %d", \
                 origHistogram.getSampleCount(), \
                 avg_ops, nextSessId)
        
        # Then split them into separate sessions
        s.calculateSessions()
        newTotal = 0
        newHistogram = Histogram()

        # We have to do this because otherwise we will start to process
        # the new sessions that we just inserted... I know...
        for sess in sessions:
            newSessions = s.sessionize(sess, nextSessId)
            nextSessId += len(newSessions)
            
            # And then add all of our new sessions
            # Count the number of operations so that we can see the change
            if self.debug:
                LOG.debug("Split Session %d [%d ops] into %d separate sessions", \
                          sess['session_id'], len(sess['operations']), len(newSessions))
            totalOps = 0
            for newSess in newSessions:
                try:
                    newSess.save()
                except Exception:
                    LOG.error("Unexpected error when saving new Session\n%s", pformat(newSess))
                    raise
                newOpCtr = len(newSess['operations'])
                totalOps += newOpCtr
                newHistogram.put(newOpCtr)
                if self.debug:
                    LOG.debug("Session %d -> %d Ops", newSess['session_id'], newOpCtr)
            # Make sure that all of our operations end up in a session
            assert len(sess['operations']) == totalOps, \
                "Expected %d operations, but new sessions only had %d" % (len(sess['operations']), totalOps)
            newTotal += totalOps
            
            # Delete the original session now that it has been split
            sess.delete()
        ## FOR
        avg_ops = 0 if newHistogram.getSampleCount() == 0 else (newTotal / float(newHistogram.getSampleCount()))
        LOG.info("AFTER Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f", \
                 newHistogram.getSampleCount(), \
                 avg_ops)
        if self.debug:
            LOG.debug("Ops per Session\n%s", newHistogram)
            
        return
    ## DEF
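
    # Example of the split semantics (hypothetical numbers): a session whose
    # operations arrive at t = [0, 1, 2, 60, 61] with a long idle gap after
    # t=2 would be split into two sessions of 3 and 2 operations. The assert
    # above depends on exactly that invariant: 3 + 2 == 5, so no operation is
    # dropped or duplicated by the split.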
## CLASS