 def mKillShelf(self):
     ''' Declare shelf el croako and empty it of documents. '''
     lg.logInfo(
         "SHELF ", "t|%6.0f| kill storage shelf|%s| of server|%s|" %
         (G.env.now, self.ID, self.sServerID))
     self.bAlive = False
     self.mDestroyShelf()
    def cmBeforeAudit(self):
        '''
        Before each audit cycle, check to see if any servers
         have exceeded their lifetimes.
        '''
        for (sServerID, cServer) in (util.fnttSortIDDict(G.dID2Server)):
            fCurrentLife = cServer.mfGetMyCurrentLife()
            fFullLife = cServer.mfGetMyFullLife()
            fBirthday = cServer.mfGetMyBirthday()
            bServerAlive = not cServer.mbIsServerDead()
            bServerActive = cServer.bInUse

            # Log that we are examining this server, 
            #  but note if it's already dead.
            sStatus = "inuse" if bServerActive else ""
            sStatus = sStatus if bServerAlive else "dead"
            lg.logInfo("SHOCK ", "t|%6.0f| audit+end check svr|%s| "
                "life|%.0f|=|%.1f|yr %s" 
                % (G.env.now, sServerID, fFullLife, fFullLife/10000, 
                sStatus))
            NTRC.ntracef(3, "SHOK", "proc t|%6.0f| check expir? svr|%s| "
                "svrdefaulthalflife|%s| born|%s| currlife|%s|" 
                % (G.env.now, sServerID, G.fServerDefaultHalflife, 
                fBirthday, fCurrentLife))
            # Check to see if the server's lifetime has expired. 
            bDeadAlready = CShock.cmbShouldServerDieNow(sServerID)

        return G.nDeadOldServers
Example no. 3
    def cmBeforeAudit(self):
        '''
        Before each audit cycle, check to see if any servers
         have exceeded their lifetimes.
        '''
        for (sServerID, cServer) in (util.fnttSortIDDict(G.dID2Server)):
            fCurrentLife = cServer.mfGetMyCurrentLife()
            fFullLife = cServer.mfGetMyFullLife()
            fBirthday = cServer.mfGetMyBirthday()
            bServerAlive = not cServer.mbIsServerDead()
            bServerActive = cServer.bInUse

            # Log that we are examining this server,
            #  but note if it's already dead.
            sStatus = "inuse" if bServerActive else ""
            sStatus = sStatus if bServerAlive else "dead"
            lg.logInfo(
                "SHOCK ", "t|%6.0f| audit+end check svr|%s| "
                "life|%.0f|=|%.1f|yr %s" %
                (G.env.now, sServerID, fFullLife, fFullLife / 10000, sStatus))
            NTRC.ntracef(
                3, "SHOK", "proc t|%6.0f| check expir? svr|%s| "
                "svrdefaulthalflife|%s| born|%s| currlife|%s|" %
                (G.env.now, sServerID, G.fServerDefaultHalflife, fBirthday,
                 fCurrentLife))
            # Check to see if the server's lifetime has expired.
            bDeadAlready = CShock.cmbShouldServerDieNow(sServerID)

        return G.nDeadOldServers
Example no. 4
def dumpServerErrorStats():
    (TnHits,TnEmptyHits,TnAboveHiWater,TnMultipleHits) = (0,0,0,0)
    for sKey in sorted(G.dID2Shelf.keys()):
        cShelf = G.dID2Shelf[sKey]
        # Get vector of stats.
        (sID, sServerID, nQual, nHits, nEmptyHits, bAlive, nAboveHiWater, 
            nMultipleHits) = cShelf.mReportErrorStats()
        lg.logInfo("MAIN", "SERVERERR1 shelf|%s-%s| qual|%d| totalhits|%d| "
            "nonempty|%d| empty|%d| alive|%s|" 
            % (sServerID, sID, nQual, nHits, (nHits-nEmptyHits), 
            nEmptyHits,bAlive))
        lg.logInfo("MAIN", "SERVERERR2 shelf|%s-%s| qual|%d| totalhits|%d| "
            "abovehiwater|%d| multiples|%d|" 
            % (sServerID, sID, nQual, nHits, nAboveHiWater, nMultipleHits))
        TnHits          += nHits
        TnEmptyHits     += nEmptyHits
        TnAboveHiWater  += nAboveHiWater
        TnMultipleHits  += nMultipleHits
    lg.logInfo("MAIN", "SERVERERRTOTALS totalhits|%d| abovehiwater|%d| "
        "nonempty|%d| empty|%d| multiples|%d|" 
        % (TnHits, TnAboveHiWater, (TnHits-TnEmptyHits), TnEmptyHits, 
        TnMultipleHits))
    lg.logInfo("MAIN","DEADSERVERS ALL n|%d| |%s|" 
        % (len(G.lDeadServers), util.fnlSortIDList(G.lDeadServers)))
    lg.logInfo("MAIN","DEADSERVERS ACTIVE n|%d| |%s|" 
        % (len(G.lDeadActiveServers), util.fnlSortIDList(G.lDeadActiveServers)))
    return sServerID+"+"+sID
 def mRestoreSomeServerLifetimes(self):
     ''' For all the servers injured by the shock, restore life. '''
     lg.logInfo("SHOCK ", "t|%6.0f| shock end, restoring server lifetimes "
         "for ids|%s|" 
         % (G.env.now, self.lsServersShocked))
     # WARNING: list may be empty if server default life is infinite (zero).
     for sServerID in self.lsServersShocked:
         self.mRestoreSingleServerLifetime(sServerID)
     self.lsServersShocked = []
     return
Example no. 6
 def mRestoreSomeServerLifetimes(self):
     ''' For all the servers injured by the shock, restore life. '''
     lg.logInfo(
         "SHOCK ", "t|%6.0f| shock end, restoring server lifetimes "
         "for ids|%s|" % (G.env.now, self.lsServersShocked))
     # WARNING: list may be empty if server default life is infinite (zero).
     for sServerID in self.lsServersShocked:
         self.mRestoreSingleServerLifetime(sServerID)
     self.lsServersShocked = []
     return
 def mPlaceCollectionOnServer(self, mysCollID, mysServerID):
     # Send copy of collection to server.
     cServer = G.dID2Server[mysServerID]
     nDocs = cServer.mAddCollection(mysCollID, self.ID)
     # Record that this server has a copy of this collection.
     cColl = G.dID2Collection[mysCollID]
     cColl.lServerIDs.append(mysServerID)
     lg.logInfo("CLIENT", "client|%s| placed collection|%s| "
         "to server|%s|" 
         % (self.ID, mysCollID, mysServerID))
     return nDocs
 def cmAtEndOfRun(self):
     '''
     At end of run, check to see if any servers have exceeded
      their lifetimes.  It is possible for servers to die from
      shocks even if there is no auditing, and that counts
      because we evaluate every doc at the end of run.
     '''
     lg.logInfo("SHOCK ", "t|%6.0f| end of run checking all server lifetimes" 
         % (G.env.now))
     nResult = CShock.cmBeforeAudit()
     return
 def mCreateShelf(self):
     ''' Add a new shelf of the standard size for this Server.
         Called as needed when a doc arrives too large for available space.  
     '''
     cShelf = CShelf(self.ID, self.nQual, self.nShelfSize)
     lg.logInfo(
         "SERVER", "server |%s| created storage shelf|%s| "
         "quality|%s| size|%s|TB svrlifespan|%.0f| svrlife|%.0f|" %
         (self.ID, cShelf.ID, cShelf.nQual, self.nShelfSizeTB,
          self.mfGetMyOriginalLife(), self.mfGetMyCurrentLife()))
     return cShelf.ID
 def mShockHappens(self):
     ''' 
     Shock has happened.  Shorten server lives and schedule the 
     end of the shock cycle.
     '''
     G.nShocksTotal += 1
     G.lShockTimes.append(int(G.env.now))
     lg.logInfo("SHOCK ", "t|%6.0f| start to reduce life of |%s| servers "
         "by pct|%s|" 
         % (G.env.now, self.nSpan, self.nImpact))
     self.mReduceSomeServerLifetimes(self.nSpan, self.nImpact)
     return G.env.now
Example no. 12
 def cmAtEndOfRun(self):
     '''
     At end of run, check to see if any servers have exceeded
      their lifetimes.  It is possible for servers to die from
      shocks even if there is no auditing, and that counts
      because we evaluate every doc at the end of run.
     '''
     lg.logInfo(
         "SHOCK ",
         "t|%6.0f| end of run checking all server lifetimes" % (G.env.now))
     nResult = CShock.cmBeforeAudit()
     return
Example no. 13
def dumpServerUseStats():
    for sKey in sorted(G.dID2Shelf.keys()):
        cShelf = G.dID2Shelf[sKey]
        # Get vector of stats from shelf.
        (sID,sServerID,nQual,fExpolife,nCapacity,nHiWater,nCurrentUse) = \
            cShelf.mReportUseStats()
        lg.logInfo("MAIN", "SERVERUSE shelf|%s-%s| qual|%d| "
            "sectorexpolife|%.0f| size|%d| hiwater|%d| currentuse|%d| "
            "full%%|%d|" 
            % (sServerID, sID, nQual, fExpolife, nCapacity, nHiWater, 
            nCurrentUse, 100*nCurrentUse/nCapacity))
    return sServerID+"+"+sID
    def mScheduleGlitch(self):
        '''Wait for a glitch lifetime on this shelf.
        If the shelf died as a result of the glitch, stop
        rescheduling.  
        '''
        fNow = G.env.now
        NTRC.tracef(
            3, "LIFE", "proc schedule glitch t|%d| shelf|%s| alive|%s|" %
            (fNow, self.sShelfID, self.cShelf.mbIsShelfAlive()))
        while 1:
            fNow = G.env.now
            bAlive = self.cShelf.mbIsShelfAlive()
            if bAlive:
                self.fShelfLife = self.mfCalcCurrentGlitchLifetime(fNow)
                if self.fShelfLife > 0 and bAlive:
                    self.fShelfInterval = util.makeexpo(self.fShelfLife)
                    lg.logInfo(
                        "LIFETIME", "schedule  t|%6.0f| for shelf|%s| "
                        "interval|%.3f| freq|%d| life|%.3f|" %
                        (fNow, self.sShelfID, self.fShelfInterval,
                         self.nGlitchFreq, self.fShelfLife))
                    NTRC.tracef(
                        3, "LIFE", "proc schedule glitch shelf|%s| "
                        "interval|%.3f| based on life|%.3f| alive|%s| "
                        "waiting..." % (self.sShelfID, self.fShelfInterval,
                                        self.fShelfLife, bAlive))
                    yield G.env.timeout(self.fShelfInterval)

                    # ****** Glitch has now occurred. ******
                    # If correlated failure, step entirely outside the
                    #  Lifetime-Shelf-Server context to signal several servers.
                    if self.nGlitchSpan > 1:
                        from server import CServer
                        CServer.fnCorrFailHappensToAll(self.nGlitchSpan)
                    else:
                        self.mGlitchHappensNow()
                else:
                    NTRC.ntracef(
                        3, "LIFE", "proc glitch no freq or not alive, "
                        "set wait to infinity shelf|%s| freq|%d| life|%.3f| "
                        "interval|%.3f|" %
                        (self.sShelfID, self.nGlitchFreq, self.fShelfLife,
                         self.fShelfInterval))
                    yield G.env.timeout(G.fInfinity)
            else:
                break  # Because we have to use fako "while 1".
        # When shelf is not alive anymore, wait forever
        NTRC.ntracef(
            3, "LIFE", "proc glitch shelf no longer alive, set wait "
            "to infinity shelf|%s| freq|%d| life|%.3f| interval|%.3f|" %
            (self.sShelfID, self.nGlitchFreq, self.fShelfLife,
             self.fShelfInterval))
        yield G.env.timeout(G.fInfinity)
Example no. 15
 def mShockHappens(self):
     ''' 
     Shock has happened.  Shorten server lives and schedule the 
     end of the shock cycle.
     '''
     G.nShocksTotal += 1
     G.lShockTimes.append(int(G.env.now))
     lg.logInfo(
         "SHOCK ", "t|%6.0f| start to reduce life of |%s| servers "
         "by pct|%s|" % (G.env.now, self.nSpan, self.nImpact))
     self.mReduceSomeServerLifetimes(self.nSpan, self.nImpact)
     return G.env.now
def fnTimerInt(objTimer, xContext):
    '''\
    Server life-span timer was interrupted to reschedule it, 
    probably by a shock, and presumably to a shorter life.
    But the server is still alive.
    '''
    NTRC.trace(
        3, "interrupt %s delay %s called from %s at %s." %
        (xContext, objTimer.delay, objTimer, G.env.now))
    lg.logInfo(
        "SERVER", "interrupted t|%6.0f| context|%s| delay|%s|" %
        (G.env.now, xContext, objTimer.delay))
    return (objTimer, xContext)
Example no. 17
def dumpGlitchStats():
    
    for sKey in sorted(G.dID2Lifetime.keys()):
        cLifetime = G.dID2Lifetime[sKey]
        dStats = cLifetime.mReportGlitchStats()
        lg.logInfo("MAIN", "LIFETIME shelf|%s| lifetime|%s| freq|%s| "
            "impact|%s| decay|%s| maxlife|%s| count|%s| time|%.3f|" 
        % (
        dStats["sShelfID"], dStats["sLifetimeID"], 
        dStats["nGlitchFreq"], dStats["nImpactReductionPct"], 
        dStats["nGlitchDecayHalflife"], dStats["nGlitchMaxlife"], 
        dStats["nGlitches"], dStats["fGlitchTime"]))
        
    lg.logInfo("MAIN","LIFETIME Total glitches|%d|" % (G.nGlitchesTotal))
 def mGlitchHappensNow(self):
     """Start a glitch happening right now.
     May be invoked from outside a CLifetime instance as well as 
     from inside."""
     fNow = G.env.now
     NTRC.ntracef(
         3, "LIFE", "proc glitch wait expired t|%6.0f| "
         "for shelf|%s| freq|%d| life|%.3f| interval|%.3f|" %
         (fNow, self.sShelfID, self.nGlitchFreq, self.fShelfLife,
          self.fShelfInterval))
     self.mGlitchHappens(fNow)
     lg.logInfo(
         "LIFETIME", "glitchnow t|%6.0f| for shelf|%s| active|%s|" %
         (fNow, self.sShelfID, self.bGlitchActive))
 def mDestroyShelf(self):
     ''' Nuke all the copies on the shelf.  
         Can't delete the CShelf object, however.
     '''
     NTRC.ntracef(3, "SHLF", "proc mDestroyShelf1 shelf|%s| "
         "has ncopies|%s|" 
         % (self.ID, len(self.lCopyIDs)))
     lg.logInfo("SHELF ", "t|%6.0f| destroy shelf|%s| "
         "of svr|%s| ncopies|%s|" 
         % (G.env.now, self.ID, self.sServerID, 
         len(self.lCopyIDs)))
     lAllCopyIDs = self.lCopyIDs[:]  # DANGER: list modified inside loop, 
                                     #  requires deepcopy.
     for sCopyID in lAllCopyIDs:
             self.mDestroyCopy(sCopyID)
 def mDestroyShelf(self):
     ''' Nuke all the copies on the shelf.  
         Can't delete the CShelf object, however.
     '''
     NTRC.ntracef(
         3, "SHLF", "proc mDestroyShelf1 shelf|%s| "
         "has ncopies|%s|" % (self.ID, len(self.lCopyIDs)))
     lg.logInfo(
         "SHELF ", "t|%6.0f| destroy shelf|%s| "
         "of svr|%s| ncopies|%s|" %
         (G.env.now, self.ID, self.sServerID, len(self.lCopyIDs)))
     lAllCopyIDs = self.lCopyIDs[:]  # DANGER: list modified inside loop,
     #  requires deepcopy.
     for sCopyID in lAllCopyIDs:
         self.mDestroyCopy(sCopyID)
    def mRestoreSingleServerLifetime(self, mysServerID):
        ''' Restore normal lifetime to a single server. '''
        bDeadAlready = CShock.cmbShouldServerDieNow(mysServerID)

        cServer = G.dID2Server[mysServerID]
        if cServer.mbIsServerDead() or bDeadAlready:
            lg.logInfo("SHOCK ", "t|%6.0f| cannot restore dead server|%s| life" 
                % (G.env.now, mysServerID))
        else:
            fOriginalLifespan = cServer.mfGetMyOriginalLife()
            lg.logInfo("SHOCK ", "t|%6.0f| restoring server|%s| life to |%.0f|" 
                % (G.env.now, mysServerID, fOriginalLifespan))
            cServer.mRescheduleMyLife(fOriginalLifespan)
        cServer.mSetServerInShock(False)
        return mysServerID
Example no. 22
    def mAuditCycle(self,mynCycleInterval,mynSegments):
        '''\
        SimPy generator to schedule audit cycles for this collection.
        Starts an async process that ticks every 
        audit cycle forever.
        '''
        # Initially, wait for some small random interval
        # so that client audit cycles are not synchronized,
        # like Ethernet collision retry waits. 
        nRandTime = util.makeunif(0,mynCycleInterval/20)
# Nope, not any more.  No need for the random offset since there is
#  only one auditor.
#        yield G.env.timeout(nRandTime)
        # And now wait for one segment interval before starting the first seg.
        #  Seems odd, but consider an annual audit in quarterly segments:
        #  you don't want to wait a whole year before starting quarterly audits; 
        #  start after the first quarter.  
        nSegmentInterval = self.mCalcSegmentInterval(mynCycleInterval, 
            mynSegments)
        yield G.env.timeout(nSegmentInterval)
        
        while True:
            lg.logInfo("AUDIT2", "begin cycle t|%10.3f| auditid|%s| type|%s| "
                "cycle|%s| cli|%s| coll|%s| interval|%s| nsegments|%s|" 
                % (G.env.now, self.ID, self.TYPE, self.nNumberOfCycles, 
                self.sClientID, self.sCollectionID, mynCycleInterval, 
                mynSegments))
            
            # Start the collection audit and wait for it to finish.
            tCycleStartTime = G.env.now
            self.nRepairsThisCycle = 0
            eSyncEvent = G.env.event()
            G.env.process(
                self.mAuditCollection(mynCycleInterval, G.nAuditSegments, 
                self.sCollectionID, eSyncEvent))
            yield eSyncEvent

            lg.logInfo("AUDIT2", "end cycle   t|%10.3f| auditid|%s| cycle|%s| "
                "cli|%s| coll|%s| repairs|%d| total|%d| perms|%d| "
                "majority|%s| minority|%d|" 
                % (G.env.now, self.ID, self.nNumberOfCycles, self.sClientID, 
                self.sCollectionID, self.nRepairsThisCycle, 
                self.nRepairsTotal, self.nPermanentLosses, 
                self.nRepairsMajority, self.nRepairsMinority))

            self.nNumberOfCycles += 1
            tNextCycleStartTime = tCycleStartTime + mynCycleInterval
            yield G.env.timeout(tNextCycleStartTime - G.env.now)
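A tiny worked example of the first-wait logic commented above; sketch_calc_segment_interval is a hypothetical stand-in, assuming the real mCalcSegmentInterval simply divides the cycle evenly among segments:

# Hedged sketch only: assumes the segment interval is the cycle length divided
#  evenly by the number of segments, matching the quarterly-audit comment above
#  (an annual cycle in 4 segments starts after one quarter, not a whole year).
def sketch_calc_segment_interval(fCycleInterval, nSegments):
    return fCycleInterval * 1.0 / nSegments

# Worked example: a 12-month cycle audited in quarterly segments waits 3 months.
assert sketch_calc_segment_interval(12.0, 4) == 3.0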
def fnTimerCall(objTimer, xContext):
    '''\
    Server life-span timer has completed, and the server must die.
    Set the timer event to release any process waiting for it.
    Declare the server to be el croako.  
    '''
    NTRC.trace(
        3, "callback %s delay %s called from %s at %s." %
        (xContext, objTimer.delay, objTimer, G.env.now))
    objTimer.setevent()
    cServer = xContext[0]
    cServer.mKillServer()
    lg.logInfo(
        "SERVER", "timercalled t|%6.0f| context|%s| delay|%s|" %
        (G.env.now, xContext, objTimer.delay))
    return (objTimer, xContext)
Example no. 25
def dumpCollectionStats(mysCollID):
    cColl = G.dID2Collection[mysCollID]
    dStats = cColl.mdReportCollectionStats()

    (sCollIDx,sClientIDx,nServers,nDocs, nDocsOkay, nDocsInjured, 
        nDocsForensics, nDocsLost) = \
        (mysCollID, 
        dStats["sClientID"], dStats["nServers"], dStats["nDocs"], 
        dStats["nOkay"], dStats["nRepairsMajority"], dStats["nRepairsMinority"], 
        dStats["nLost"])

    lg.logInfo("MAIN", "COLLECTIONTOTALS client|%s| collection|%s| "
        "nservers|%s| ndocs|%s| nokay|%s| nmajority|%s| nminority|%s| "
        "nlost|%s| "
        % (sClientIDx, sCollIDx, nServers, nDocs, nDocsOkay, nDocsInjured, 
        nDocsForensics, nDocsLost))
Example no. 26
    def mRestoreSingleServerLifetime(self, mysServerID):
        ''' Restore normal lifetime to a single server. '''
        bDeadAlready = CShock.cmbShouldServerDieNow(mysServerID)

        cServer = G.dID2Server[mysServerID]
        if cServer.mbIsServerDead() or bDeadAlready:
            lg.logInfo(
                "SHOCK ", "t|%6.0f| cannot restore dead server|%s| life" %
                (G.env.now, mysServerID))
        else:
            fOriginalLifespan = cServer.mfGetMyOriginalLife()
            lg.logInfo(
                "SHOCK ", "t|%6.0f| restoring server|%s| life to |%.0f|" %
                (G.env.now, mysServerID, fOriginalLifespan))
            cServer.mRescheduleMyLife(fOriginalLifespan)
        cServer.mSetServerInShock(False)
        return mysServerID
 def cmbShouldServerDieNow(self, mysServerID):
     ''' 
     If the server's (possibly reduced) lifetime has expired, 
      kill it rather than restoring it to a full life.
     '''
     cServer = G.dID2Server[mysServerID]
     fCurrentLife = cServer.mfGetMyCurrentLife()
     fFullLife = cServer.mfGetMyFullLife()
     fBirthday = cServer.mfGetMyBirthday()
     bServerAlive = not cServer.mbIsServerDead()
     if (G.fServerDefaultHalflife > 0
         and fCurrentLife > 0
         and fFullLife <= G.env.now
         and bServerAlive
         ):
         # Server has overstayed its welcome.  Kill it.  
         sInUse = "currently in use" if cServer.mbIsServerInUse() else ""
         sShockVictim = "shock victim" if cServer.mbIsServerInShock() else ""
         lg.logInfo("SHOCK ", "t|%6.0f| kill svr|%s| "
             "born|%.0f| life|%.0f|=|%.1f|yr "
             "expired %s %s" 
             % (G.env.now, mysServerID, fBirthday, 
             fCurrentLife, fCurrentLife/10000, 
             sInUse, sShockVictim))
         NTRC.ntracef(3, "SHOK", "proc t|%6.0f| expired svr|%s| "
             "svrdefaulthalflife|%s| born|%.0f| currlife|%.0f|" 
             % (G.env.now, mysServerID, G.fServerDefaultHalflife, 
             fBirthday, fCurrentLife))
         result = cServer.mKillServer()
         G.nDeadOldServers += 1
         bResult = True
         # Now check to see if the server died because of the shock.
         #  Is the current life less than the original life?
         # Philosophical question: if the shock type 2 caused your new, 
         #  recalculated life to be longer than your original life, 
         #  can your death reasonably be attributed to the shock?
         #  Answer = no, because without the shock you would have
         #  died even earlier.  Tricky, though.  
         fOriginalLife = cServer.mfGetMyOriginalLife()
         if fCurrentLife < fOriginalLife:
             G.nDeathsDueToShock += 1
             G.lDeathsDueToShock.append(mysServerID)
     else:
         bResult = False
     return bResult
def makeServers(mydServers):
    for sServerName in mydServers:
        (nServerQual,nShelfSize) = mydServers[sServerName][0]
        cServer = server.CServer(sServerName,nServerQual,nShelfSize)
        sServerID = cServer.ID
        G.lAllServers.append(cServer)
        fCurrentLife = cServer.mfGetMyCurrentLife()
        lg.logInfo("MAIN","created server|%s| quality|%s| shelfsize|%s|TB "
            "name|%s| life|%.0f|" 
            % (sServerID, nServerQual, nShelfSize, sServerName, fCurrentLife))
        # Invert the server list so that clients can look up 
        # all the servers that satisfy a quality criterion.  
        if nServerQual in G.dQual2Servers:
            G.dQual2Servers[nServerQual].append([sServerName,sServerID])
        else:
            G.dQual2Servers[nServerQual] = [[sServerName,sServerID]]
        NTRC.ntracef(5,"SVRS","proc makeServers dQual2Servers qual|%s| servers|%s|" % (nServerQual,G.dQual2Servers[nServerQual]))
    return G.dQual2Servers
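A small hedged sketch of how the inverted G.dQual2Servers mapping built above can be consulted; the helper below is illustrative only, not part of the simulation:

# Hypothetical helper (assumption, not in the source): list the server IDs
#  recorded for one quality level, using the {quality: [[name, ID], ...]}
#  structure that makeServers builds.
def sketch_server_ids_for_quality(dQual2Servers, nQual):
    # Each entry is a [sServerName, sServerID] pair, as appended above.
    return [sID for (_sName, sID) in dQual2Servers.get(nQual, [])]

# Example with a hand-built mapping of the same shape.
dDemo = {1: [["svr_cheap", "V1"]], 3: [["svr_good", "V2"], ["svr_good2", "V3"]]}
assert sketch_server_ids_for_quality(dDemo, 3) == ["V2", "V3"]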
Example no. 29
 def cmbShouldServerDieNow(self, mysServerID):
     ''' 
     If the server's (possibly reduced) lifetime has expired, 
      kill it rather than restoring it to a full life.
     '''
     cServer = G.dID2Server[mysServerID]
     fCurrentLife = cServer.mfGetMyCurrentLife()
     fFullLife = cServer.mfGetMyFullLife()
     fBirthday = cServer.mfGetMyBirthday()
     bServerAlive = not cServer.mbIsServerDead()
     if (G.fServerDefaultHalflife > 0 and fCurrentLife > 0
             and fFullLife <= G.env.now and bServerAlive):
         # Server has overstayed its welcome.  Kill it.
         sInUse = "currently in use" if cServer.mbIsServerInUse() else ""
         sShockVictim = "shock victim" if cServer.mbIsServerInShock(
         ) else ""
         lg.logInfo(
             "SHOCK ", "t|%6.0f| kill svr|%s| "
             "born|%.0f| life|%.0f|=|%.1f|yr "
             "expired %s %s" %
             (G.env.now, mysServerID, fBirthday, fCurrentLife,
              fCurrentLife / 10000, sInUse, sShockVictim))
         NTRC.ntracef(
             3, "SHOK", "proc t|%6.0f| expired svr|%s| "
             "svrdefaulthalflife|%s| born|%.0f| currlife|%.0f|" %
             (G.env.now, mysServerID, G.fServerDefaultHalflife, fBirthday,
              fCurrentLife))
         result = cServer.mKillServer()
         G.nDeadOldServers += 1
         bResult = True
         # Now check to see if the server died because of the shock.
         #  Is the current life less than the original life?
         # Philosophical question: if the shock type 2 caused your new,
         #  recalculated life to be longer than your original life,
         #  can your death reasonably be attributed to the shock?
         #  Answer = no, because without the shock you would have
         #  died even earlier.  Tricky, though.
         fOriginalLife = cServer.mfGetMyOriginalLife()
         if fCurrentLife < fOriginalLife:
             G.nDeathsDueToShock += 1
             G.lDeathsDueToShock.append(mysServerID)
     else:
         bResult = False
     return bResult
Example no. 30
def dumpAuditStats():
    (TnNumberOfCycles, TnRepairsTotal, TnPermanentLosses, TnRepairsMajority,
        TnRepairsMinority) = (0,0,0,0,0)
    if G.nAuditCycleInterval:       # If there is any auditing in this run,...
        for sKey in sorted(G.dID2Audit.keys()):
            cAudit = G.dID2Audit[sKey]
            # Get vector of stats for one Audit instance.
            dStats = cAudit.mdReportAuditStats()
            (ID,sClientID,sCollectionID,sServerID
             ,nNumberOfCycles,nRepairsTotal
             ,nPermanentLosses,nRepairsMajority,nRepairsMinority) \
            = \
            (sKey,dStats["sClientID"],dStats["sCollectionID"],"*"
             ,dStats["nNumberOfCycles"],dStats["nRepairsTotal"]
             ,dStats["nPermanentLosses"],dStats["nRepairsMajority"]
             ,dStats["nRepairsMinority"]) 
            (nFrequency,nSegments) = (dStats["nFrequency"],dStats["nSegments"])
            lg.logInfo("MAIN", "AUDITS id|%s| client|%s| coll|%s| server|%s| "
                "ncycles|%s| nrepairs|%s| nlosses|%s| nmajority|%s| "
                "nminority|%s|" 
                % (ID, sClientID, sCollectionID, sServerID, nNumberOfCycles, 
                nRepairsTotal, nPermanentLosses, nRepairsMajority, 
                nRepairsMinority))
    
            # Accumulate totals.
            TnNumberOfCycles    +=  nNumberOfCycles
            TnRepairsTotal      +=  nRepairsTotal
            TnPermanentLosses   +=  nPermanentLosses
            TnRepairsMajority   +=  nRepairsMajority
            TnRepairsMinority   +=  nRepairsMinority
            # A couple of these are just declarations, not to be totalled.  
            TnFrequency         =   nFrequency
            TnSegments          =   nSegments

    else:                           # If no auditing in this run.
        TnNumberOfCycles = TnRepairsTotal = 0
        TnPermanentLosses = TnRepairsMajority = TnRepairsMinority = 0
        TnFrequency = TnSegments = 0

    lg.logInfo("MAIN", "AUDITTOTALS ncycles|%s| nfrequency|%s| nsegments|%s| "
        "nrepairs|%s| nmajority|%s| nminority|%s| nlost|%s| " 
        % (TnNumberOfCycles, TnFrequency, TnSegments, TnRepairsTotal, 
        TnRepairsMajority, TnRepairsMinority, TnPermanentLosses))
    return 
 def fnsInventNewServer(cls):
     '''Class method: Create another server on the fly.
     Use the info from some old one that is still alive to create
     a new one.  Change the long name to make it unique.  
     Return the new server ID.
     '''
     tnow = datetime.now()
     lLiveServerIDs = cls.fnlListLiveServerIDs()
     sServerID = lLiveServerIDs[0]
     cServer = G.dID2Server[sServerID]
     sNewName = (cServer.sName + "_" + util.fnsGetTimeStamp() + "_" +
                 tnow.strftime("%H%M%S.%f"))
     cNewServer = CServer(sNewName, cServer.nQual, cServer.nShelfSizeTB)
     lg.logInfo(
         "SERVER", "created new server|%s| name|%s| "
         "quality|%s| size|%s|TB svrlife|%.0f|" %
         (cNewServer.ID, sNewName, cNewServer.nQual,
          cNewServer.nShelfSizeTB, cNewServer.mfGetMyCurrentLife()))
     return cNewServer.ID
    def mAge_shelf(self, mynLifeParam):
        ''' An entire shelf fails.  Remove all the docs it contained.
            Eventually, this will trigger a repair event and make the 
            collection more vulnerable during the repair.  
        '''
        fShelfLife = util.makeexpo(mynLifeParam)
        lg.logInfo(
            "SERVER", "mAge_shelf set lifetime time|%6.0f| shelf|%s| "
            "next lifetime|%.3f|khr" % (G.env.now, self.ID, fShelfLife))
        NTRC.tracef(
            3, "SHLF", "proc mAge_shelf  time|%6.0f| shelf|%s| "
            "next lifetime|%.3f|khr" % (G.env.now, self.ID, fShelfLife))
        yield G.env.timeout(fShelfLife)

        # S H E L F  F A I L S
        G.nTimeLastEvent = G.env.now
        self.bAlive = False  # Shelf can no longer be used to store docs.
        NTRC.tracef(
            3, "SHLF", "proc mAge_shelf  time|%d| shelf|%s| shelf_error" %
            (G.env.now, self.ID))
        lg.logInfo(
            "SERVER", "storage shelf failed time|%6.0f| server|%s| "
            "shelf|%s| lost |%d| docs" %
            (G.env.now, self.sServerID, self.ID, len(self.lCopyIDs)))
        # This whole shelf is a goner.  Kill it.
        NTRC.tracef(
            5, "SHLF", "proc mAge_shelf kill contents ldocs|%s| "
            "lcopies|%s|" % (self.lDocIDs, self.lCopyIDs))
        # Note that we have to copy the list before modifying it and
        # iterate over the copy of the list.
        # Standard problem with updating an iterable inside the for loop.
        templCopyIDs = copy.deepcopy(self.lCopyIDs)
        for sCopyID in templCopyIDs:
            sDocID = G.dID2Copy[sCopyID].sDocID
            self.mDestroyCopy(sCopyID)
            #            G.dID2Server[self.sServerID].mDestroyDocument(sDocID,self.ID)
            G.dID2Server[self.sServerID].mDestroyCopy(sCopyID, sDocID, self.ID)
            self.mReportDocumentLost(sDocID)
        NTRC.tracef(
            3, "FAIL", "proc t|%d| shelf failure server|%s| qual|%d| "
            "shelf|%s| docs|%d|" %
            (G.env.now, self.sServerID, G.dID2Server[self.sServerID].nQual,
             self.ID, len(templCopyIDs)))
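A self-contained sketch of the copy-before-iterate pattern that the comment above warns about; removing items from a list while looping over that same list would skip elements:

import copy

lCopyIDs = ["copy1", "copy2", "copy3"]

def destroy_copy(sCopyID):
    # Stands in for mDestroyCopy: mutates the list we are (logically) iterating.
    lCopyIDs.remove(sCopyID)

# Iterate over a snapshot (deepcopy here; lCopyIDs[:] also works for strings)
#  so every element is visited exactly once despite the removals.
for sCopyID in copy.deepcopy(lCopyIDs):
    destroy_copy(sCopyID)
assert lCopyIDs == []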
Example no. 33
    def mAuditCollection(self, mynCycleInterval, mynSegments, mysCollectionID, 
            myeCallerSyncEvent):
        '''\
        SimPy generator to audit an entire collection.
        Divide the collection into segments and schedule audits
        for each segment in turn.
        '''
        fTimeCycleBegin = G.env.now
        lg.logInfo("AUDIT2","begin colln t|%10.3f| auditid|%s| cycle|%s| cli|%s| coll|%s|" % (G.env.now,self.ID,self.nNumberOfCycles,self.sClientID,self.sCollectionID))

        for iThisSegment in range(mynSegments):
            tSegmentStartTime = G.env.now
            nSegmentInterval = self.mCalcSegmentInterval(mynCycleInterval, 
                mynSegments)
            bLastSegment = (iThisSegment == mynSegments-1)

            self.lDocsThisSegment = self.mIdentifySegment(mysCollectionID, 
                mynSegments, iThisSegment)
            eSyncEvent = G.env.event()
            G.env.process(
                self.mAuditSegment(iThisSegment, self.lDocsThisSegment, 
                mysCollectionID, eSyncEvent))
            # Wait for completion of segment and its allotted time.
            yield eSyncEvent
            tNextSegmentStartTime = tSegmentStartTime + nSegmentInterval
            NTRC.tracef(3, "AUD2", "proc AuditCollection1 now|%s| tstart|%s| "
                "tnext|%s| tinterval|%s| blastseg|%s|" 
                % (G.env.now, tSegmentStartTime, tNextSegmentStartTime, 
                nSegmentInterval, bLastSegment))
            yield G.env.timeout(tNextSegmentStartTime - G.env.now)
        
        fTimeCycleEnd = G.env.now
        self.fTimeCycleLength = fTimeCycleEnd - fTimeCycleBegin
        lg.logInfo("AUDIT2", "end colln   t|%10.3f| auditid|%s| cycle|%s| "
            "cli|%s| coll|%s| repairs|%d| total|%d| perms|%d| "
            "majority|%s| minority|%d| duration|%9.3f|" 
            % (G.env.now, self.ID, self.nNumberOfCycles, self.sClientID, 
            self.sCollectionID, self.nRepairsThisCycle, self.nRepairsTotal, 
            self.nPermanentLosses, self.nRepairsMajority, 
            self.nRepairsMinority, self.fTimeCycleLength))
        # Tell the caller that we finished.
        myeCallerSyncEvent.succeed(value=self.nNumberOfCycles)
 def mReduceSomeServerLifetimes(self, mynSpan, mynImpact):
     ''' 
     Find a shockspan-wide subset of servers and reduce their
     expected lifetimes by the stated reduction percentage.
     '''
     lServersToShock = server.CServer.fnlSelectServerVictims(mynSpan)
     fReduction = mynImpact * 1.0 / 100.0 
     NTRC.ntracef(3, "SHOK", "proc reduce servers|%s| by|%s|" 
         % (lServersToShock, fReduction))
     for sServerID in lServersToShock:
         lg.logInfo("SHOCK ", "t|%6.0f| reduce svr|%s| life by pct|%s|" 
             % (G.env.now, sServerID, self.nImpact))
         cServer = G.dID2Server[sServerID]
         fOriginalLife = float(cServer.mfGetMyOriginalLife())
         if fOriginalLife > 0:
             self.mReduceSingleServerLifetime(sServerID, fReduction)
             self.lsServersShocked.append(sServerID)
         else:
             lg.logInfo("SHOCK ", "t|%6.0f| cannot reduce svr|%s| life|%.0f|"
                 % (G.env.now, sServerID, fOriginalLife)) 
Example no. 36
 def mTestCollection(self):
     ''' Return a list, maybe empty, of documents declared missing
         from this collection.  
     '''
     bOldLogState = G.bDoNotLogInfo
     if G.bShortLog: G.bDoNotLogInfo = True
     lDeadDocIDs = list()
     for sDocID in self.lDocIDs:
         cDoc = G.dID2Document[sDocID]
         (bOkay, bInjured, bForensics, bLost) = cDoc.mTestCopies()
         NTRC.ntracef(
             3, "COLL", "proc TestColl1 coll|%s| tests doc|%s| "
             "okay|%s| injured|%s| forensics|%s| lost|%s|" %
             (self.ID, sDocID, bOkay, bInjured, bForensics, bLost))
         # Merge new info with old info from audits.
         (bOkay,bInjured,bForensics,bLost) = \
             cDoc.mMergeEvaluation(bOkay,bInjured,bForensics,bLost)
         # Update stats of document statuses.
         self.nDocsOkay += 1 if bOkay else 0
         self.nDocsMajorityRepair += 1 if bInjured else 0
         self.nDocsMinorityRepair += 1 if bForensics else 0
         self.nDocsLost += 1 if bLost else 0
         # Update lost list.
         if bLost:
             lDeadDocIDs.append(sDocID)
             NTRC.ntracef(
                 3, "COLL", "proc TestColl2 dead doc|%s| in coll|%s| " %
                 (sDocID, self.ID))
         NTRC.ntracef(
             3, "COLL", "proc TestColl3 coll|%s| doc|%s| okay|%s| "
             "majority|%s| minority|%s| lost|%s|" %
             (self.ID, sDocID, bOkay, bInjured, bForensics, bLost))
         if not bOkay:
             (nMajority, nMinority) = cDoc.mGetRepairCounts()
             lg.logInfo(
                 "DOCUMENT", "doc injured cli|%s| coll|%s| doc|%s| "
                 "majority|%s|%s| minority|%s|%s| lost|%s|" %
                 (self.sClientID, self.ID, sDocID, bInjured, nMajority,
                  bForensics, nMinority, bLost))
     G.bDoNotLogInfo = bOldLogState
     return lDeadDocIDs
 def mGlitchHappens(self, myfNow):
     self.bGlitchActive = True
     self.nGlitches += 1
     G.nGlitchesTotal += 1
     lg.logInfo(
         "LIFETIME", "glitch    t|%6.0f|  on shelf|%s| num|%s| "
         "impactpct|%d| decayhalflife|%d| span|%d| maxlife|%d| gtotal|%s|" %
         (myfNow, self.sShelfID, self.nGlitches, self.nImpactReductionPct,
          self.nGlitchDecayHalflife, self.nGlitchSpan, self.nGlitchMaxlife,
          G.nGlitchesTotal))
     self.fGlitchBegin = float(G.env.now)
     NTRC.tracef(
         3, "LIFE", "proc happens1 t|%.3f| shelf|%s| num|%s| impact|%d| "
         "decayhalflife|%d| span|%d| maxlife|%d|" %
         (myfNow, self.sShelfID, self.nGlitches, self.nImpactReductionPct,
          self.nGlitchDecayHalflife, self.nGlitchSpan, self.nGlitchMaxlife))
     ''' If this is a 100% glitch:
         - Declare server, not just shelf, to be dead.
         - Auditor will eventually discover the problem and 
            call client to inform that server is dead.  
     '''
     sServerID = self.cShelf.sServerID
     if G.dID2Server[sServerID].bDead or self.nImpactReductionPct == 100:
         self.cShelf.bAlive = False
         #sServerID = self.cShelf.sServerID
         cServer = G.dID2Server[sServerID]
         NTRC.ntracef(
             3, "LIFE", "proc happens2 glitch 100pct or server dead "
             "id|%s| shelf|%s| svr|%s|" %
             (self.ID, self.cShelf.ID, sServerID))
         cServer.mServerDies()
         NTRC.ntracef(
             3, "LIFE", "proc happens3 life|%s| killed server |%s|" %
             (self.ID, sServerID))
         lg.logInfo(
             "LIFETIME", "100pct glitch on shelf |%s| "
             "of server|%s| - all docs lost" % (self.sShelfID, sServerID))
     else:
         self.mInjectError(self.nImpactReductionPct,
                           self.nGlitchDecayHalflife, self.nGlitchMaxlife)
     return (self.nGlitches, self.sShelfID)
def makeServers(mydServers):
    for sServerName in mydServers:
        (nServerQual, nShelfSize) = mydServers[sServerName][0]
        cServer = server.CServer(sServerName, nServerQual, nShelfSize)
        sServerID = cServer.ID
        G.lAllServers.append(cServer)
        fCurrentLife = cServer.mfGetMyCurrentLife()
        lg.logInfo(
            "MAIN", "created server|%s| quality|%s| shelfsize|%s|TB "
            "name|%s| life|%.0f|" %
            (sServerID, nServerQual, nShelfSize, sServerName, fCurrentLife))
        # Invert the server list so that clients can look up
        # all the servers that satisfy a quality criterion.
        if nServerQual in G.dQual2Servers:
            G.dQual2Servers[nServerQual].append([sServerName, sServerID])
        else:
            G.dQual2Servers[nServerQual] = [[sServerName, sServerID]]
        NTRC.ntracef(
            5, "SVRS", "proc makeServers dQual2Servers qual|%s| servers|%s|" %
            (nServerQual, G.dQual2Servers[nServerQual]))
    return G.dQual2Servers
    def mServerIsDead(self, mysServerID, mysCollID):
        '''\
        Auditor calls us: a server is dead, no longer 
         accepting documents.  Remove server from active list, 
         find a new server, populate it.  
        '''
        NTRC.ntracef(3, "CLI", "proc deadserver1 client|%s| place coll|%s| "
            "to|%d|servers" 
            % (self.ID, mysCollID, len(self.lServersToUse)))
        lg.logInfo("CLIENT", "server died cli|%s| removed svr|%s| coll|%s| " 
            % (self.ID, mysServerID, mysCollID))

        cColl = G.dID2Collection[mysCollID]
        cColl.lServerIDs.remove(mysServerID)
        nCollValue = cColl.nValue
        lServersForCollection = self.mSelectServersForCollection(nCollValue)
        # The distribution params have already limited the 
        # set of servers in the select-for-collection routine.
        # If there are servers available, pick one.  Otherwise, 
        #  create a new server that's just like an old one and use it.
        if lServersForCollection:
            sServerToUse = lServersForCollection.pop(0)
        else:
            sServerToUse = CServer.fnsInventNewServer()
        lg.logInfo("CLIENT", "client|%s| assign new server|%s| to replace|%s|" 
            % (self.ID, sServerToUse, mysServerID))
        nDocs = self.mPlaceCollectionOnServer(mysCollID, sServerToUse)
        lg.logInfo("CLIENT", "client|%s| provisioned new server|%s| "
            "collection|%s| ndocs|%s|" 
            % (self.ID, sServerToUse, mysCollID, nDocs))
        self.nServerReplacements += 1
        return sServerToUse
def testAllClients(mylClients):
    for cClient in mylClients:
        lDeadDocIDs = cClient.mTestClient()
        sClientID = cClient.ID
        if len(lDeadDocIDs) > 0:
            if G.bShortLog:
                G.bDoNotLogInfo = True
            for sDocID in lDeadDocIDs:
                cDoc = G.dID2Document[sDocID]
                lg.logInfo(
                    "MAIN", "client |%s| lost doc|%s| size|%s|" %
                    (sClientID, sDocID, cDoc.nSize))
            G.bDoNotLogInfo = False
            lg.logInfo(
                "MAIN",
                "BAD NEWS: Total documents lost by client |%s| in all servers |%d|"
                % (sClientID, len(lDeadDocIDs)))
        else:
            lg.logInfo(
                "MAIN",
                "GOOD NEWS: Total documents lost by client |%s| in all servers |%d|"
                % (sClientID, len(lDeadDocIDs)))

        # Now log stats for all the collections in the client.
        lCollectionIDs = cClient.mListCollectionIDs()
        for sCollID in lCollectionIDs:
            dumpuse.dumpCollectionStats(sCollID)
    def mAge_shelf(self, mynLifeParam):
        ''' An entire shelf fails.  Remove all the docs it contained.
            Eventually, this will trigger a repair event and make the 
            collection more vulnerable during the repair.  
        '''
        fShelfLife = util.makeexpo(mynLifeParam)
        lg.logInfo("SERVER", "mAge_shelf set lifetime time|%6.0f| shelf|%s| "
            "next lifetime|%.3f|khr" 
            % (G.env.now,self.ID,fShelfLife))
        NTRC.tracef(3, "SHLF", "proc mAge_shelf  time|%6.0f| shelf|%s| "
            "next lifetime|%.3f|khr" 
            % (G.env.now,self.ID,fShelfLife))
        yield G.env.timeout(fShelfLife)

        # S H E L F  F A I L S 
        G.nTimeLastEvent = G.env.now
        self.bAlive = False         # Shelf can no longer be used to store docs.
        NTRC.tracef(3, "SHLF", "proc mAge_shelf  time|%d| shelf|%s| shelf_error" 
            % (G.env.now,self.ID))
        lg.logInfo("SERVER", "storage shelf failed time|%6.0f| server|%s| "
            "shelf|%s| lost |%d| docs" 
            % (G.env.now,self.sServerID,self.ID,len(self.lCopyIDs)))
        # This whole shelf is a goner.  Kill it. 
        NTRC.tracef(5, "SHLF", "proc mAge_shelf kill contents ldocs|%s| "
            "lcopies|%s|" 
            % (self.lDocIDs,self.lCopyIDs)) 
        # Note that we have to copy the list before modifying it and 
        # iterate over the copy of the list.  
        # Standard problem with updating an iterable inside the for loop.
        templCopyIDs = copy.deepcopy(self.lCopyIDs)
        for sCopyID in templCopyIDs:
            sDocID = G.dID2Copy[sCopyID].sDocID
            self.mDestroyCopy(sCopyID)
#            G.dID2Server[self.sServerID].mDestroyDocument(sDocID,self.ID)
            G.dID2Server[self.sServerID].mDestroyCopy(sCopyID,sDocID,self.ID)
            self.mReportDocumentLost(sDocID)
        NTRC.tracef(3, "FAIL", "proc t|%d| shelf failure server|%s| qual|%d| "
            "shelf|%s| docs|%d|" 
            % (G.env.now, self.sServerID, G.dID2Server[self.sServerID].nQual, 
            self.ID,len(templCopyIDs)))
 def mWaitForShockToHappen(self, mynHalflife):
     ''' 
     Generator that waits for shock event.  
     
     Infinite loop:
     - Schedule shock event
     - Execute shock event
     - Schedule end of shock (maybe infinite)
     - Execute end of shock
     '''
     while True:
          # Shocks happen every so often, not just once.
         fNewLife = util.makeshocklife(mynHalflife)
         lg.logInfo("SHOCK ", "t|%6.0f| waiting for shock in|%.0f| "
             "from hl|%s| at|%.0f|" 
             % (G.env.now, fNewLife, mynHalflife, (G.env.now+fNewLife)))
         # Suspend action until shock happens.
         yield G.env.timeout(fNewLife)
         # Shock has happened.
         lg.logInfo("SHOCK ", "t|%6.0f| shock happens now, maxlife|%s|" 
             % (G.env.now, self.nMaxlife))
         self.mShockHappens()
         # If maxlife nonzero, then wait and expire shock;
         #  else, never expires, so wait forever and don't 
         #  start another shock cycle.
         if self.nMaxlife > 0:
             lg.logInfo("SHOCK ", "t|%6.0f| waiting for shock to expire "
                 "in|%.0f| at|%.0f|" 
                 % (G.env.now, self.nMaxlife, (G.env.now+self.nMaxlife)))
             yield G.env.timeout(self.nMaxlife)
             self.mShockExpires()
         else:
             yield G.env.timeout(G.fInfinity)
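A stripped-down, standalone SimPy sketch of the same wait-then-fire loop; random.expovariate is only a stand-in for util.makeshocklife, whose exact distribution is defined elsewhere in the source:

# Illustrative sketch, not the simulation's own module: one SimPy process that
#  repeatedly sleeps a random interval and then "fires" a shock.
import random
import simpy

def sketch_shock_process(env, fMeanWait):
    while True:
        fWait = random.expovariate(1.0 / fMeanWait)  # stand-in for util.makeshocklife
        yield env.timeout(fWait)                     # suspend until the shock time
        print("t=%8.1f shock happens" % env.now)

env = simpy.Environment()
env.process(sketch_shock_process(env, fMeanWait=1000.0))
env.run(until=5000.0)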
Example no. 44
 def mReduceSomeServerLifetimes(self, mynSpan, mynImpact):
     ''' 
     Find a shockspan-wide subset of servers and reduce their
     expected lifetimes by the stated reduction percentage.
     '''
     lServersToShock = server.CServer.fnlSelectServerVictims(mynSpan)
     fReduction = mynImpact * 1.0 / 100.0
     NTRC.ntracef(
         3, "SHOK",
         "proc reduce servers|%s| by|%s|" % (lServersToShock, fReduction))
     for sServerID in lServersToShock:
         lg.logInfo(
             "SHOCK ", "t|%6.0f| reduce svr|%s| life by pct|%s|" %
             (G.env.now, sServerID, self.nImpact))
         cServer = G.dID2Server[sServerID]
         fOriginalLife = float(cServer.mfGetMyOriginalLife())
         if fOriginalLife > 0:
             self.mReduceSingleServerLifetime(sServerID, fReduction)
             self.lsServersShocked.append(sServerID)
         else:
             lg.logInfo(
                 "SHOCK ", "t|%6.0f| cannot reduce svr|%s| life|%.0f|" %
                 (G.env.now, sServerID, fOriginalLife))
def testAllClients(mylClients):
    for cClient in mylClients:
        lDeadDocIDs = cClient.mTestClient()
        sClientID = cClient.ID
        if len(lDeadDocIDs) > 0:
            if G.bShortLog:
                G.bDoNotLogInfo = True
            for sDocID in lDeadDocIDs:
                cDoc = G.dID2Document[sDocID]
                lg.logInfo("MAIN","client |%s| lost doc|%s| size|%s|" % (sClientID,sDocID,cDoc.nSize))
            G.bDoNotLogInfo = False
            lg.logInfo("MAIN","BAD NEWS: Total documents lost by client |%s| in all servers |%d|" % (sClientID,len(lDeadDocIDs)))
        else:
            lg.logInfo("MAIN","GOOD NEWS: Total documents lost by client |%s| in all servers |%d|" % (sClientID,len(lDeadDocIDs)))
        
        # Now log stats for all the collections in the client.
        lCollectionIDs = cClient.mListCollectionIDs()
        for sCollID in lCollectionIDs:
            dumpuse.dumpCollectionStats(sCollID)
Example no. 46
 def mWaitForShockToHappen(self, mynHalflife):
     ''' 
     Generator that waits for shock event.  
     
     Infinite loop:
     - Schedule shock event
     - Execute shock event
     - Schedule end of shock (maybe infinite)
     - Execute end of shock
     '''
     while True:
          # Shocks happen every so often, not just once.
         fNewLife = util.makeshocklife(mynHalflife)
         lg.logInfo(
             "SHOCK ", "t|%6.0f| waiting for shock in|%.0f| "
             "from hl|%s| at|%.0f|" % (G.env.now, fNewLife, mynHalflife,
                                       (G.env.now + fNewLife)))
         # Suspend action until shock happens.
         yield G.env.timeout(fNewLife)
         # Shock has happened.
         lg.logInfo(
             "SHOCK ", "t|%6.0f| shock happens now, maxlife|%s|" %
             (G.env.now, self.nMaxlife))
         self.mShockHappens()
         # If maxlife nonzero, then wait and expire shock;
         #  else, never expires, so wait forever and don't
         #  start another shock cycle.
         if self.nMaxlife > 0:
             lg.logInfo(
                 "SHOCK ", "t|%6.0f| waiting for shock to expire "
                 "in|%.0f| at|%.0f|" % (G.env.now, self.nMaxlife,
                                        (G.env.now + self.nMaxlife)))
             yield G.env.timeout(self.nMaxlife)
             self.mShockExpires()
         else:
             yield G.env.timeout(G.fInfinity)
 def mReduceSingleServerLifetime(self, mysServerID, myfReduction):
     '''
     Reduce the lifetime of a single server. 
     
     Two possible methods, selected by a globaldata const nShockType.
     - 1: lifetime, which was already a random from a distribution
          with the standard server half-life, is then reduced 
          by some percentage during the shock period.
     - 2: lifetime during the shock period is a new random 
          chosen from a distribution with half-life reduced 
          *from its current lifetime* by the shock percentage.  
     '''
     cServer = G.dID2Server[mysServerID]
     fCurrentLife = cServer.mfGetMyCurrentLife()
     fOriginalLife = cServer.mfGetMyOriginalLife()
     # Hack to experiment with the two types of shock to see if they
     #  are statistically different.  
     if G.nShockType == 1:
         # Type 1: Lifetime during the shock period is the 
         #  reduction of the original lifetime by the given 
         #  percentage.
         #  That is, the server gets a single life expectation at
         #  birth, and it may be reduced by a shock and then 
         #  restored at the end of the shock period, provided
         #  that it has not expired during the shock period.  
         fNewLifeParam = (1.0 - myfReduction) * fCurrentLife
         # Lifetime cannot actually be zero for 100% reduction, so
         #  make it just really, really small, like 2 hours.  
         fNewLifeParam = max(fNewLifeParam, 2.0)
         NTRC.ntracef(3, "SHOK", "proc shock1 at t|%8.0f| svr|%s| new"
             "lifeparam|%.0f| shocktype|%s|" 
             % (G.env.now, mysServerID, fNewLifeParam, G.nShockType))
         fNewLife = fNewLifeParam
     elif G.nShockType == 2: 
         # Type 2: lifetime during shock period is a new
         #  random chosen from a distribution with less than the lifetime
         #  of the old one.  
         fNewLifeParam = (1.0 - myfReduction) * fOriginalLife
         # Lifetime cannot actually be zero for 100% reduction, so
         #  make it just really, really small, like 2 hours.  
         fNewLifeParam = max(fNewLifeParam, 2.0)
         NTRC.ntracef(3, "SHOK", "proc shock1 at t|%8.0f| svr|%s| new"
             "lifeparam|%.0f| shocktype|%s|" 
             % (G.env.now, mysServerID, fNewLifeParam, G.nShockType))
         fNewLife = util.makeserverlife(fNewLifeParam)
     else:
         NTRC.ntracef(0, "SHOK", "proc ERROR  at t|%8.0f| svr|%s| "
             "unknown shock type|%s|" 
             % (G.env.now, mysServerID, G.nShockType))
         # Bugcheck: an unknown shock type is fatal.  Without this raise,
         #  fNewLife would be undefined in the code below.
         raise ValueError("CShock: unknown shock type |%s|" % G.nShockType)
         
     NTRC.ntracef(3, "SHOK", "proc shock2 at t|%8.0f| svr|%s| new"
         "life|%.0f| shocktype|%s|" 
         % (G.env.now, mysServerID, fNewLife, G.nShockType))
     lg.logInfo("SHOCK ", "t|%6.0f| reduce svr|%s| life by|%s| from|%.0f| to"
         "|%.0f| shocktype|%s|" 
         % (G.env.now, mysServerID, myfReduction, fOriginalLife, fNewLife, 
         G.nShockType))
     cServer.mRescheduleMyLife(fNewLife)
     cServer.mSetServerInShock(True)
     return
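 # A small numeric sketch of the two shock types above, assuming a 25%
 #  reduction (myfReduction=0.25), a current remaining life of 80,000 hr,
 #  and an original life of 100,000 hr.  util.makeserverlife() is the
 #  simulation's own random draw, so it is only named here, not reproduced.
 def sketchShockReduction(fCurrentLife=80000.0, fOriginalLife=100000.0,
                          fReduction=0.25):
     # Type 1: deterministic scaling of the remaining lifetime.
     fType1 = max((1.0 - fReduction) * fCurrentLife, 2.0)       # -> 60,000 hr
     # Type 2: only the parameter of the new random draw is computed here.
     fType2Param = max((1.0 - fReduction) * fOriginalLife, 2.0) # -> 75,000 hr
     # fType2 = util.makeserverlife(fType2Param)   # random, not computed here
     return (fType1, fType2Param)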
 def mKillShelf(self):
     ''' Declare shelf el croako and empty it of documents. '''
     lg.logInfo("SHELF ", "t|%6.0f| kill storage shelf|%s| of server|%s|" 
         % (G.env.now, self.ID, self.sServerID))
     self.bAlive = False
     self.mDestroyShelf()
    def mAuditSegment(self, mynThisSegment, mylDocs, mysCollectionID, 
            myeCallerSyncEvent):
        '''\
        SimPy generator to audit one segment of a collection.
        This does all the work.  
        This is the single worst, most confusing, most fragile, and 
         most awful code in the entire program.  Unfortunately, in 
         Python 2, one cannot yield from a vanilla function, only
         from a generator, so all that crap, and its convoluted 
         conditional logic, is in here.  
         *This* is the meanest, nastiest, ugliest father-raper of them all.
        '''
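        # Structural note: under Python 3 the phases below could be factored
        #  into sub-generators and driven with "yield from", e.g. (a sketch
        #  only, with hypothetical helper names, and not valid Python 2):
        #      yield from self._phase1CheckServers(...)
        #      self._phase2ClassifyLosses(...)     # no yields: a plain call
        #      yield from self._phase3RepairDocs(...)
        #  Python 2 has no "yield from", so the logic remains inline here.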

        lg.logInfo("AUDIT2", "begin segmt t|%10.3f| auditid|%s| cycle|%s| "
            "seg|%s| cli|%s| coll|%s| ndocs|%s|range %s-%s|" 
            % (G.env.now, self.ID, self.nNumberOfCycles, mynThisSegment, 
            self.sClientID, self.sCollectionID, len(mylDocs), 
            mylDocs[0], mylDocs[-1]))
    
        ###seize network resource
        # Seize the network resource so this audit cycle 
        # can use it exclusively.
            # The "with" block takes care of releasing it when we are done.
        cClient = G.dID2Client[self.sClientID]
        with cClient.NetworkBandwidthResource.request() as reqnetwork:
            fNetworkWaitBegin = G.env.now

            ###wait if necessary
            result = yield reqnetwork       # Wait for network to be free.
            fNetworkWaitEnd = G.env.now
            fNetworkWaitTime = fNetworkWaitEnd - fNetworkWaitBegin

            ###log result
            # Log the event, noting whether or not we had to wait for the 
            #  network to be free.  
            lg.logInfo("AUDIT2", "grabnetwork t|%10.3f| auditid|%s| cli|%s| "
                "coll|%s| seg|%s| delay|%9.3f|" 
                % (G.env.now, self.ID, self.sClientID, self.sCollectionID, 
                mynThisSegment, fNetworkWaitTime))
            # And restart the duration clock after the unproductive wait.
            fTimeCycleBegin = G.env.now
            # So much for timekeeping.  Now do some actual work.

            # P h a s e  0: Check to see if any servers have died of old age, 
            #  possibly from being weakened by shock.  If so, they get killed
            #  now so that this audit segment will discover the loss.  
            nResult = CShock.cmBeforeAudit()

            # P h a s e  1: Check servers for copies of docs, record losses.
            # Docs already permanently lost will not be put on the damaged list.
            self.dlDocsDamagedOnServers = cc.defaultdict(list)
            cCollection = G.dID2Collection[mysCollectionID]
            # foreach server used for this collection
            for sServerID in cCollection.lServerIDs:
                cServer = G.dID2Server[sServerID]
                ###foreach doc
                # foreach doc in this segment
                for sDocID in self.lDocsThisSegment:
                    cDoc = G.dID2Document[sDocID]
                    # If the doc is still on the server, retrieve it
                    #  and spend time doing that.
                    # If not, then record that doc damaged on this server. 
                    fTransferTime = self.mRetrieveDoc(sDocID,sServerID)
    
                    ###if okay
                    if fTransferTime:
                        NTRC.tracef(3, "AUD2", "proc AuditSegment3 retrieve "
                            "t|%10.3f| doc|%s| svr|%s| xfrtim|%f|" 
                            % (G.env.now, sDocID, sServerID, fTransferTime))
                        ###yield timeout
                        yield G.env.timeout(fTransferTime)
                    else:
                        if self.mIsDocumentLost(sDocID):
                            pass    # Do not complain if doc already known to be lost.
                        else:
                            # If copy is missing here, save server in 
                            #  lost-list for doc.
                            self.dlDocsDamagedOnServers[sDocID].append(sServerID)
                            NTRC.tracef(5, "AUD2", "proc AuditSegment2 doc|%s| "
                                "svr|%s| lost on|%s|" 
                                % (sDocID, sServerID, 
                                self.dlDocsDamagedOnServers[sDocID]))
                            ###log copy missing on some server
                            lg.logInfo("AUDIT2", "copymissing t|%10.3f| "
                                "doc|%s| svr|%s| aud|%s-c%s-s%s| cli|%s| "
                                "coll|%s|" 
                                % (G.env.now, sDocID, sServerID, self.ID, 
                                self.nNumberOfCycles, mynThisSegment, 
                                self.sClientID, self.sCollectionID))
                # end foreach doc
            # end foreach server used for collection

            '''NOTE: Phase 2 here can be factored out of this function entirely
                because it does not yield or otherwise molest the clock.
                But refactoring must be done carefully because it consumes
                and supplies data from phases 1 and 3.  
            '''

            # P h a s e  2: Record severity (majority/minority/permanent) of copy losses.
            # NOTE: This arithmetic seems to be reasonable for all numbers
            #  greater than two, but one copy remaining out of two is judged 
            #  to be a majority, so a repair from that single remaining copy
            #  is labeled a majority repair.  Seems kinda wrong.  
            # Would love to split the logic of this routine into separate
            #  functions; when you're indented seven levels, your logic is,
            #  um, hard to explain.  But we cannot yield from sub-functions, 
            #  at least not in Python 2.  
            nServers = len(cCollection.lServerIDs)
            nMajority = (nServers + 1) // 2     # = ceil(nServers/2); floor
                                                #  division truncates as intended
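            #  Worked example of the threshold:  nServers=2 -> nMajority=1,
            #   3 -> 2,  4 -> 2,  5 -> 3.  With only 2 servers, a single
            #   surviving copy already counts as a "majority" repair, which
            #   is exactly the edge case the NOTE above complains about.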

            ###foreach doc on damaged list
            for sDocID in sorted(self.dlDocsDamagedOnServers.keys(), 
                key=util.fniNumberFromID):

                ###count docs on all servers
                lDocLostOnServers = self.dlDocsDamagedOnServers[sDocID]
                nCopiesLost = len(lDocLostOnServers)
                nCopiesLeft = nServers - nCopiesLost
                # How many copies left: none, a lot, a few?
                NTRC.tracef(3, "AUD2", "proc AuditSegment1 doc|%s| nsvr|%s| "
                    "loston|%s| nleft|%s|" 
                    % (sDocID, nServers, lDocLostOnServers, nCopiesLeft))

                ###if doc not lost
                ###    assess majority/minority/lost
                if nCopiesLeft == 0:                    # N O N E  remain
                    # Report permanent loss, one ping only.
                    # Do not double-count docs already lost.  Doc will not
                    #  be put onto damaged list if already lost.
                    sRepair = "permloss"
                    lg.logInfo("AUDIT2", "perm loss   t|%10.3f| doc|%s| "
                        "aud|%s-c%s-s%s| cli|%s| coll|%s|" 
                        % (G.env.now, sDocID, self.ID, self.nNumberOfCycles, 
                        mynThisSegment, self.sClientID, self.sCollectionID))
                    self.mRecordDocumentLost(sDocID)
                else:
                    ###doc is repairable; determine majority/minority
                    if nCopiesLeft >= nMajority:      # M A J O R I T Y  remain
                        sRepair = "majority"
                    else:                             # M I N O R I T Y  remain
                        sRepair = "minority"
                    ###log repair type for doc
                    lg.logInfo("AUDIT2", "%s rp t|%10.3f| doc|%s| "
                        "aud|%s-c%s-s%s| cli|%s| coll|%s|" 
                        % (sRepair, G.env.now, sDocID, self.ID, 
                        self.nNumberOfCycles, mynThisSegment, self.sClientID, 
                        self.sCollectionID))

                # P h a s e  3: repair damaged docs, if possible.
                ###foreach server on which doc was damaged
                # Put a copy back on each server where it is missing.  
                for sServerID in lDocLostOnServers:
                    if nCopiesLeft > 0:
                        ###repair
                        fTransferTime = self.mRepairDoc(sDocID,sServerID)
                        '''\
                        If the repair returns False instead of a time, 
                        then that server is no longer accepting documents.
                        Remove that server from the list, invalidate all 
                        its copies.  Then tell the client to find a new 
                        server and re-place the entire collection.  
                        Schedule this notification to occur at the end of the
                        audit cycle or segment to avoid confusing the 
                        ongoing evaluation.  Auditor informs client: oops,
                        you seem to be missing a server; and client takes
                        corrective action at that time.  
                        Send collectionID and serverID to clientID.
                        '''
    
                        ###if not okay ie server dead
                        if fTransferTime is False:
                            self.stDeadServerIDs.add((sServerID, 
                                self.sCollectionID))
                            lg.logInfo("AUDIT2", "dead server t|%10.3f| "
                                "doc|%s| aud|%s| cli|%s| coll|%s| svr|%s|" 
                                % (G.env.now, sDocID, self.ID, self.sClientID, 
                                self.sCollectionID, sServerID))
                        else:
                            ###log repair effected
                            NTRC.tracef(3, "AUD2", "proc AuditSegment4 repair "
                                "t|%10.3f| doc|%s| svr|%s| xfrtim|%f| type|%s|" 
                                % (G.env.now, sDocID, sServerID, fTransferTime, 
                                sRepair))
                            yield G.env.timeout(float(fTransferTime))
                            lg.logInfo("AUDIT2", "repair doc  t|%10.3f| "
                                "doc|%s| aud|%s| cli|%s| coll|%s| svr|%s| "
                                "from %s copies|%d|" 
                                % (G.env.now, sDocID, self.ID, self.sClientID, 
                                self.sCollectionID, sServerID, sRepair, 
                                nCopiesLeft))
    
                            ###count repair as type maj/min for audit and doc
                            # If repair succeeded, record and count it.
                            if sRepair == "majority":
                                self.mRecordDocumentMajorityRepair(sDocID)
                            else:
                                self.mRecordDocumentMinorityRepair(sDocID)
                # end foreach server that lost this doc
            # end foreach damaged doc

            lg.logInfo("AUDIT2", "end   segmt t|%10.3f| auditid|%s| "
                "cycle|%s| seg|%s| cli|%s| coll|%s| ndocs|%s|" 
                % (G.env.now, self.ID, self.nNumberOfCycles, mynThisSegment, 
                self.sClientID, self.sCollectionID, len(mylDocs)))
    
            # After all that, tell the caller we finished.
            myeCallerSyncEvent.succeed(value=mynThisSegment)
            lg.logInfo("AUDIT2", "rls network t|%10.3f| auditid|%s| "
                "cli|%s| coll|%s| seg|%s|" 
                % (G.env.now, self.ID, self.sClientID, self.sCollectionID, 
                mynThisSegment))
        # end network resource

        # If we saw any dead servers during this segment, inform the clients.
        for (sDeadServerID, sDeadCollectionID) in self.stDeadServerIDs:
            cCollection = G.dID2Collection[sDeadCollectionID]
            cClient = G.dID2Client[cCollection.sClientID]
            NTRC.ntracef(3, "AUD2", "proc t|%10.3f| inform dead server "
                "auditid|%s| cli|%s| coll|%s| svr|%s|" 
                % (G.env.now, self.ID, self.sClientID, sDeadCollectionID, 
                sDeadServerID))
            cClient.mServerIsDead(sDeadServerID, sDeadCollectionID)
        self.stDeadServerIDs = set()
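    # A compact stand-alone sketch of the Phase 2 classification rule used
    #  above (illustrative only, not called by the simulation).  Given the
    #  number of servers holding the collection and the number of servers
    #  on which one document's copy was lost, it returns the repair label
    #  that the audit would log for that document.
    def sketchClassifyLoss(nServers, nCopiesLost):
        nCopiesLeft = nServers - nCopiesLost
        nMajority = (nServers + 1) // 2     # same threshold as Phase 2
        if nCopiesLeft == 0:
            return "permloss"               # no copy anywhere: permanent loss
        elif nCopiesLeft >= nMajority:
            return "majority"               # repair from a majority of copies
        else:
            return "minority"               # repair from a minority of copies
    # E.g., sketchClassifyLoss(5, 5) -> "permloss"; (5, 2) -> "majority";
    #  (5, 4) -> "minority"; (2, 1) -> "majority" (the two-server edge case).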
    def mAge_sector(self):
        ''' A sector in the shelf fails.  This corrupts a document.
            For the moment, assume that it destroys the document.  
            Eventually, it will have a probability of destroying the 
            document depending on the portion of the document 
            corrupted and the sensitivity of the document to corruption
            (e.g., compressed or encrypted), or on whether the failure 
            hits an encryption or license key.  
        '''
        # If the shelf has been emptied by a shelf failure, stop 
        # caring about sector failures.
        while self.bAlive:
            # Sector lifetime depends on shelf lifetime and glitch age.
            fNow = G.env.now
            cLifetime = G.dID2Lifetime[self.sSectorLifetimeID]
            fLifetimeNow = cLifetime.mfCalcCurrentSectorLifetime(fNow)
            fSectorLifeInterval = util.makeexpo(fLifetimeNow)
            NTRC.tracef(3, "SHLF", "proc mAge_sector time|%d| shelf|%s| "
                "next interval|%.3f|hr from life rate|%.3f|hr" 
                % (G.env.now, self.ID, fSectorLifeInterval, fLifetimeNow))
            yield G.env.timeout(fSectorLifeInterval)

            # S E C T O R  E R R O R
            self.nSectorHits += 1
            G.nTimeLastEvent = G.env.now
            NTRC.tracef(3, "SHLF", "proc mAge_sector time|%d| shelf|%s| "
                "Sector_error hits|%d| emptyhits|%d|" 
                % (G.env.now, self.ID, self.nSectorHits, self.nEmptySectorHits))

            # Select a victim Document, probability proportional to size.
            # Small error, size=1.  What doc dies as a result?
            sCopyVictimID = self.mSelectVictimCopy(mynErrorSize=1)

            # New version: compress strings of consecutive misses into a 
            # single line.  Normally we log one line per error regardless of 
            # whether it hits or misses a document.  That results in hideously 
            # long log files for sparse storage structures, like small docs 
            # on a large shelf. 
            # Count consecutive misses, and issue one summary line before the 
            # next hit.
            # CANDIDATE FOR REFACTORING
            if sCopyVictimID:               # Hidden error in victim doc.
                # Destroy copy on this shelf.
                cCopy = G.dID2Copy[sCopyVictimID]
                sDocID = cCopy.mGetDocID()
                self.mDestroyCopy(sCopyVictimID)
                # Log the summary line if we just ended a string of misses
                if self.nConsecutiveMisses > 0:
                    lg.logInfo("SERVER", "small error t|%6.0f| svr|%s| "
                        "shelf|%s| consecutive misses|%d|" 
                        % (G.env.now, self.sServerID, self.ID, 
                        self.nConsecutiveMisses))
                self.nConsecutiveMisses = 0
                lg.logInfo("SERVER", "small error t|%6.0f| svr|%s| "
                    "shelf|%s| hidden failure in copy|%s| doc|%s|" 
                    % (G.env.now,self.sServerID,self.ID,sCopyVictimID,sDocID))
                NTRC.tracef(3, "FAIL", "proc t|%d| sector failure server|%s| "
                    "qual|%d| shelf|%s| doc|%s| copy|%s|" 
                    % (G.env.now, self.sServerID, 
                    G.dID2Server[self.sServerID].nQual, self.ID, sDocID, 
                    sCopyVictimID))
            else:                           # No victim, hit empty space.
                self.nEmptySectorHits += 1
                NTRC.tracef(3, "SHLF", "proc mAge_sector shelf|%s| "
                    "sector error fell in empty space" 
                    % (self.ID))
                if self.nConsecutiveMisses == 0:
                    lg.logInfo("SERVER", "small error t|%6.0f| svr|%s| "
                        "shelf|%s| error fell in empty space, no copy harmed" 
                        % (G.env.now, self.sServerID, self.ID))
                self.nConsecutiveMisses += 1
                NTRC.tracef(3, "FAIL", "proc t|%d| sector failure server|%s| "
                    "qual|%d| shelf|%s| copy|%s|" 
                    % (G.env.now, self.sServerID, 
                    G.dID2Server[self.sServerID].nQual, self.ID, sCopyVictimID))
            # Initiate a repair of the dead document.
            # BZZZT NYI: currently all such failures are silent, so they are 
            #  not detected by the client until audited (or end of run).  
        # Shelf is no longer alive, so we do not notice or schedule 
        #  future sector errors.  Log the event.  
        lg.logInfo("SHELF ", "t|%6.0f| dead shelf|%s| of svr|%s|, "
            "no future errors" 
            % (G.env.now, self.ID, self.sServerID))
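# A minimal sketch of the memoryless failure-interval sampling used in
#  mAge_sector above, done here with the standard library.  It assumes that
#  util.makeexpo(m) draws an exponential variate with mean m; that is an
#  assumption about the helper, not a statement of its actual API.
def sketchSectorFailureIntervals(fMeanLifetimeHours, nSamples=5):
    import random
    # expovariate takes a rate (1/mean), so invert the mean lifetime.
    return [random.expovariate(1.0 / fMeanLifetimeHours)
            for _ in range(nSamples)]
# E.g., sketchSectorFailureIntervals(100000.0) yields five waiting times
#  whose long-run average is about 100,000 hours.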
def makeClients(mydClients):
    for sClientName in mydClients:
        cClient = client.CClient(sClientName,mydClients[sClientName])
        G.lAllClients.append(cClient)
        lg.logInfo("MAIN","created client|%s|" % (cClient.ID))
    return G.lAllClients
def dumpShockStats():
    lg.logInfo("MAIN","SHOCKS Total shocks|%d| deaths due to shock|%d: %s|" 
        % (G.nShocksTotal, G.nDeathsDueToShock, G.lDeathsDueToShock))
    lg.logInfo("MAIN","SHOCKS When nshocks|%d| at times|%s|" 
        % (G.nShocksTotal, G.lShockTimes))
 def mIdentifySegment(self, mysCollectionID, mynSegments, iCurrentSegment):
     # Get list of document IDs in the collection
     cCollection = G.dID2Collection[mysCollectionID]
     lDocIDsRemaining = cCollection.mListDocumentsRemaining()
     nDocsRemaining = len(lDocIDsRemaining)
     # Beware the case where there are fewer docs remaining alive
     #  than are normally called for in the segment.  
     nDocsMaybe = min( 
                     self.mCalcSegmentSize(mysCollectionID, mynSegments), 
                     nDocsRemaining
                     )
     if 1: # Which method do we choose today?  
           # TRUE = generous assumptions, each segment will audit the 
           #  segment's size number of unique documents WITHOUT REPLACEMENT.
           # FALSE = naive assumptions, each segment will choose the 
           #  segment's size number of documents WITH REPLACEMENT.  
         # Use set() to ensure that a doc is not sampled twice in a segment.
         #  This is sampling WITHOUT REPLACEMENT FOR A SEGMENT ONLY, to 
         #  ensure that the right portion of the population gets audited.  
         #  (I don't know what people who say, "audit one quarter of the 
         #  population every quarter" actually mean.  This is a plausible
         #  but optimistic guess.)
         #  Note that the cycle is still sampled WITH replacement overall, so 
         #  that some docs may be missed, say, quarter to quarter.  
         #  (A small numeric sketch of these replacement effects follows 
         #  this method.)
         # For sampling with replacement, use a list and just append doc IDs
         #  to the list.  Then a doc might be sampled > once per segment, 
         #  which would be even more useless.  
         # Also, beware the case where there are no docs remaining.  
         #  Carefully return an empty list.
         setDocsThisSegment = set()
         while nDocsMaybe > 0 and len(setDocsThisSegment) < nDocsMaybe:
             idxChoose = int(util.makeunif(0, nDocsRemaining))
             sDocID = lDocIDsRemaining[idxChoose]
             cDoc = G.dID2Document[sDocID]
             # If the doc is not already permanently kaput, check it.
             if not cDoc.mIsLost():
                 setDocsThisSegment.add(sDocID)
         lDocsThisSegment = util.fnlSortIDList(list(setDocsThisSegment))
         sRandomType = "GENEROUS"
     else:
         # Do this the simple, stupid way: select a segment's size list
         #  of uniform random numbers and make that the audit list for 
         #  this segment.  Note that this will be SAMPLING WITH REPLACEMENT
         #  WITHIN A SEGMENT, so that the number of documents actually 
         #  audited during a segment will (almost certainly) be much less 
         #  than the naively-desired number due to balls-in-urns effects
         #  of replacement.  I think that this is what most people
         #  would actually implement naively for "random auditing."  
         #  The list may/will contain duplicates, and we don't care.  
         lDocsThisSegment = []
         while nDocsMaybe > 0 and len(lDocsThisSegment) < nDocsMaybe:
             idxChoose = int(util.makeunif(0, nDocsRemaining))
             sDocID = lDocIDsRemaining[idxChoose]
             cDoc = G.dID2Document[sDocID]
             # If the doc is not already permanently kaput, check it.
             if not cDoc.mIsLost():
                 lDocsThisSegment.append(sDocID)
         lDocsThisSegment = util.fnlSortIDList(list(set(lDocsThisSegment)))
         sRandomType = "NAIVE"
     lg.logInfo("AUDIT2", "choose seg  t|%10.3f| auditid|%s| seg|%s|of|%s| "
             "ndocsrem|%s| chosen|%s| type|%s|"
         % (G.env.now, self.ID, iCurrentSegment+1, mynSegments, 
             nDocsRemaining, len(lDocsThisSegment), sRandomType))
     return lDocsThisSegment
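 # A small numeric sketch of the balls-in-urns effect mentioned above for
 #  the with-replacement (NAIVE) case: k uniform picks from n docs touch
 #  only about n*(1-(1-1/n)**k) distinct docs, noticeably fewer than k
 #  when k is a sizable fraction of n.
 def sketchExpectedDistinctDocs(nDocs, nPicks):
     return nDocs * (1.0 - (1.0 - 1.0/nDocs) ** nPicks)
 # E.g., sketchExpectedDistinctDocs(1000, 250) is about 221 distinct docs,
 #  so a "quarter of the collection" segment audits roughly 22%, not 25%.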