def mKillShelf(self):
    '''Mark this shelf dead and remove all documents stored on it.'''
    sMsg = ("t|%6.0f| kill storage shelf|%s| of server|%s|"
            % (G.env.now, self.ID, self.sServerID))
    lg.logInfo("SHELF ", sMsg)
    self.bAlive = False
    self.mDestroyShelf()
def cmBeforeAudit(self):
    ''' Before each audit cycle, check to see if any servers
        have exceeded their lifetimes.

        Returns the cumulative count of servers that died of old age
        (G.nDeadOldServers), which cmbShouldServerDieNow() updates.
    '''
    for (sServerID, cServer) in (util.fnttSortIDDict(G.dID2Server)):
        fCurrentLife = cServer.mfGetMyCurrentLife()
        fFullLife = cServer.mfGetMyFullLife()
        fBirthday = cServer.mfGetMyBirthday()
        bServerAlive = not cServer.mbIsServerDead()
        bServerActive = cServer.bInUse
        # Log that we are examining this server,
        # but note if it's already dead.
        sStatus = "inuse" if bServerActive else ""
        sStatus = sStatus if bServerAlive else "dead"
        # NOTE(review): the |%.1f|yr field divides by 10000, implying
        # 10,000 sim time units per year -- confirm against globals.
        lg.logInfo("SHOCK ", "t|%6.0f| audit+end check svr|%s| "
            "life|%.0f|=|%.1f|yr %s"
            % (G.env.now, sServerID, fFullLife, fFullLife/10000,
            sStatus))
        NTRC.ntracef(3, "SHOK", "proc t|%6.0f| check expir? svr|%s| "
            "svrdefaulthalflife|%s| born|%s| currlife|%s|"
            % (G.env.now, sServerID, G.fServerDefaultHalflife,
            fBirthday, fCurrentLife))
        # Check to see if the server's lifetime has expired.
        # Called for its side effect (it may kill the server);
        # the boolean result is not used here.
        bDeadAlready = CShock.cmbShouldServerDieNow(sServerID)
    return G.nDeadOldServers
def cmBeforeAudit(self):
    ''' Before each audit cycle, check to see if any servers
        have exceeded their lifetimes.

        Returns the cumulative count of servers that died of old age
        (G.nDeadOldServers), which cmbShouldServerDieNow() updates.
    '''
    for (sServerID, cServer) in (util.fnttSortIDDict(G.dID2Server)):
        fCurrentLife = cServer.mfGetMyCurrentLife()
        fFullLife = cServer.mfGetMyFullLife()
        fBirthday = cServer.mfGetMyBirthday()
        bServerAlive = not cServer.mbIsServerDead()
        bServerActive = cServer.bInUse
        # Log that we are examining this server,
        # but note if it's already dead.
        sStatus = "inuse" if bServerActive else ""
        sStatus = sStatus if bServerAlive else "dead"
        # NOTE(review): the |%.1f|yr field divides by 10000, implying
        # 10,000 sim time units per year -- confirm against globals.
        lg.logInfo(
            "SHOCK ", "t|%6.0f| audit+end check svr|%s| "
            "life|%.0f|=|%.1f|yr %s"
            % (G.env.now, sServerID, fFullLife, fFullLife / 10000,
            sStatus))
        NTRC.ntracef(
            3, "SHOK", "proc t|%6.0f| check expir? svr|%s| "
            "svrdefaulthalflife|%s| born|%s| currlife|%s|"
            % (G.env.now, sServerID, G.fServerDefaultHalflife,
            fBirthday, fCurrentLife))
        # Check to see if the server's lifetime has expired.
        # Called for its side effect (it may kill the server);
        # the boolean result is not used here.
        bDeadAlready = CShock.cmbShouldServerDieNow(sServerID)
    return G.nDeadOldServers
def dumpServerErrorStats():
    '''Log per-shelf error statistics, run-wide totals, and the
    dead-server lists.

    Returns "<serverID>+<shelfID>" of the last shelf examined.
    NOTE(review): if G.dID2Shelf is empty, sServerID/sID are unbound
    at the return -- presumably there is always at least one shelf.
    '''
    (TnHits,TnEmptyHits,TnAboveHiWater,TnMultipleHits) = (0,0,0,0)
    for sKey in sorted(G.dID2Shelf.keys()):
        cShelf = G.dID2Shelf[sKey]
        # Get vector of stats.
        (sID, sServerID, nQual, nHits, nEmptyHits, bAlive,
            nAboveHiWater, nMultipleHits) = cShelf.mReportErrorStats()
        lg.logInfo("MAIN", "SERVERERR1 shelf|%s-%s| qual|%d| totalhits|%d| "
            "nonempty|%d| empty|%d| alive|%s|"
            % (sServerID, sID, nQual, nHits, (nHits-nEmptyHits),
            nEmptyHits,bAlive))
        lg.logInfo("MAIN", "SERVERERR2 shelf|%s-%s| qual|%d| totalhits|%d| "
            "abovehiwater|%d| multiples|%d|"
            % (sServerID, sID, nQual, nHits, nAboveHiWater,
            nMultipleHits))
        # Accumulate run-wide totals across all shelves.
        TnHits += nHits
        TnEmptyHits += nEmptyHits
        TnAboveHiWater += nAboveHiWater
        TnMultipleHits += nMultipleHits
    lg.logInfo("MAIN", "SERVERERRTOTALS totalhits|%d| abovehiwater|%d| "
        "nonempty|%d| empty|%d| multiples|%d|"
        % (TnHits, TnAboveHiWater, (TnHits-TnEmptyHits), TnEmptyHits,
        TnMultipleHits))
    lg.logInfo("MAIN","DEADSERVERS ALL n|%d| |%s|"
        % (len(G.lDeadServers), util.fnlSortIDList(G.lDeadServers)))
    lg.logInfo("MAIN","DEADSERVERS ACTIVE n|%d| |%s|"
        % (len(G.lDeadActiveServers),
        util.fnlSortIDList(G.lDeadActiveServers)))
    return sServerID+"+"+sID
def mRestoreSomeServerLifetimes(self):
    '''Undo the shock: give every shocked server its normal life back,
    then forget the victim list.'''
    lg.logInfo("SHOCK ", "t|%6.0f| shock end, restoring server lifetimes "
        "for ids|%s|" % (G.env.now, self.lsServersShocked))
    # WARNING: list may be empty if server default life is infinite (zero).
    for sVictimID in self.lsServersShocked:
        self.mRestoreSingleServerLifetime(sVictimID)
    self.lsServersShocked = []
    return
def mRestoreSomeServerLifetimes(self):
    '''Undo the shock: give every shocked server its normal life back,
    then forget the victim list.'''
    lg.logInfo(
        "SHOCK ",
        "t|%6.0f| shock end, restoring server lifetimes "
        "for ids|%s|" % (G.env.now, self.lsServersShocked))
    # WARNING: list may be empty if server default life is infinite (zero).
    for sVictimID in self.lsServersShocked:
        self.mRestoreSingleServerLifetime(sVictimID)
    self.lsServersShocked = []
    return
def mPlaceCollectionOnServer(self, mysCollID, mysServerID):
    '''Copy one collection onto one server; return the doc count placed.'''
    # Hand the collection to the target server.
    cTargetServer = G.dID2Server[mysServerID]
    nDocs = cTargetServer.mAddCollection(mysCollID, self.ID)
    # Record, on the collection side, that this server holds a copy.
    G.dID2Collection[mysCollID].lServerIDs.append(mysServerID)
    lg.logInfo("CLIENT", "client|%s| placed collection|%s| "
        "to server|%s|" % (self.ID, mysCollID, mysServerID))
    return nDocs
def mPlaceCollectionOnServer(self, mysCollID, mysServerID):
    '''Copy one collection onto one server; return the doc count placed.'''
    # Hand the collection to the target server.
    cTargetServer = G.dID2Server[mysServerID]
    nDocs = cTargetServer.mAddCollection(mysCollID, self.ID)
    # Record, on the collection side, that this server holds a copy.
    G.dID2Collection[mysCollID].lServerIDs.append(mysServerID)
    lg.logInfo("CLIENT", "client|%s| placed collection|%s| "
        "to server|%s|" % (self.ID, mysCollID, mysServerID))
    return nDocs
def cmAtEndOfRun(self):
    ''' At end of run, check to see if any servers have exceeded
        their lifetimes.  Servers can die from shocks even without
        auditing, and that counts because every doc is evaluated at
        the end of the run.
    '''
    lg.logInfo("SHOCK ",
        "t|%6.0f| end of run checking all server lifetimes"
        % (G.env.now))
    # Reuse the pre-audit sweep; its return value is not needed here.
    nExpired = CShock.cmBeforeAudit()
    return
def mCreateShelf(self):
    '''Allocate one more standard-size storage shelf on this server.

    Called as needed when a doc arrives too large for available space.
    Returns the new shelf's ID.
    '''
    cNewShelf = CShelf(self.ID, self.nQual, self.nShelfSize)
    lg.logInfo("SERVER", "server |%s| created storage shelf|%s| "
        "quality|%s| size|%s|TB svrlifespan|%.0f| svrlife|%.0f|"
        % (self.ID, cNewShelf.ID, cNewShelf.nQual, self.nShelfSizeTB,
        self.mfGetMyOriginalLife(), self.mfGetMyCurrentLife()))
    return cNewShelf.ID
def mShockHappens(self):
    ''' Shock has happened.  Count it, shorten server lives, and
        return the time so the caller can schedule the end of the
        shock cycle.
    '''
    # Simulation time does not advance inside plain method calls,
    # so capture it once.
    fNow = G.env.now
    G.lShockTimes.append(int(fNow))
    G.nShocksTotal += 1
    lg.logInfo("SHOCK ", "t|%6.0f| start to reduce life of |%s| servers "
        "by pct|%s|" % (fNow, self.nSpan, self.nImpact))
    self.mReduceSomeServerLifetimes(self.nSpan, self.nImpact)
    return fNow
def cmAtEndOfRun(self):
    ''' At end of run, check to see if any servers have exceeded
        their lifetimes.  Servers can die from shocks even without
        auditing, and that counts because every doc is evaluated at
        the end of the run.
    '''
    lg.logInfo("SHOCK ",
        "t|%6.0f| end of run checking all server lifetimes"
        % (G.env.now))
    # Reuse the pre-audit sweep; its return value is not needed here.
    nExpired = CShock.cmBeforeAudit()
    return
def dumpServerUseStats():
    '''Log one SERVERUSE line per shelf: quality, expo lifetime,
    capacity, high-water mark, current use, and percent full.

    Returns "<serverID>+<shelfID>" of the last shelf examined.
    NOTE(review): unbound-name error at the return if G.dID2Shelf
    is empty.
    '''
    for sKey in sorted(G.dID2Shelf.keys()):
        cShelf = G.dID2Shelf[sKey]
        # Get vector of stats from shelf.
        (sID,sServerID,nQual,fExpolife,nCapacity,nHiWater,nCurrentUse) = \
            cShelf.mReportUseStats()
        lg.logInfo("MAIN", "SERVERUSE shelf|%s-%s| qual|%d| "
            "sectorexpolife|%.0f| size|%d| hiwater|%d| currentuse|%d| "
            "full%%|%d|"
            % (sServerID, sID, nQual, fExpolife, nCapacity, nHiWater,
            nCurrentUse, 100*nCurrentUse/nCapacity))
    return sServerID+"+"+sID
def mScheduleGlitch(self):
    '''Wait for a glitch lifetime on this shelf.
    If the shelf died as a result of the glitch, stop rescheduling.

    SimPy generator: loops, drawing exponentially distributed waits
    from the current glitch lifetime and firing a glitch after each
    wait; once the shelf is dead it waits "forever".
    '''
    fNow = G.env.now
    NTRC.tracef(3, "LIFE", "proc schedule glitch t|%d| shelf|%s| alive|%s|"
        % (fNow, self.sShelfID, self.cShelf.mbIsShelfAlive()))
    while 1:
        fNow = G.env.now
        bAlive = self.cShelf.mbIsShelfAlive()
        if bAlive:
            self.fShelfLife = self.mfCalcCurrentGlitchLifetime(fNow)
            if self.fShelfLife > 0 and bAlive:
                # Draw the next glitch arrival time from an exponential
                # distribution with the current mean lifetime.
                self.fShelfInterval = util.makeexpo(self.fShelfLife)
                lg.logInfo("LIFETIME", "schedule t|%6.0f| for shelf|%s| "
                    "interval|%.3f| freq|%d| life|%.3f|"
                    % (fNow, self.sShelfID, self.fShelfInterval,
                    self.nGlitchFreq, self.fShelfLife))
                NTRC.tracef(3, "LIFE", "proc schedule glitch shelf|%s| "
                    "interval|%.3f| based on life|%.3f| alive|%s| "
                    "waiting..."
                    % (self.sShelfID, self.fShelfInterval,
                    self.fShelfLife, bAlive))
                yield G.env.timeout(self.fShelfInterval)

                # ****** Glitch has now occurred. ******
                # If correlated failure, step entirely outside the
                # Lifetime-Shelf-Server context to signal several servers.
                if self.nGlitchSpan > 1:
                    from server import CServer
                    CServer.fnCorrFailHappensToAll(self.nGlitchSpan)
                else:
                    self.mGlitchHappensNow()
            else:
                # NOTE(review): self.fShelfInterval may not have been
                # assigned yet on the first pass through this branch;
                # presumably initialized elsewhere -- confirm.
                NTRC.ntracef(3, "LIFE", "proc glitch no freq or not alive, "
                    "set wait to infinity shelf|%s| freq|%d| life|%.3f| "
                    "interval|%.3f|"
                    % (self.sShelfID, self.nGlitchFreq, self.fShelfLife,
                    self.fShelfInterval))
                yield G.env.timeout(G.fInfinity)
        else:
            break   # Because we have to use fako "while 1".
    # When shelf is not alive anymore, wait forever
    NTRC.ntracef(3, "LIFE", "proc glitch shelf no longer alive, set wait "
        "to infinity shelf|%s| freq|%d| life|%.3f| interval|%.3f|"
        % (self.sShelfID, self.nGlitchFreq, self.fShelfLife,
        self.fShelfInterval))
    yield G.env.timeout(G.fInfinity)
def mShockHappens(self):
    ''' Shock has happened.  Count it, shorten server lives, and
        return the time so the caller can schedule the end of the
        shock cycle.
    '''
    # Simulation time does not advance inside plain method calls,
    # so capture it once.
    fNow = G.env.now
    G.lShockTimes.append(int(fNow))
    G.nShocksTotal += 1
    lg.logInfo("SHOCK ", "t|%6.0f| start to reduce life of |%s| servers "
        "by pct|%s|" % (fNow, self.nSpan, self.nImpact))
    self.mReduceSomeServerLifetimes(self.nSpan, self.nImpact)
    return fNow
def fnTimerInt(objTimer, xContext):
    '''\
    Callback: a server life-span timer was interrupted so it can be
    rescheduled, probably by a shock, and presumably to a shorter
    life.  The server itself is still alive.
    '''
    NTRC.trace(3,
        "interrupt %s delay %s called from %s at %s."
        % (xContext, objTimer.delay, objTimer, G.env.now))
    lg.logInfo("SERVER",
        "interrupted t|%6.0f| context|%s| delay|%s|"
        % (G.env.now, xContext, objTimer.delay))
    return (objTimer, xContext)
def dumpGlitchStats():
    '''Log per-shelf glitch statistics (one LIFETIME line each) and
    the run-wide glitch total.'''
    for sKey in sorted(G.dID2Lifetime.keys()):
        cLifetime = G.dID2Lifetime[sKey]
        # Each lifetime instance reports its stats as a dictionary.
        dStats = cLifetime.mReportGlitchStats()
        lg.logInfo("MAIN", "LIFETIME shelf|%s| lifetime|%s| freq|%s| "
            "impact|%s| decay|%s| maxlife|%s| count|%s| time|%.3f|"
            % (dStats["sShelfID"], dStats["sLifetimeID"],
            dStats["nGlitchFreq"], dStats["nImpactReductionPct"],
            dStats["nGlitchDecayHalflife"], dStats["nGlitchMaxlife"],
            dStats["nGlitches"], dStats["fGlitchTime"]))
    lg.logInfo("MAIN","LIFETIME Total glitches|%d|" % (G.nGlitchesTotal))
def mGlitchHappensNow(self):
    """Start a glitch happening right now.
    May be invoked from outside a CLifetime instance as well as
    from inside."""
    fNow = G.env.now
    NTRC.ntracef(3, "LIFE", "proc glitch wait expired t|%6.0f| "
        "for shelf|%s| freq|%d| life|%.3f| interval|%.3f|"
        % (fNow, self.sShelfID, self.nGlitchFreq, self.fShelfLife,
        self.fShelfInterval))
    # Delegate the actual damage to mGlitchHappens.
    self.mGlitchHappens(fNow)
    lg.logInfo("LIFETIME", "glitchnow t|%6.0f| for shelf|%s| active|%s|"
        % (fNow, self.sShelfID, self.bGlitchActive))
def mDestroyShelf(self):
    '''Remove every copy stored on this shelf.

    The CShelf object itself survives; only its contents are destroyed.
    '''
    NTRC.ntracef(3, "SHLF", "proc mDestroyShelf1 shelf|%s| "
        "has ncopies|%s|" % (self.ID, len(self.lCopyIDs)))
    lg.logInfo("SHELF ", "t|%6.0f| destroy shelf|%s| "
        "of svr|%s| ncopies|%s|"
        % (G.env.now, self.ID, self.sServerID, len(self.lCopyIDs)))
    # Iterate over a snapshot: mDestroyCopy mutates self.lCopyIDs.
    for sVictimID in list(self.lCopyIDs):
        self.mDestroyCopy(sVictimID)
def mDestroyShelf(self):
    '''Remove every copy stored on this shelf.

    The CShelf object itself survives; only its contents are destroyed.
    '''
    NTRC.ntracef(3, "SHLF", "proc mDestroyShelf1 shelf|%s| "
        "has ncopies|%s|" % (self.ID, len(self.lCopyIDs)))
    lg.logInfo("SHELF ", "t|%6.0f| destroy shelf|%s| "
        "of svr|%s| ncopies|%s|"
        % (G.env.now, self.ID, self.sServerID, len(self.lCopyIDs)))
    # Iterate over a snapshot: mDestroyCopy mutates self.lCopyIDs.
    for sVictimID in list(self.lCopyIDs):
        self.mDestroyCopy(sVictimID)
def mRestoreSingleServerLifetime(self, mysServerID):
    ''' Restore normal lifetime to a single server.

        A server that is dead, or whose reduced life has just expired,
        stays dead; otherwise its life is rescheduled to the original
        span and its shock flag is cleared.  Returns the server ID.
    '''
    # Side effect: may kill the server if its reduced life has expired.
    bDeadAlready = CShock.cmbShouldServerDieNow(mysServerID)
    cServer = G.dID2Server[mysServerID]
    if cServer.mbIsServerDead() or bDeadAlready:
        lg.logInfo("SHOCK ", "t|%6.0f| cannot restore dead server|%s| life"
            % (G.env.now, mysServerID))
    else:
        fOriginalLifespan = cServer.mfGetMyOriginalLife()
        lg.logInfo("SHOCK ", "t|%6.0f| restoring server|%s| life to |%.0f|"
            % (G.env.now, mysServerID, fOriginalLifespan))
        cServer.mRescheduleMyLife(fOriginalLifespan)
        cServer.mSetServerInShock(False)
    return mysServerID
def mAuditCycle(self,mynCycleInterval,mynSegments): '''\ SimPy generator to schedule audit cycles for this collection. Starts an async process that ticks every audit cycle forever. ''' # Initially, wait for some small random interval # so that client audit cycles are not synchronized, # like Ethernet collision retry waits. nRandTime = util.makeunif(0,mynCycleInterval/20) # Nope, not any more. No need for the random offset since there is # only one auditor. # yield G.env.timeout(nRandTime) # And now wait for one segment interval before starting the first seg. # Seems odd, but consider an annual audit in quarterly segments: # you don't want to wait a whole year before starting quarterly audits; # start after the first quarter. nSegmentInterval = self.mCalcSegmentInterval(mynCycleInterval, mynSegments) yield G.env.timeout(nSegmentInterval) while True: lg.logInfo("AUDIT2", "begin cycle t|%10.3f| auditid|%s| type|%s| " "cycle|%s| cli|%s| coll|%s| interval|%s| nsegments|%s|" % (G.env.now, self.ID, self.TYPE, self.nNumberOfCycles, self.sClientID, self.sCollectionID, mynCycleInterval, mynSegments)) # Start the collection audit and wait for it to finish. tCycleStartTime = G.env.now self.nRepairsThisCycle = 0 eSyncEvent = G.env.event() G.env.process( self.mAuditCollection(mynCycleInterval, G.nAuditSegments, self.sCollectionID, eSyncEvent)) yield eSyncEvent lg.logInfo("AUDIT2", "end cycle t|%10.3f| auditid|%s| cycle|%s| " "cli|%s| coll|%s| repairs|%d| total|%d| perms|%d| " "majority|%s| minority|%d|" % (G.env.now, self.ID, self.nNumberOfCycles, self.sClientID, self.sCollectionID, self.nRepairsThisCycle, self.nRepairsTotal, self.nPermanentLosses, self.nRepairsMajority, self.nRepairsMinority)) self.nNumberOfCycles += 1 tNextCycleStartTime = tCycleStartTime + mynCycleInterval yield G.env.timeout(tNextCycleStartTime - G.env.now)
def mAuditCycle(self,mynCycleInterval,mynSegments): '''\ SimPy generator to schedule audit cycles for this collection. Starts an async process that ticks every audit cycle forever. ''' # Initially, wait for some small random interval # so that client audit cycles are not synchronized, # like Ethernet collision retry waits. nRandTime = util.makeunif(0,mynCycleInterval/20) # Nope, not any more. No need for the random offset since there is # only one auditor. # yield G.env.timeout(nRandTime) # And now wait for one segment interval before starting the first seg. # Seems odd, but consider an annual audit in quarterly segments: # you don't want to wait a whole year before starting quarterly audits; # start after the first quarter. nSegmentInterval = self.mCalcSegmentInterval(mynCycleInterval, mynSegments) yield G.env.timeout(nSegmentInterval) while True: lg.logInfo("AUDIT2", "begin cycle t|%10.3f| auditid|%s| type|%s| " "cycle|%s| cli|%s| coll|%s| interval|%s| nsegments|%s|" % (G.env.now, self.ID, self.TYPE, self.nNumberOfCycles, self.sClientID, self.sCollectionID, mynCycleInterval, mynSegments)) # Start the collection audit and wait for it to finish. tCycleStartTime = G.env.now self.nRepairsThisCycle = 0 eSyncEvent = G.env.event() G.env.process( self.mAuditCollection(mynCycleInterval, G.nAuditSegments, self.sCollectionID, eSyncEvent)) yield eSyncEvent lg.logInfo("AUDIT2", "end cycle t|%10.3f| auditid|%s| cycle|%s| " "cli|%s| coll|%s| repairs|%d| total|%d| perms|%d| " "majority|%s| minority|%d|" % (G.env.now, self.ID, self.nNumberOfCycles, self.sClientID, self.sCollectionID, self.nRepairsThisCycle, self.nRepairsTotal, self.nPermanentLosses, self.nRepairsMajority, self.nRepairsMinority)) self.nNumberOfCycles += 1 tNextCycleStartTime = tCycleStartTime + mynCycleInterval yield G.env.timeout(tNextCycleStartTime - G.env.now)
def fnTimerCall(objTimer, xContext):
    '''\
    Callback: a server life-span timer has completed, and the server
    must die.  Fire the timer's event to release any waiting process,
    then kill the server (first element of the context tuple).
    '''
    NTRC.trace(3,
        "callback %s delay %s called from %s at %s."
        % (xContext, objTimer.delay, objTimer, G.env.now))
    objTimer.setevent()
    # The context carries the doomed server instance in slot zero.
    xContext[0].mKillServer()
    lg.logInfo("SERVER",
        "timercalled t|%6.0f| context|%s| delay|%s|"
        % (G.env.now, xContext, objTimer.delay))
    return (objTimer, xContext)
def dumpCollectionStats(mysCollID):
    '''Log one COLLECTIONTOTALS line for the given collection ID.'''
    cColl = G.dID2Collection[mysCollID]
    dStats = cColl.mdReportCollectionStats()
    # Unpack the stats dictionary into named locals for the log line.
    (sCollIDx,sClientIDx,nServers,nDocs, nDocsOkay, nDocsInjured,
        nDocsForensics, nDocsLost) = \
        (mysCollID, dStats["sClientID"], dStats["nServers"],
        dStats["nDocs"], dStats["nOkay"], dStats["nRepairsMajority"],
        dStats["nRepairsMinority"], dStats["nLost"])
    lg.logInfo("MAIN", "COLLECTIONTOTALS client|%s| collection|%s| "
        "nservers|%s| ndocs|%s| nokay|%s| nmajority|%s| nminority|%s| "
        "nlost|%s| "
        % (sClientIDx, sCollIDx, nServers, nDocs, nDocsOkay,
        nDocsInjured, nDocsForensics, nDocsLost))
def mRestoreSingleServerLifetime(self, mysServerID):
    ''' Restore normal lifetime to a single server.

        A server that is dead, or whose reduced life has just expired,
        stays dead; otherwise its life is rescheduled to the original
        span and its shock flag is cleared.  Returns the server ID.
    '''
    # Side effect: may kill the server if its reduced life has expired.
    bDeadAlready = CShock.cmbShouldServerDieNow(mysServerID)
    cServer = G.dID2Server[mysServerID]
    if cServer.mbIsServerDead() or bDeadAlready:
        lg.logInfo(
            "SHOCK ", "t|%6.0f| cannot restore dead server|%s| life"
            % (G.env.now, mysServerID))
    else:
        fOriginalLifespan = cServer.mfGetMyOriginalLife()
        lg.logInfo(
            "SHOCK ", "t|%6.0f| restoring server|%s| life to |%.0f|"
            % (G.env.now, mysServerID, fOriginalLifespan))
        cServer.mRescheduleMyLife(fOriginalLifespan)
        cServer.mSetServerInShock(False)
    return mysServerID
def cmbShouldServerDieNow(self, mysServerID):
    ''' If the server's (possibly reduced) lifetime has expired,
        kill it rather than restoring it to a full life.

        Returns True if the server was killed here, False otherwise.
        Side effects: increments G.nDeadOldServers, and, when the
        death is attributable to a shock, G.nDeathsDueToShock.
    '''
    cServer = G.dID2Server[mysServerID]
    fCurrentLife = cServer.mfGetMyCurrentLife()
    fFullLife = cServer.mfGetMyFullLife()
    fBirthday = cServer.mfGetMyBirthday()
    bServerAlive = not cServer.mbIsServerDead()

    # Expiry applies only when lifetimes are finite (halflife > 0)
    # and the server's scheduled end time is at or before now.
    if (G.fServerDefaultHalflife > 0
        and fCurrentLife > 0
        and fFullLife <= G.env.now
        and bServerAlive
        ):
        # Server has overstayed its welcome.  Kill it.
        sInUse = "currently in use" if cServer.mbIsServerInUse() else ""
        sShockVictim = "shock victim" if cServer.mbIsServerInShock() else ""
        lg.logInfo("SHOCK ", "t|%6.0f| kill svr|%s| "
            "born|%.0f| life|%.0f|=|%.1f|yr "
            "expired %s %s"
            % (G.env.now, mysServerID, fBirthday, fCurrentLife,
            fCurrentLife/10000, sInUse, sShockVictim))
        NTRC.ntracef(3, "SHOK", "proc t|%6.0f| expired svr|%s| "
            "svrdefaulthalflife|%s| born|%.0f| currlife|%.0f|"
            % (G.env.now, mysServerID, G.fServerDefaultHalflife,
            fBirthday, fCurrentLife))
        result = cServer.mKillServer()
        G.nDeadOldServers += 1
        bResult = True

        # Now check to see if the server died because of the shock.
        # Is the current life less than the original life?
        # Philosophical question: if the shock type 2 caused your new,
        # recalculated life to be longer than your original life,
        # can your death reasonably be attributed to the shock?
        # Answer = no, because without the shock you would have
        # died even earlier.  Tricky, though.
        fOriginalLife = cServer.mfGetMyOriginalLife()
        if fCurrentLife < fOriginalLife:
            G.nDeathsDueToShock += 1
            G.lDeathsDueToShock.append(mysServerID)
    else:
        bResult = False
    return bResult
def makeServers(mydServers):
    '''Create all CServer instances from the input dictionary.

    mydServers maps server name -> sequence whose first element is a
    (quality, shelf size) pair.  Also builds G.dQual2Servers, the
    inverse map from quality level to [name, ID] pairs, and returns it.
    '''
    for sServerName in mydServers:
        (nServerQual,nShelfSize) = mydServers[sServerName][0]
        cServer = server.CServer(sServerName,nServerQual,nShelfSize)
        sServerID = cServer.ID
        G.lAllServers.append(cServer)
        fCurrentLife = cServer.mfGetMyCurrentLife()
        lg.logInfo("MAIN","created server|%s| quality|%s| shelfsize|%s|TB "
            "name|%s| life|%.0f|"
            % (sServerID, nServerQual, nShelfSize, sServerName,
            fCurrentLife))
        # Invert the server list so that clients can look up
        # all the servers that satisfy a quality criterion.
        if nServerQual in G.dQual2Servers:
            G.dQual2Servers[nServerQual].append([sServerName,sServerID])
        else:
            G.dQual2Servers[nServerQual] = [[sServerName,sServerID]]
        NTRC.ntracef(5,"SVRS","proc makeServers dQual2Servers qual|%s| servers|%s|"
            % (nServerQual,G.dQual2Servers[nServerQual]))
    return G.dQual2Servers
def cmbShouldServerDieNow(self, mysServerID):
    ''' If the server's (possibly reduced) lifetime has expired,
        kill it rather than restoring it to a full life.

        Returns True if the server was killed here, False otherwise.
        Side effects: increments G.nDeadOldServers, and, when the
        death is attributable to a shock, G.nDeathsDueToShock.
    '''
    cServer = G.dID2Server[mysServerID]
    fCurrentLife = cServer.mfGetMyCurrentLife()
    fFullLife = cServer.mfGetMyFullLife()
    fBirthday = cServer.mfGetMyBirthday()
    bServerAlive = not cServer.mbIsServerDead()

    # Expiry applies only when lifetimes are finite (halflife > 0)
    # and the server's scheduled end time is at or before now.
    if (G.fServerDefaultHalflife > 0 and fCurrentLife > 0
            and fFullLife <= G.env.now and bServerAlive):
        # Server has overstayed its welcome.  Kill it.
        sInUse = "currently in use" if cServer.mbIsServerInUse() else ""
        sShockVictim = "shock victim" if cServer.mbIsServerInShock(
        ) else ""
        lg.logInfo(
            "SHOCK ", "t|%6.0f| kill svr|%s| "
            "born|%.0f| life|%.0f|=|%.1f|yr "
            "expired %s %s"
            % (G.env.now, mysServerID, fBirthday, fCurrentLife,
            fCurrentLife / 10000, sInUse, sShockVictim))
        NTRC.ntracef(
            3, "SHOK", "proc t|%6.0f| expired svr|%s| "
            "svrdefaulthalflife|%s| born|%.0f| currlife|%.0f|"
            % (G.env.now, mysServerID, G.fServerDefaultHalflife,
            fBirthday, fCurrentLife))
        result = cServer.mKillServer()
        G.nDeadOldServers += 1
        bResult = True

        # Now check to see if the server died because of the shock.
        # Is the current life less than the original life?
        # Philosophical question: if the shock type 2 caused your new,
        # recalculated life to be longer than your original life,
        # can your death reasonably be attributed to the shock?
        # Answer = no, because without the shock you would have
        # died even earlier.  Tricky, though.
        fOriginalLife = cServer.mfGetMyOriginalLife()
        if fCurrentLife < fOriginalLife:
            G.nDeathsDueToShock += 1
            G.lDeathsDueToShock.append(mysServerID)
    else:
        bResult = False
    return bResult
def dumpAuditStats():
    '''Log per-audit-instance statistics and run-wide AUDITTOTALS.

    When auditing is disabled (G.nAuditCycleInterval falsy), all
    totals are reported as zero.
    '''
    (TnNumberOfCycles, TnRepairsTotal, TnPermanentLosses,
        TnRepairsMajority, TnRepairsMinority) = (0,0,0,0,0)
    if G.nAuditCycleInterval:
        # If there is any auditing in this run,...
        for sKey in sorted(G.dID2Audit.keys()):
            cAudit = G.dID2Audit[sKey]
            # Get vector of stats for one Audit instance.
            dStats = cAudit.mdReportAuditStats()
            (ID,sClientID,sCollectionID,sServerID
                ,nNumberOfCycles,nRepairsTotal
                ,nPermanentLosses,nRepairsMajority,nRepairsMinority) \
                = \
                (sKey,dStats["sClientID"],dStats["sCollectionID"],"*"
                ,dStats["nNumberOfCycles"],dStats["nRepairsTotal"]
                ,dStats["nPermanentLosses"],dStats["nRepairsMajority"]
                ,dStats["nRepairsMinority"])
            (nFrequency,nSegments) = (dStats["nFrequency"],dStats["nSegments"])
            lg.logInfo("MAIN", "AUDITS id|%s| client|%s| coll|%s| server|%s| "
                "ncycles|%s| nrepairs|%s| nlosses|%s| nmajority|%s| "
                "nminority|%s|"
                % (ID, sClientID, sCollectionID, sServerID,
                nNumberOfCycles, nRepairsTotal, nPermanentLosses,
                nRepairsMajority, nRepairsMinority))
            # Accumulate totals.
            TnNumberOfCycles += nNumberOfCycles
            TnRepairsTotal += nRepairsTotal
            TnPermanentLosses += nPermanentLosses
            TnRepairsMajority += nRepairsMajority
            TnRepairsMinority += nRepairsMinority
            # A couple of these are just declarations, not to be totalled.
            TnFrequency = nFrequency
            TnSegments = nSegments
    else:
        # If no auditing in this run.
        TnNumberOfCycles = TnRepairsTotal = 0
        TnPermanentLosses = TnRepairsMajority = TnRepairsMinority = 0
        TnFrequency = TnSegments = 0
    lg.logInfo("MAIN", "AUDITTOTALS ncycles|%s| nfrequency|%s| nsegments|%s| "
        "nrepairs|%s| nmajority|%s| nminority|%s| nlost|%s| "
        % (TnNumberOfCycles, TnFrequency, TnSegments, TnRepairsTotal,
        TnRepairsMajority, TnRepairsMinority, TnPermanentLosses))
    return
def fnsInventNewServer(cls):
    '''Class method: Create another server on the fly.

    Clone quality and shelf size from some still-living server, and
    make the long name unique with timestamps.
    Return the new server ID.
    '''
    # Use the first live server as the donor of configuration values.
    sDonorID = cls.fnlListLiveServerIDs()[0]
    cDonor = G.dID2Server[sDonorID]
    sNewName = "_".join([cDonor.sName,
                         util.fnsGetTimeStamp(),
                         datetime.now().strftime("%H%M%S.%f")])
    cNewServer = CServer(sNewName, cDonor.nQual, cDonor.nShelfSizeTB)
    lg.logInfo("SERVER", "created new server|%s| name|%s| "
        "quality|%s| size|%s|TB svrlife|%.0f|"
        % (cNewServer.ID, sNewName, cNewServer.nQual,
        cNewServer.nShelfSizeTB, cNewServer.mfGetMyCurrentLife()))
    return cNewServer.ID
def mAge_shelf(self, mynLifeParam):
    ''' An entire shelf fails.  Remove all the docs it contained.
        Eventually, this will trigger a repair event and make the
        collection more vulnerable during the repair.

        SimPy generator: draws an exponential shelf lifetime from
        mynLifeParam, sleeps that long, then marks the shelf dead and
        destroys every copy it held, reporting each document lost.
    '''
    fShelfLife = util.makeexpo(mynLifeParam)
    lg.logInfo("SERVER", "mAge_shelf set lifetime time|%6.0f| shelf|%s| "
        "next lifetime|%.3f|khr"
        % (G.env.now, self.ID, fShelfLife))
    NTRC.tracef(3, "SHLF", "proc mAge_shelf time|%6.0f| shelf|%s| "
        "next lifetime|%.3f|khr"
        % (G.env.now, self.ID, fShelfLife))
    yield G.env.timeout(fShelfLife)

    # S H E L F  F A I L S
    G.nTimeLastEvent = G.env.now
    self.bAlive = False     # Shelf can no longer be used to store docs.
    NTRC.tracef(3, "SHLF", "proc mAge_shelf time|%d| shelf|%s| shelf_error"
        % (G.env.now, self.ID))
    lg.logInfo("SERVER", "storage shelf failed time|%6.0f| server|%s| "
        "shelf|%s| lost |%d| docs"
        % (G.env.now, self.sServerID, self.ID, len(self.lCopyIDs)))
    # This whole shelf is a goner.  Kill it.
    NTRC.tracef(5, "SHLF", "proc mAge_shelf kill contents ldocs|%s| "
        "lcopies|%s|" % (self.lDocIDs, self.lCopyIDs))
    # Note that we have to copy the list before modifying it and
    # iterate over the copy of the list.
    # Standard problem with updating an iterable inside the for loop.
    templCopyIDs = copy.deepcopy(self.lCopyIDs)
    for sCopyID in templCopyIDs:
        sDocID = G.dID2Copy[sCopyID].sDocID
        self.mDestroyCopy(sCopyID)
        # G.dID2Server[self.sServerID].mDestroyDocument(sDocID,self.ID)
        # BUGFIX: was "sDocId" (undefined name -> NameError at runtime);
        # the local variable is "sDocID".
        G.dID2Server[self.sServerID].mDestroyCopy(sCopyID, sDocID, self.ID)
        self.mReportDocumentLost(sDocID)
    NTRC.tracef(3, "FAIL", "proc t|%d| shelf failure server|%s| qual|%d| "
        "shelf|%s| docs|%d|"
        % (G.env.now, self.sServerID,
        G.dID2Server[self.sServerID].nQual, self.ID, len(templCopyIDs)))
def mAuditCollection(self, mynCycleInterval, mynSegments,
        mysCollectionID, myeCallerSyncEvent):
    '''\
    SimPy generator to audit an entire collection.
    Divide the collection into segments and schedule audits
    for each segment in turn.  Signals completion to the caller by
    succeeding myeCallerSyncEvent with the cycle number.
    '''
    fTimeCycleBegin = G.env.now
    lg.logInfo("AUDIT2","begin colln t|%10.3f| auditid|%s| cycle|%s| cli|%s| coll|%s|"
        % (G.env.now,self.ID,self.nNumberOfCycles,self.sClientID,
        self.sCollectionID))
    for iThisSegment in range(mynSegments):
        tSegmentStartTime = G.env.now
        nSegmentInterval = self.mCalcSegmentInterval(mynCycleInterval,
            mynSegments)
        bLastSegment = (iThisSegment == mynSegments-1)
        self.lDocsThisSegment = self.mIdentifySegment(mysCollectionID,
            mynSegments, iThisSegment)
        eSyncEvent = G.env.event()
        G.env.process(
            self.mAuditSegment(iThisSegment, self.lDocsThisSegment,
            mysCollectionID, eSyncEvent))
        # Wait for completion of segment and its allotted time.
        yield eSyncEvent
        tNextSegmentStartTime = tSegmentStartTime + nSegmentInterval
        NTRC.tracef(3, "AUD2", "proc AuditCollection1 now|%s| tstart|%s| "
            "tnext|%s| tinterval|%s| blastseg|%s|"
            % (G.env.now, tSegmentStartTime, tNextSegmentStartTime,
            nSegmentInterval, bLastSegment))
        # Sleep out the remainder of this segment's time slot.
        yield G.env.timeout(tNextSegmentStartTime - G.env.now)
    fTimeCycleEnd = G.env.now
    self.fTimeCycleLength = fTimeCycleEnd - fTimeCycleBegin
    lg.logInfo("AUDIT2", "end colln t|%10.3f| auditid|%s| cycle|%s| "
        "cli|%s| coll|%s| repairs|%d| total|%d| perms|%d| "
        "majority|%s| minority|%d| duration|%9.3f|"
        % (G.env.now, self.ID, self.nNumberOfCycles, self.sClientID,
        self.sCollectionID, self.nRepairsThisCycle, self.nRepairsTotal,
        self.nPermanentLosses, self.nRepairsMajority,
        self.nRepairsMinority, self.fTimeCycleLength))
    # Tell the caller that we finished.
    myeCallerSyncEvent.succeed(value=self.nNumberOfCycles)
def mAuditCollection(self, mynCycleInterval, mynSegments,
        mysCollectionID, myeCallerSyncEvent):
    '''\
    SimPy generator to audit an entire collection.
    Divide the collection into segments and schedule audits
    for each segment in turn.  Signals completion to the caller by
    succeeding myeCallerSyncEvent with the cycle number.
    '''
    fTimeCycleBegin = G.env.now
    lg.logInfo("AUDIT2","begin colln t|%10.3f| auditid|%s| cycle|%s| cli|%s| coll|%s|"
        % (G.env.now,self.ID,self.nNumberOfCycles,self.sClientID,
        self.sCollectionID))
    for iThisSegment in range(mynSegments):
        tSegmentStartTime = G.env.now
        nSegmentInterval = self.mCalcSegmentInterval(mynCycleInterval,
            mynSegments)
        bLastSegment = (iThisSegment == mynSegments-1)
        self.lDocsThisSegment = self.mIdentifySegment(mysCollectionID,
            mynSegments, iThisSegment)
        eSyncEvent = G.env.event()
        G.env.process(
            self.mAuditSegment(iThisSegment, self.lDocsThisSegment,
            mysCollectionID, eSyncEvent))
        # Wait for completion of segment and its allotted time.
        yield eSyncEvent
        tNextSegmentStartTime = tSegmentStartTime + nSegmentInterval
        NTRC.tracef(3, "AUD2", "proc AuditCollection1 now|%s| tstart|%s| "
            "tnext|%s| tinterval|%s| blastseg|%s|"
            % (G.env.now, tSegmentStartTime, tNextSegmentStartTime,
            nSegmentInterval, bLastSegment))
        # Sleep out the remainder of this segment's time slot.
        yield G.env.timeout(tNextSegmentStartTime - G.env.now)
    fTimeCycleEnd = G.env.now
    self.fTimeCycleLength = fTimeCycleEnd - fTimeCycleBegin
    lg.logInfo("AUDIT2", "end colln t|%10.3f| auditid|%s| cycle|%s| "
        "cli|%s| coll|%s| repairs|%d| total|%d| perms|%d| "
        "majority|%s| minority|%d| duration|%9.3f|"
        % (G.env.now, self.ID, self.nNumberOfCycles, self.sClientID,
        self.sCollectionID, self.nRepairsThisCycle, self.nRepairsTotal,
        self.nPermanentLosses, self.nRepairsMajority,
        self.nRepairsMinority, self.fTimeCycleLength))
    # Tell the caller that we finished.
    myeCallerSyncEvent.succeed(value=self.nNumberOfCycles)
def mReduceSomeServerLifetimes(self, mynSpan, mynImpact):
    ''' Find a shockspan-wide subset of servers and reduce their
        expected lifetimes by the stated reduction percentage.

        Servers whose original life is non-positive (infinite) are
        skipped.  Shocked server IDs are remembered in
        self.lsServersShocked for later restoration.
    '''
    lServersToShock = server.CServer.fnlSelectServerVictims(mynSpan)
    # Convert the impact percentage into a fraction.
    fReduction = mynImpact * 1.0 / 100.0
    NTRC.ntracef(3, "SHOK", "proc reduce servers|%s| by|%s|"
        % (lServersToShock, fReduction))
    for sServerID in lServersToShock:
        lg.logInfo("SHOCK ", "t|%6.0f| reduce svr|%s| life by pct|%s|"
            % (G.env.now, sServerID, self.nImpact))
        cServer = G.dID2Server[sServerID]
        fOriginalLife = float(cServer.mfGetMyOriginalLife())
        if fOriginalLife > 0:
            self.mReduceSingleServerLifetime(sServerID, fReduction)
            self.lsServersShocked.append(sServerID)
        else:
            lg.logInfo("SHOCK ",
                "t|%6.0f| cannot reduce svr|%s| life|%.0f|"
                % (G.env.now, sServerID, fOriginalLife))
def mTestCollection(self):
    ''' Return a list, maybe empty, of documents declared missing
        from this collection.

        Also updates the per-collection counters of okay /
        majority-repaired / minority-repaired / lost documents.
        Temporarily suppresses info logging when G.bShortLog is set,
        restoring the previous state on exit.
    '''
    bOldLogState = G.bDoNotLogInfo
    if G.bShortLog:
        G.bDoNotLogInfo = True
    lDeadDocIDs = list()
    for sDocID in self.lDocIDs:
        cDoc = G.dID2Document[sDocID]
        (bOkay, bInjured, bForensics, bLost) = cDoc.mTestCopies()
        NTRC.ntracef(3, "COLL", "proc TestColl1 coll|%s| tests doc|%s| "
            "okay|%s| injured|%s| forensics|%s| lost|%s|"
            % (self.ID, sDocID, bOkay, bInjured, bForensics, bLost))
        # Merge new info with old info from audits.
        (bOkay,bInjured,bForensics,bLost) = \
            cDoc.mMergeEvaluation(bOkay,bInjured,bForensics,bLost)
        # Update stats of document statuses.
        self.nDocsOkay += 1 if bOkay else 0
        self.nDocsMajorityRepair += 1 if bInjured else 0
        self.nDocsMinorityRepair += 1 if bForensics else 0
        self.nDocsLost += 1 if bLost else 0
        # Update lost list.
        if bLost:
            lDeadDocIDs.append(sDocID)
            NTRC.ntracef(3, "COLL", "proc TestColl2 dead doc|%s| in coll|%s| "
                % (sDocID, self.ID))
        NTRC.ntracef(3, "COLL", "proc TestColl3 coll|%s| doc|%s| okay|%s| "
            "majority|%s| minority|%s| lost|%s|"
            % (self.ID, sDocID, bOkay, bInjured, bForensics, bLost))
        if not bOkay:
            (nMajority, nMinority) = cDoc.mGetRepairCounts()
            lg.logInfo("DOCUMENT", "doc injured cli|%s| coll|%s| doc|%s| "
                "majority|%s|%s| minority|%s|%s| lost|%s|"
                % (self.sClientID, self.ID, sDocID, bInjured,
                nMajority, bForensics, nMinority, bLost))
    # Restore the caller's logging state.
    G.bDoNotLogInfo = bOldLogState
    return lDeadDocIDs
def mGlitchHappens(self, myfNow):
    '''Process one glitch on this shelf at time myfNow.

    A 100% glitch, or a glitch on an already-dead server, kills the
    whole server; otherwise an error increase is injected with the
    configured decay halflife and maxlife.
    Returns (glitch count, shelf ID).
    '''
    self.bGlitchActive = True
    self.nGlitches += 1
    G.nGlitchesTotal += 1
    lg.logInfo("LIFETIME", "glitch t|%6.0f| on shelf|%s| num|%s| "
        "impactpct|%d| decayhalflife|%d| span|%d| maxlife|%d| gtotal|%s|"
        % (myfNow, self.sShelfID, self.nGlitches,
        self.nImpactReductionPct, self.nGlitchDecayHalflife,
        self.nGlitchSpan, self.nGlitchMaxlife, G.nGlitchesTotal))
    self.fGlitchBegin = float(G.env.now)
    NTRC.tracef(3, "LIFE", "proc happens1 t|%.3f| shelf|%s| num|%s| impact|%d| "
        "decayhalflife|%d| span|%d| maxlife|%d|"
        % (myfNow, self.sShelfID, self.nGlitches,
        self.nImpactReductionPct, self.nGlitchDecayHalflife,
        self.nGlitchSpan, self.nGlitchMaxlife))
    '''
    If this is a 100% glitch:
    - Declare server, not just shelf, to be dead.
    - Auditor will eventually discover the problem and call client
      to inform that server is dead.
    '''
    sServerID = self.cShelf.sServerID
    if G.dID2Server[sServerID].bDead or self.nImpactReductionPct == 100:
        self.cShelf.bAlive = False
        #sServerID = self.cShelf.sServerID
        cServer = G.dID2Server[sServerID]
        NTRC.ntracef(3, "LIFE", "proc happens2 glitch 100pct or server dead "
            "id|%s| shelf|%s| svr|%s|"
            % (self.ID, self.cShelf.ID, sServerID))
        cServer.mServerDies()
        NTRC.ntracef(3, "LIFE", "proc happens3 life|%s| killed server |%s|"
            % (self.ID, sServerID))
        lg.logInfo("LIFETIME", "100pct glitch on shelf |%s| "
            "of server|%s| - all docs lost"
            % (self.sShelfID, sServerID))
    else:
        # Partial glitch: inject a decaying error-rate increase.
        self.mInjectError(self.nImpactReductionPct,
            self.nGlitchDecayHalflife, self.nGlitchMaxlife)
    return (self.nGlitches, self.sShelfID)
def makeServers(mydServers):
    ''' Instantiate every server described in the parameter dict,
        register each in the global roster, and build the inverse
        map from quality level to the servers offering that quality.
        Returns G.dQual2Servers.
    '''
    for sServerName, lParamSets in mydServers.items():
        (nServerQual, nShelfSize) = lParamSets[0]
        cServer = server.CServer(sServerName, nServerQual, nShelfSize)
        sServerID = cServer.ID
        G.lAllServers.append(cServer)
        fCurrentLife = cServer.mfGetMyCurrentLife()
        lg.logInfo(
            "MAIN", "created server|%s| quality|%s| shelfsize|%s|TB "
            "name|%s| life|%.0f|"
            % (sServerID, nServerQual, nShelfSize, sServerName,
            fCurrentLife))
        # Invert the server list: quality level -> [name, ID] pairs,
        #  so clients can look up all servers that satisfy a
        #  quality criterion.
        G.dQual2Servers.setdefault(nServerQual, []).append(
            [sServerName, sServerID])
        NTRC.ntracef(
            5, "SVRS", "proc makeServers dQual2Servers qual|%s| servers|%s|"
            % (nServerQual, G.dQual2Servers[nServerQual]))
    return G.dQual2Servers
def mServerIsDead(self, mysServerID, mysCollID):
    '''\
    Auditor calls us: a server is dead, no longer accepting documents.
    Remove server from active list, find a new server, populate it.

    Params:  mysServerID - ID of the server that died.
             mysCollID   - ID of the collection that lost the server.
    Returns: ID of the replacement server (chosen or newly invented).
    '''
    NTRC.ntracef(3, "CLI", "proc deadserver1 client|%s| place coll|%s| "
        "to|%d|servers"
        % (self.ID, mysCollID, len(self.lServersToUse)))
    lg.logInfo("CLIENT", "server died cli|%s| removed svr|%s| coll|%s| "
        % (self.ID, mysServerID, mysCollID))
    # Drop the dead server from the collection's distribution list.
    cColl = G.dID2Collection[mysCollID]
    cColl.lServerIDs.remove(mysServerID)
    nCollValue = cColl.nValue
    lServersForCollection = self.mSelectServersForCollection(nCollValue)
    # The distribution params have already limited the
    #  set of servers in the select-for-collection routine.
    # If there are servers available, pick one.  Otherwise,
    #  create a new server that's just like an old one and use it.
    if lServersForCollection:
        sServerToUse = lServersForCollection.pop(0)
    else:
        sServerToUse = CServer.fnsInventNewServer()
    lg.logInfo("CLIENT", "client|%s| assign new server|%s| to replace|%s|"
        % (self.ID, sServerToUse, mysServerID))
    # Re-place the entire collection on the replacement server.
    nDocs = self.mPlaceCollectionOnServer(mysCollID, sServerToUse)
    lg.logInfo("CLIENT", "client|%s| provisioned new server|%s| "
        "collection|%s| ndocs|%s|"
        % (self.ID, sServerToUse, mysCollID, nDocs))
    self.nServerReplacements += 1
    return sServerToUse
def mServerIsDead(self, mysServerID, mysCollID):
    '''\
    Handle the auditor's report that a server has died.

    The dead server is removed from the collection's server list;
    a replacement is selected (or a brand-new clone is invented when
    no candidate qualifies) and the full collection is copied onto it.
    Returns the ID of the replacement server.
    '''
    NTRC.ntracef(3, "CLI", "proc deadserver1 client|%s| place coll|%s| "
        "to|%d|servers"
        % (self.ID, mysCollID, len(self.lServersToUse)))
    lg.logInfo("CLIENT", "server died cli|%s| removed svr|%s| coll|%s| "
        % (self.ID, mysServerID, mysCollID))
    # Forget the dead server, then ask for candidates good enough
    #  for a collection of this value.
    cCollection = G.dID2Collection[mysCollID]
    cCollection.lServerIDs.remove(mysServerID)
    lCandidateIDs = self.mSelectServersForCollection(cCollection.nValue)
    # Quality filtering already happened inside the selection routine;
    #  take the first candidate, or invent a fresh server if none left.
    sServerToUse = (lCandidateIDs.pop(0) if lCandidateIDs
                    else CServer.fnsInventNewServer())
    lg.logInfo("CLIENT", "client|%s| assign new server|%s| to replace|%s|"
        % (self.ID, sServerToUse, mysServerID))
    nDocs = self.mPlaceCollectionOnServer(mysCollID, sServerToUse)
    lg.logInfo("CLIENT", "client|%s| provisioned new server|%s| "
        "collection|%s| ndocs|%s|"
        % (self.ID, sServerToUse, mysCollID, nDocs))
    self.nServerReplacements += 1
    return sServerToUse
def testAllClients(mylClients):
    ''' Evaluate document losses for every client and log a per-client
        verdict plus per-collection statistics.

        BUGFIX: G.bDoNotLogInfo is now saved and restored around the
        lost-document detail listing, instead of being forced to False
        afterwards, which clobbered whatever logging state the caller
        had established (same save/restore pattern as mTestCollection).
    '''
    for cClient in mylClients:
        lDeadDocIDs = cClient.mTestClient()
        sClientID = cClient.ID
        if len(lDeadDocIDs) > 0:
            # Optionally suppress the per-document detail lines.
            bOldLogState = G.bDoNotLogInfo
            if G.bShortLog:
                G.bDoNotLogInfo = True
            for sDocID in lDeadDocIDs:
                cDoc = G.dID2Document[sDocID]
                lg.logInfo(
                    "MAIN", "client |%s| lost doc|%s| size|%s|"
                    % (sClientID, sDocID, cDoc.nSize))
            G.bDoNotLogInfo = bOldLogState
            lg.logInfo(
                "MAIN",
                "BAD NEWS: Total documents lost by client |%s| in all servers |%d|"
                % (sClientID, len(lDeadDocIDs)))
        else:
            lg.logInfo(
                "MAIN",
                "GOOD NEWS: Total documents lost by client |%s| in all servers |%d|"
                % (sClientID, len(lDeadDocIDs)))
        # Now log stats for the all collections in the client.
        lCollectionIDs = cClient.mListCollectionIDs()
        for sCollID in lCollectionIDs:
            dumpuse.dumpCollectionStats(sCollID)
def mAge_shelf(self, mynLifeParam):
    ''' An entire shelf fails.  Remove all the docs it contained.
        Eventually, this will trigger a repair event and make the
        collection more vulnerable during the repair.

        SimPy generator: draws an exponential lifetime from
        mynLifeParam, sleeps until then, then marks the shelf dead
        and destroys every copy stored on it.

        BUGFIX: the per-copy destruction call formerly referenced
        "sDocId" (lowercase d), an undefined name, so every shelf
        failure raised NameError; corrected to sDocID.
    '''
    fShelfLife = util.makeexpo(mynLifeParam)
    lg.logInfo("SERVER", "mAge_shelf set lifetime time|%6.0f| shelf|%s| "
        "next lifetime|%.3f|khr"
        % (G.env.now, self.ID, fShelfLife))
    NTRC.tracef(3, "SHLF", "proc mAge_shelf time|%6.0f| shelf|%s| "
        "next lifetime|%.3f|khr"
        % (G.env.now, self.ID, fShelfLife))
    yield G.env.timeout(fShelfLife)

    # S H E L F  F A I L S
    G.nTimeLastEvent = G.env.now
    self.bAlive = False     # Shelf can no longer be used to store docs.
    NTRC.tracef(3, "SHLF", "proc mAge_shelf time|%d| shelf|%s| shelf_error"
        % (G.env.now, self.ID))
    lg.logInfo("SERVER", "storage shelf failed time|%6.0f| server|%s| "
        "shelf|%s| lost |%d| docs"
        % (G.env.now, self.sServerID, self.ID, len(self.lCopyIDs)))
    # This whole shelf is a goner.  Kill it.
    NTRC.tracef(5, "SHLF", "proc mAge_shelf kill contents ldocs|%s| "
        "lcopies|%s|" % (self.lDocIDs, self.lCopyIDs))
    # Note that we have to copy the list before modifying it and
    #  iterate over the copy of the list.
    # Standard problem with updating an iterable inside the for loop.
    templCopyIDs = copy.deepcopy(self.lCopyIDs)
    for sCopyID in templCopyIDs:
        sDocID = G.dID2Copy[sCopyID].sDocID
        self.mDestroyCopy(sCopyID)
        G.dID2Server[self.sServerID].mDestroyCopy(sCopyID, sDocID, self.ID)
        self.mReportDocumentLost(sDocID)
    NTRC.tracef(3, "FAIL", "proc t|%d| shelf failure server|%s| qual|%d| "
        "shelf|%s| docs|%d|"
        % (G.env.now, self.sServerID,
        G.dID2Server[self.sServerID].nQual, self.ID, len(templCopyIDs)))
def mWaitForShockToHappen(self, mynHalflife):
    '''
    SimPy generator driving the shock cycle forever: schedule a
    shock, let it happen, then either schedule its expiration
    (finite maxlife) or park for the rest of the run (maxlife 0,
    meaning the shock never expires).

    Params: mynHalflife - half-life (hours) used to draw the random
     interval until the next shock.
    '''
    while True:
        # Draw the interval until the next shock; shocks recur.
        fInterval = util.makeshocklife(mynHalflife)
        lg.logInfo("SHOCK ", "t|%6.0f| waiting for shock in|%.0f| "
            "from hl|%s| at|%.0f|"
            % (G.env.now, fInterval, mynHalflife, (G.env.now+fInterval)))
        yield G.env.timeout(fInterval)
        # The shock arrives now.
        lg.logInfo("SHOCK ", "t|%6.0f| shock happens now, maxlife|%s|"
            % (G.env.now, self.nMaxlife))
        self.mShockHappens()
        if self.nMaxlife > 0:
            # Finite shock: wait it out, expire it, and loop around
            #  for the next one.
            lg.logInfo("SHOCK ", "t|%6.0f| waiting for shock to expire "
                "in|%.0f| at|%.0f|"
                % (G.env.now, self.nMaxlife, (G.env.now+self.nMaxlife)))
            yield G.env.timeout(self.nMaxlife)
            self.mShockExpires()
        else:
            # Permanent shock: sleep (effectively) forever; no
            #  further shock cycles are started.
            yield G.env.timeout(G.fInfinity)
def mReduceSomeServerLifetimes(self, mynSpan, mynImpact):
    ''' Find a shockspan-wide subset of servers and reduce their
        expected lifetimes by the stated reduction percentage.

        Params:  mynSpan   - how many servers the shock hits.
                 mynImpact - lifetime reduction, in percent.

        BUGFIX: the per-server log line formerly reported
        self.nImpact rather than the mynImpact argument actually
        being applied; they need not agree, so the parameter is
        logged now.
    '''
    lServersToShock = server.CServer.fnlSelectServerVictims(mynSpan)
    fReduction = mynImpact * 1.0 / 100.0
    NTRC.ntracef(
        3, "SHOK", "proc reduce servers|%s| by|%s|"
        % (lServersToShock, fReduction))
    for sServerID in lServersToShock:
        lg.logInfo(
            "SHOCK ", "t|%6.0f| reduce svr|%s| life by pct|%s|"
            % (G.env.now, sServerID, mynImpact))
        cServer = G.dID2Server[sServerID]
        fOriginalLife = float(cServer.mfGetMyOriginalLife())
        # A server with no finite lifetime cannot be shocked.
        if fOriginalLife > 0:
            self.mReduceSingleServerLifetime(sServerID, fReduction)
            self.lsServersShocked.append(sServerID)
        else:
            lg.logInfo(
                "SHOCK ", "t|%6.0f| cannot reduce svr|%s| life|%.0f|"
                % (G.env.now, sServerID, fOriginalLife))
def testAllClients(mylClients):
    ''' For each client, list the documents lost across all servers
        and log a good-news/bad-news verdict, then dump statistics
        for each of the client's collections.

        BUGFIX: the previous code set G.bDoNotLogInfo = False
        unconditionally after the detail listing, clobbering the
        caller's logging configuration; the prior value is now
        saved and restored instead.
    '''
    for cClient in mylClients:
        lDeadDocIDs = cClient.mTestClient()
        sClientID = cClient.ID
        if len(lDeadDocIDs) > 0:
            # Suppress per-document detail lines when short-log is on.
            bPriorLogState = G.bDoNotLogInfo
            if G.bShortLog:
                G.bDoNotLogInfo = True
            for sDocID in lDeadDocIDs:
                cDoc = G.dID2Document[sDocID]
                lg.logInfo("MAIN", "client |%s| lost doc|%s| size|%s|"
                    % (sClientID, sDocID, cDoc.nSize))
            G.bDoNotLogInfo = bPriorLogState
            lg.logInfo("MAIN",
                "BAD NEWS: Total documents lost by client |%s| in all servers |%d|"
                % (sClientID, len(lDeadDocIDs)))
        else:
            lg.logInfo("MAIN",
                "GOOD NEWS: Total documents lost by client |%s| in all servers |%d|"
                % (sClientID, len(lDeadDocIDs)))
        # Now log stats for the all collections in the client.
        lCollectionIDs = cClient.mListCollectionIDs()
        for sCollID in lCollectionIDs:
            dumpuse.dumpCollectionStats(sCollID)
def mWaitForShockToHappen(self, mynHalflife):
    ''' Generator that waits for shock event.
        Infinite loop:
        - Schedule shock event
        - Execute shock event
        - Schedule end of shock (maybe infinite)
        - Execute end of shock

        Params:  mynHalflife - half-life (hours) used to draw the
         random interval until the next shock.
        Yields SimPy timeouts; never returns.
    '''
    while True:
        # Schocks happen every so often, not just once.
        fNewLife = util.makeshocklife(mynHalflife)
        lg.logInfo(
            "SHOCK ", "t|%6.0f| waiting for shock in|%.0f| "
            "from hl|%s| at|%.0f|"
            % (G.env.now, fNewLife, mynHalflife, (G.env.now + fNewLife)))
        # Suspend action until shock happens.
        yield G.env.timeout(fNewLife)
        # Shock has happened.
        lg.logInfo(
            "SHOCK ", "t|%6.0f| shock happens now, maxlife|%s|"
            % (G.env.now, self.nMaxlife))
        self.mShockHappens()
        # If maxlife nonzero, then wait and expire shock;
        #  else, never expires, so wait forever and don't
        #  start another shock cycle.
        if self.nMaxlife > 0:
            lg.logInfo(
                "SHOCK ", "t|%6.0f| waiting for shock to expire "
                "in|%.0f| at|%.0f|"
                % (G.env.now, self.nMaxlife, (G.env.now + self.nMaxlife)))
            yield G.env.timeout(self.nMaxlife)
            self.mShockExpires()
        else:
            # Shock is permanent: park this process until end of run.
            yield G.env.timeout(G.fInfinity)
def mReduceSingleServerLifetime(self, mysServerID, myfReduction):
    ''' Reduce the lifetime of a single server.
        Two possible methods, selected by a globaldata const nShockType.
        - 1: lifetime, which was already a random from a distribution
          with the standard server half-life, is then reduced by some
          percentage during the shock period.
        - 2: lifetime during the shock period is a new random chosen
          from a distribution with half-life reduced *from its current
          lifetime* by the shock percentage.

        BUGFIXES:
        - unknown-shock-type branch used NTRC.ntrace (undefined; every
          other call site uses ntracef) and then fell through to use
          an unbound fNewLife; it now raises ValueError, the "bugcheck
          fatal error" the original comment called for.
    '''
    cServer = G.dID2Server[mysServerID]
    fCurrentLife = cServer.mfGetMyCurrentLife()
    fOriginalLife = cServer.mfGetMyOriginalLife()
    # Hack to experiment with the two types of shock to see if they
    #  are statistically different.
    if G.nShockType == 1:
        # Type 1: Lifetime during the shock period is the
        #  reduction of the original lifetime by the given
        #  percentage.
        # That is, the server gets a single life expectation at
        #  birth, and it may be reduced by a shock and then
        #  restored at the end of the shock period, provided
        #  that it has not expired during the shock period.
        fNewLifeParam = (1.0 - myfReduction) * fCurrentLife
        # Lifetime cannot actually be zero for 100% reduction, so
        #  make it just really, really small, like 2 hours.
        fNewLifeParam = max(fNewLifeParam, 2.0)
        NTRC.ntracef(3, "SHOK", "proc shock1 at t|%8.0f| svr|%s| new"
            "lifeparam|%.0f| shocktype|%s|"
            % (G.env.now, mysServerID, fNewLifeParam, G.nShockType))
        fNewLife = fNewLifeParam
    elif G.nShockType == 2:
        # Type 2: lifetime during shock period is a new
        #  random chosen from a distribution with less than the
        #  lifetime of the old one.
        fNewLifeParam = (1.0 - myfReduction) * fOriginalLife
        # Lifetime cannot actually be zero for 100% reduction, so
        #  make it just really, really small, like 2 hours.
        fNewLifeParam = max(fNewLifeParam, 2.0)
        NTRC.ntracef(3, "SHOK", "proc shock1 at t|%8.0f| svr|%s| new"
            "lifeparam|%.0f| shocktype|%s|"
            % (G.env.now, mysServerID, fNewLifeParam, G.nShockType))
        fNewLife = util.makeserverlife(fNewLifeParam)
    else:
        NTRC.ntracef(0, "SHOK", "proc ERROR at t|%8.0f| svr|%s| "
            "unknown shock type|%s|"
            % (G.env.now, mysServerID, G.nShockType))
        # Bugcheck: cannot proceed without a valid shock type.
        raise ValueError("Unknown shock type |%s|" % (G.nShockType,))
    NTRC.ntracef(3, "SHOK", "proc shock2 at t|%8.0f| svr|%s| new"
        "life|%.0f| shocktype|%s|"
        % (G.env.now, mysServerID, fNewLife, G.nShockType))
    lg.logInfo("SHOCK ", "t|%6.0f| reduce svr|%s| life by|%s| from|%.0f| to"
        "|%.0f| shocktype|%s|"
        % (G.env.now, mysServerID, myfReduction, fOriginalLife, fNewLife,
        G.nShockType))
    cServer.mRescheduleMyLife(fNewLife)
    cServer.mSetServerInShock(True)
    return
def mKillShelf(self):
    ''' Mark this shelf dead and discard every document it holds. '''
    sMsg = ("t|%6.0f| kill storage shelf|%s| of server|%s|"
            % (G.env.now, self.ID, self.sServerID))
    lg.logInfo("SHELF ", sMsg)
    self.bAlive = False
    self.mDestroyShelf()
def mAuditSegment(self, mynThisSegment, mylDocs, mysCollectionID,
        myeCallerSyncEvent):
    '''\
    SimPy generator to audit one segment of a collection.
    This does all the work.

    Phases:
      0. Kill any servers that have outlived their (possibly
         shock-shortened) lifetimes so this segment sees the loss.
      1. Try to retrieve each doc of the segment from every server
         holding the collection; record copies found missing.
      2. Classify each damaged doc (majority/minority/permloss)
         according to how many copies survive.
      3. Re-place repairable docs on the servers that lost them;
         remember any server that refuses documents (dead).
    Finally signal the caller via myeCallerSyncEvent and inform
    clients of any dead servers found.

    NOTE(review): phase 1 iterates self.lDocsThisSegment, while the
    mylDocs argument is used only for logging -- presumably they are
    the same list; confirm against the caller.

    BUGFIX: the dead-server notification trace at the end formerly
    referenced stale loop variables sServerID/sDocID, which are
    unbound (NameError) when self.stDeadServerIDs carries entries
    over from a previous segment; it now reports the dead server
    actually being handled.
    '''
    lg.logInfo("AUDIT2", "begin segmt t|%10.3f| auditid|%s| cycle|%s| "
        "seg|%s| cli|%s| coll|%s| ndocs|%s|range %s-%s|"
        % (G.env.now, self.ID, self.nNumberOfCycles, mynThisSegment,
        self.sClientID, self.sCollectionID, len(mylDocs),
        mylDocs[0], mylDocs[-1]))
    # Seize the network resource so this audit cycle can use it
    #  exclusively; the "with" takes care of releasing it.
    cClient = G.dID2Client[self.sClientID]
    with cClient.NetworkBandwidthResource.request() as reqnetwork:
        fNetworkWaitBegin = G.env.now
        yield reqnetwork        # Wait for network to be free.
        fNetworkWaitTime = G.env.now - fNetworkWaitBegin
        # Log whether (and how long) we had to wait for the network.
        lg.logInfo("AUDIT2", "grabnetwork t|%10.3f| auditid|%s| cli|%s| "
            "coll|%s| seg|%s| delay|%9.3f|"
            % (G.env.now, self.ID, self.sClientID, self.sCollectionID,
            mynThisSegment, fNetworkWaitTime))

        # P h a s e  0: Check to see if any servers have died of old
        #  age, possibly from being weakened by shock.  If so, they get
        #  killed now so that this audit segment will discover the loss.
        CShock.cmBeforeAudit()

        # P h a s e  1: Check servers for copies of docs, record losses.
        # Docs already permanently lost will not be put on the
        #  damaged list.
        self.dlDocsDamagedOnServers = cc.defaultdict(list)
        cCollection = G.dID2Collection[mysCollectionID]
        for sServerID in cCollection.lServerIDs:
            cServer = G.dID2Server[sServerID]
            for sDocID in self.lDocsThisSegment:
                cDoc = G.dID2Document[sDocID]
                # If the doc is still on the server, retrieve it and
                #  spend simulated time doing that.  If not, record
                #  that the doc is damaged on this server.
                fTransferTime = self.mRetrieveDoc(sDocID, sServerID)
                if fTransferTime:
                    NTRC.tracef(3, "AUD2", "proc AuditSegment3 retrieve "
                        "t|%10.3f| doc|%s| svr|%s| xfrtim|%f|"
                        % (G.env.now, sDocID, sServerID, fTransferTime))
                    yield G.env.timeout(fTransferTime)
                else:
                    # Do not complain if doc already known to be lost.
                    if not self.mIsDocumentLost(sDocID):
                        # Copy missing here: save server in the
                        #  lost-list for this doc.
                        self.dlDocsDamagedOnServers[sDocID].append(sServerID)
                        NTRC.tracef(5, "AUD2", "proc AuditSegment2 doc|%s| "
                            "svr|%s| lost on|%s|"
                            % (sDocID, sServerID,
                            self.dlDocsDamagedOnServers[sDocID]))
                        lg.logInfo("AUDIT2", "copymissing t|%10.3f| "
                            "doc|%s| svr|%s| aud|%s-c%s-s%s| cli|%s| "
                            "coll|%s|"
                            % (G.env.now, sDocID, sServerID, self.ID,
                            self.nNumberOfCycles, mynThisSegment,
                            self.sClientID, self.sCollectionID))

        # P h a s e  2: Record severity (majority/minority/permanent)
        #  of copy losses.
        # NOTE: with two servers, one remaining copy out of two counts
        #  as a "majority"; debatable, but preserved from the original.
        nServers = len(cCollection.lServerIDs)
        nMajority = (len(cCollection.lServerIDs)+1) / 2  # py2 int div
        for sDocID in sorted(self.dlDocsDamagedOnServers.keys(),
                key=util.fniNumberFromID):
            lDocLostOnServers = self.dlDocsDamagedOnServers[sDocID]
            nCopiesLeft = nServers - len(lDocLostOnServers)
            NTRC.tracef(3, "AUD2", "proc AuditSegment1 doc|%s| nsvr|%s| "
                "loston|%s| nleft|%s|"
                % (sDocID, nServers, lDocLostOnServers, nCopiesLeft))
            if nCopiesLeft == 0:
                # N O N E remain: report permanent loss, one ping only.
                #  (An already-lost doc never re-enters the damaged
                #  list, so there is no double counting.)
                sRepair = "permloss"
                lg.logInfo("AUDIT2", "perm loss t|%10.3f| doc|%s| "
                    "aud|%s-c%s-s%s| cli|%s| coll|%s|"
                    % (G.env.now, sDocID, self.ID, self.nNumberOfCycles,
                    mynThisSegment, self.sClientID, self.sCollectionID))
                self.mRecordDocumentLost(sDocID)
            else:
                # Repairable: classify by how many copies survive.
                sRepair = ("majority" if nCopiesLeft >= nMajority
                           else "minority")
                lg.logInfo("AUDIT2", "%s rp t|%10.3f| doc|%s| "
                    "aud|%s-c%s-s%s| cli|%s| coll|%s|"
                    % (sRepair, G.env.now, sDocID, self.ID,
                    self.nNumberOfCycles, mynThisSegment,
                    self.sClientID, self.sCollectionID))

            # P h a s e  3: repair damaged docs, if possible: put a
            #  copy back on each server where it is missing.
            for sServerID in lDocLostOnServers:
                if nCopiesLeft > 0:
                    fTransferTime = self.mRepairDoc(sDocID, sServerID)
                    if fTransferTime == False:
                        # Server no longer accepts documents.  Remember
                        #  it; the client is notified only after the
                        #  segment ends, to avoid confusing the ongoing
                        #  evaluation.
                        self.stDeadServerIDs.add((sServerID,
                            self.sCollectionID))
                        lg.logInfo("AUDIT2", "dead server t|%10.3f| "
                            "doc|%s| aud|%s| cli|%s| coll|%s| svr|%s|"
                            % (G.env.now, sDocID, self.ID, self.sClientID,
                            self.sCollectionID, sServerID))
                    else:
                        NTRC.tracef(3, "AUD2", "proc AuditSegment4 repair "
                            "t|%10.3f| doc|%s| svr|%s| xfrtim|%f| type|%s|"
                            % (G.env.now, sDocID, sServerID, fTransferTime,
                            sRepair))
                        yield G.env.timeout(float(fTransferTime))
                        lg.logInfo("AUDIT2", "repair doc t|%10.3f| "
                            "doc|%s| aud|%s| cli|%s| coll|%s| svr|%s| "
                            "from %s copies|%d|"
                            % (G.env.now, sDocID, self.ID, self.sClientID,
                            self.sCollectionID, sServerID, sRepair,
                            nCopiesLeft))
                        # Count the successful repair by its type.
                        if sRepair == "majority":
                            self.mRecordDocumentMajorityRepair(sDocID)
                        else:
                            self.mRecordDocumentMinorityRepair(sDocID)
            # end foreach server that lost this doc
        # end foreach damaged doc
        lg.logInfo("AUDIT2", "end segmt t|%10.3f| auditid|%s| "
            "cycle|%s| seg|%s| cli|%s| coll|%s| ndocs|%s|"
            % (G.env.now, self.ID, self.nNumberOfCycles, mynThisSegment,
            self.sClientID, self.sCollectionID, len(mylDocs)))
        # Tell the caller we finished, then fall out of the "with"
        #  to release the network resource.
        myeCallerSyncEvent.succeed(value=mynThisSegment)
        lg.logInfo("AUDIT2", "rls network t|%10.3f| auditid|%s| "
            "cli|%s| coll|%s| seg|%s|"
            % (G.env.now, self.ID, self.sClientID, self.sCollectionID,
            mynThisSegment))
    # If we saw any dead servers during this segment, inform the clients.
    for (sDeadServerID, sDeadCollectionID) in self.stDeadServerIDs:
        cCollection = G.dID2Collection[self.sCollectionID]
        cClient = G.dID2Client[cCollection.sClientID]
        NTRC.ntracef(3, "AUD2", "proc t|%10.3f| inform dead server "
            "auditid|%s| cli|%s| coll|%s| svr|%s|"
            % (G.env.now, self.ID, self.sClientID, self.sCollectionID,
            sDeadServerID))
        cClient.mServerIsDead(sDeadServerID, sDeadCollectionID)
    self.stDeadServerIDs = set()
def mAge_sector(self):
    ''' A sector in the shelf fails.  This corrupts a document.
        For the moment, assume that it destroys the document.
        Eventually, it will have a probability of destroying the
        document depending on the portion of the document corrupted
        and the sensitivity of the document to corruption
        (e.g., compressed or encrypted), or the failure hits an
        encryption or license key.

        SimPy generator: repeatedly draws a sector-failure interval,
        sleeps until the failure, and destroys (or misses) a copy,
        for as long as the shelf remains alive.
    '''
    # If the shelf has been emptied by a shelf failure, stop
    #  caring about sector failures.
    while self.bAlive:
        # Sector lifetime depends on shelf lifetime and glitch age.
        fNow = G.env.now
        cLifetime = G.dID2Lifetime[self.sSectorLifetimeID]
        fLifetimeNow = cLifetime.mfCalcCurrentSectorLifetime(fNow)
        fSectorLifeInterval = util.makeexpo(fLifetimeNow)
        NTRC.tracef(3, "SHLF", "proc mAge_sector time|%d| shelf|%s| "
            "next interval|%.3f|hr from life rate|%.3f|hr"
            % (G.env.now, self.ID, fSectorLifeInterval, fLifetimeNow))
        yield G.env.timeout(fSectorLifeInterval)

        # S E C T O R  E R R O R
        self.nSectorHits += 1
        G.nTimeLastEvent = G.env.now
        NTRC.tracef(3, "SHLF", "proc mAge_sector time|%d| shelf|%s| "
            "Sector_error hits|%d| emptyhits|%d|"
            % (G.env.now, self.ID, self.nSectorHits, self.nEmptySectorHits))
        # Select a victim Document, probability proportional to size.
        # Small error, size=1.  What doc dies as a result?
        sCopyVictimID = self.mSelectVictimCopy(mynErrorSize=1)
        # New version: compress strings of consecutive misses into
        #  single line.
        # Normally we log one line per error regardless of whether it
        #  hits or misses a document.  That results in hideously long
        #  log files for sparse storage structures, like small docs on
        #  large shelf.  Count consecutive misses, and issue one
        #  summary line before the next hit.
        # CANDIDATE FOR REFACTORING
        if sCopyVictimID:
            # Hidden error in victim doc: destroy the copy on
            #  this shelf.
            cCopy = G.dID2Copy[sCopyVictimID]
            sDocID = cCopy.mGetDocID()
            self.mDestroyCopy(sCopyVictimID)
            # Log the summary line if we just ended a string of misses.
            if self.nConsecutiveMisses > 0:
                lg.logInfo("SERVER", "small error t|%6.0f| svr|%s| "
                    "shelf|%s| consecutive misses|%d|"
                    % (G.env.now, self.sServerID, self.ID,
                    self.nConsecutiveMisses))
                self.nConsecutiveMisses = 0
            lg.logInfo("SERVER", "small error t|%6.0f| svr|%s| "
                "shelf|%s| hidden failure in copy|%s| doc|%s|"
                % (G.env.now, self.sServerID, self.ID, sCopyVictimID,
                sDocID))
            NTRC.tracef(3, "FAIL", "proc t|%d| sector failure server|%s| "
                "qual|%d| shelf|%s| doc|%s| copy|%s|"
                % (G.env.now, self.sServerID,
                G.dID2Server[self.sServerID].nQual, self.ID, sDocID,
                sCopyVictimID))
        else:
            # No victim, hit empty space.
            self.nEmptySectorHits += 1
            NTRC.tracef(3, "SHLF", "proc mAge_sector shelf|%s| "
                "sector error fell in empty space"
                % (self.ID))
            # NOTE(review): sCopyVictimID is falsy in this branch, so
            #  the "hidden failure in copy|...|" line logged on the
            #  first miss of a streak reports an empty victim; looks
            #  like a start-of-streak marker, but confirm intent.
            if self.nConsecutiveMisses == 0:
                lg.logInfo("SERVER", "small error t|%6.0f| svr|%s| "
                    "shelf|%s| hidden failure in copy|%s|"
                    % (G.env.now, self.sServerID, self.ID, sCopyVictimID))
            self.nConsecutiveMisses += 1
            NTRC.tracef(3, "FAIL", "proc t|%d| sector failure server|%s| "
                "qual|%d| shelf|%s| copy|%s|"
                % (G.env.now, self.sServerID,
                G.dID2Server[self.sServerID].nQual, self.ID,
                sCopyVictimID))
        # Initiate a repair of the dead document.
        # BZZZT NYI: currently all such failures are silent, so they
        #  are not detected by the client until audited (or end of run).
    # Shelf is no longer alive, so we do not notice or schedule
    #  future sector errors.  Log the event.
    lg.logInfo("SHELF ", "t|%6.0f| dead shelf|%s| of svr|%s|, "
        "no future errors"
        % (G.env.now, self.ID, self.sServerID))
def makeClients(mydClients):
    ''' Instantiate one CClient per entry in the param dict, add it
        to the global client roster, and return that roster.
    '''
    for sClientName, lClientParams in mydClients.items():
        cClient = client.CClient(sClientName, lClientParams)
        G.lAllClients.append(cClient)
        lg.logInfo("MAIN", "created client|%s|" % (cClient.ID))
    return G.lAllClients
def dumpShockStats():
    ''' Log end-of-run summary statistics about shock events:
        totals, deaths attributed to shocks, and shock times.
    '''
    lTemplatesAndArgs = [
        ("SHOCKS Total shocks|%d| deaths due to shock|%d: %s|",
         (G.nShocksTotal, G.nDeathsDueToShock, G.lDeathsDueToShock)),
        ("SHOCKS When nshocks|%d| at times|%s|",
         (G.nShocksTotal, G.lShockTimes)),
        ]
    for sTemplate, tArgs in lTemplatesAndArgs:
        lg.logInfo("MAIN", sTemplate % tArgs)
def mIdentifySegment(self, mysCollectionID, mynSegments, iCurrentSegment):
    '''
    Choose the subset of a collection's documents to be audited in
    this segment.

    Params:  mysCollectionID - collection being audited.
             mynSegments     - total number of segments per cycle.
             iCurrentSegment - zero-based index of this segment.
    Returns: sorted list of doc IDs to audit (possibly empty when
     no documents remain alive).
    '''
    # Get list of document IDs in the collection
    cCollection = G.dID2Collection[mysCollectionID]
    lDocIDsRemaining = cCollection.mListDocumentsRemaining()
    nDocsRemaining = len(lDocIDsRemaining)
    # Beware the case where there are fewer docs remaining alive
    #  than are normally called for in the segment.
    nDocsMaybe = min(
        self.mCalcSegmentSize(mysCollectionID, mynSegments),
        nDocsRemaining
        )
    if 1:   # Which method do we choose today?
            # TRUE = generous assumptions, each segment will audit the
            #  segment's size number of unique documents WITHOUT
            #  REPLACEMENT.
            # FALSE = naive assumptions, each segment will choose the
            #  segment's size number of document WITH REPLACEMENT.
        # Use set() to ensure that a doc is not sampled twice in a
        #  segment.  This is sampling WITHOUT REPLACEMENT FOR A
        #  SEGMENT ONLY, to ensure that the right portion of the
        #  population gets audited.
        # (I don't know what people who say, "audit one quarter of the
        #  population every quarter" actually mean.  This is a
        #  plausible but optimistic guess.)
        # Note that the cycle is still sampled WITH repacement overall,
        #  so that some docs may be missed, say, quarter to quarter.
        # For sampling with replacement, use a list and just append
        #  doc IDs to the list.  Then a doc might be sampled > once per
        #  segment, which would be even more useless.
        # Also, beware the case where there are no docs remaining.
        #  Carefully return an empty list.
        setDocsThisSegment = set()
        while nDocsMaybe > 0 and len(setDocsThisSegment) < nDocsMaybe:
            idxChoose = int(util.makeunif(0, nDocsRemaining))
            sDocID = lDocIDsRemaining[idxChoose]
            cDoc = G.dID2Document[sDocID]
            # If the doc is not already permanently kaput, check it.
            if not cDoc.mIsLost():
                setDocsThisSegment.add(sDocID)
        lDocsThisSegment = util.fnlSortIDList(list(setDocsThisSegment))
        sRandomType = "GENEROUS"
    else:
        # Do this the simple, stupid way: select a segment's size list
        #  of uniform random numbers and make that the audit list for
        #  this segment.  Note that this will be SAMPLING WITH
        #  REPLACEMENT WITHIN A SEGMENT, so that the number of
        #  documents actually audited during a segment will (almost
        #  certainly) be much less than the naively-desired number due
        #  to balls-in-urns effects of replacement.  I think that this
        #  is what most people would actually implement naively for
        #  "random auditing."
        # The list may/will contain duplicates, and we don't care.
        lDocsThisSegment = []
        while nDocsMaybe > 0 and len(lDocsThisSegment) < nDocsMaybe:
            idxChoose = int(util.makeunif(0, nDocsRemaining))
            sDocID = lDocIDsRemaining[idxChoose]
            cDoc = G.dID2Document[sDocID]
            # If the doc is not already permanently kaput, check it.
            if not cDoc.mIsLost():
                lDocsThisSegment.append(sDocID)
        lDocsThisSegment = util.fnlSortIDList(list(set(lDocsThisSegment)))
        sRandomType = "NAIVE"
    lg.logInfo("AUDIT2", "choose seg t|%10.3f| auditid|%s| seg|%s|of|%s| "
        "ndocsrem|%s| chosen|%s| type|%s|"
        % (G.env.now, self.ID, iCurrentSegment+1, mynSegments,
        nDocsRemaining, len(lDocsThisSegment), sRandomType))
    return lDocsThisSegment