def loop(self, duration):
    """Run the simulation for `duration` time units, then run the finalizers.

    Each iteration advances the clock to the time reported by
    self._first_time() (capped at the end of the run), executes every action
    scheduled for that timestamp and continues every thread whose timestamp
    in self.thread_map has been reached.

    Args:
      duration: How long to run, relative to the current clock time.
    """
    gauge = Gauge.get('scheduler.latency')
    gauge.start_timer()
    until = duration + clock.get_time()

    # Runs for the specified amount of time.
    while clock.get_time() < until:
        # Advances the clock to the first scheduled time, but never past the
        # end of the run.
        t = min(self._first_time(), until)
        clock.set_time(t)

        # If there are scheduled items to be done right now, do them.
        if t in self.schedule:
            # Detaches the action list for this timestamp from the schedule
            # before running it. An action may then schedule something for
            # the current time without touching the list being iterated.
            for task in self.schedule.pop(t):
                target = task[0]
                if isinstance(target, types.LambdaType):
                    # Plain functions and lambdas are called without argument.
                    target()
                else:
                    target.execute(task[1])

        # And continue any threads which need to continue. The loop walks a
        # snapshot because update_thread() writes to thread_map, so the
        # iteration stays valid even if continuing a thread changes the set
        # of registered threads.
        for thread, timestamp in list(self.thread_map.items()):
            if timestamp <= t:
                self.update_thread(thread, thread.thread_continue())

    # Stop the timer.
    gauge.stop_timer()

    # Runs the finalizers.
    logger.info('Running finalizers')
    for target in self.finalizers:
        if isinstance(target, types.LambdaType):
            target()
        else:
            target.finalize()

    # Ends the simulation.
    logger.info('End of simulation. It took %lf seconds' % gauge.get_value())
def create_lease(self, resource, capacity):
    """Build a CapacityLease carrying `capacity` for `resource`.

    The lease expires self.lease_duration_secs from now, or when the lease
    this server itself holds on the resource (resource.has) expires,
    whichever comes first.

    Args:
      resource: Resource state entry; its 'has' field, when set, caps the
        expiry time of the new lease.
      capacity: Capacity to store in the lease.

    Returns:
      The populated lease.
    """
    now = clock.get_time()

    new_lease = CapacityLease()
    new_lease.capacity = capacity
    new_lease.refresh_interval = self.get_refresh_interval()

    # A lease handed out here must not outlive the lease this server holds
    # itself, so the default expiry is capped by it when there is one.
    if not resource.HasField('has'):
        new_lease.expiry_time = now + self.lease_duration_secs
    else:
        new_lease.expiry_time = min(resource.has.expiry_time,
                                    now + self.lease_duration_secs)

    # Edge case: close to the expiry, the regular refresh interval would
    # ask for a refresh at or after the moment the lease has expired. Pull
    # the refresh forward to just before the expiry in that case.
    refresh_at = now + new_lease.refresh_interval
    if refresh_at >= new_lease.expiry_time:
        new_lease.refresh_interval = new_lease.expiry_time - now - 1

    assert new_lease.IsInitialized()
    return new_lease
def create_lease(self, resource, capacity):
    """Create a lease object for `capacity` on `resource`.

    The expiry is the earlier of "now + self.lease_duration_secs" and the
    expiry of resource.has (when that field is set). The refresh interval
    comes from self.get_refresh_interval() and is shortened when it would
    otherwise land on or after the expiry.

    Args:
      resource: Resource state entry the lease is created for.
      capacity: Capacity to record in the lease.

    Returns:
      The initialized CapacityLease.
    """
    now = clock.get_time()

    result = CapacityLease()
    result.capacity = capacity
    result.refresh_interval = self.get_refresh_interval()

    # The server cannot promise capacity for longer than it holds capacity
    # itself.
    if resource.HasField('has'):
        expiry = min(resource.has.expiry_time, now + self.lease_duration_secs)
    else:
        expiry = now + self.lease_duration_secs
    result.expiry_time = expiry

    # Near the end of the lease the refresh must still happen before the
    # expiry; move it to one time unit before the lease runs out.
    if now + result.refresh_interval >= result.expiry_time:
        result.refresh_interval = result.expiry_time - now - 1

    assert result.IsInitialized()
    return result
def find_resource(self, resource_id):
    """Return the state entry for `resource_id`, creating it when missing.

    Must only be called on the master. A missing entry is created from the
    resource template found in the global configuration and is given a
    learning mode expiry time.

    Args:
      resource_id: Identifier of the resource to look up.

    Returns:
      The existing or newly created resource entry, or None when no
      template describes the resource.
    """
    # This can only happen in the master!
    assert self.server.is_master()

    # Linear scan through the resources already present in the state.
    for existing in self.wrapped_state.resource:
        if existing.resource_id == resource_id:
            return existing

    # Not found: look for the template that describes this resource.
    template = global_config.find_resource_template(resource_id)
    if not template:
        logger.error(
            'Cannot create server state entry for resource %s (no template)' %
            resource_id)
        return None

    # Creates a new blank resource entry from the template.
    logger.info(
        '%s creating new resource %s' % (self.get_server_id(), resource_id))
    entry = self.wrapped_state.resource.add()
    entry.template.CopyFrom(template)
    entry.resource_id = resource_id

    # Learning mode ends one maximum lease duration after the election
    # victory. Note: this moment may already be in the past.
    victory_time = self.wrapped_state.election_victory_time
    algo = AlgorithmImpl.create(template, self.wrapped_state.server_level)
    entry.learning_mode_expiry_time = (
        victory_time + algo.get_max_lease_duration())

    if entry.learning_mode_expiry_time > clock.get_time():
        logger.info(
            '%s putting resource %s in learning mode until T=%d' %
            (self.get_server_id(), resource_id,
             entry.learning_mode_expiry_time))

        # Schedules an action to log a message once the resource has left
        # learning mode.
        scheduler.add_absolute(
            entry.learning_mode_expiry_time + 1,
            _LeaveLearningMode(),
            (self.get_server_id(), entry.resource_id))

    # Note: the server holds no capacity lease for this resource yet; it is
    # up to the caller to deal with that.
    assert entry.IsInitialized()
    return entry
def add_absolute(self, time, target, arg=None):
    """Schedule `target` to run at the absolute time `time`.

    Args:
      time: Timestamp under which the item is filed in self.schedule. A
        timestamp before the current clock time is accepted but logged.
      target: The thing to run. A plain function or lambda is detected by
        its type; `arg` has no use for it and is only stored.
      arg: Optional argument stored alongside the target.

    Returns:
      `time`, unchanged.
    """
    if time < clock.get_time():
        logger.warning('Scheduling action in the past!')

    if isinstance(target, types.LambdaType) and arg is not None:
        logger.warning('Non-None argument ignored for lambda callback')

    # Files the schedulable item (a target and argument tuple) under its
    # timestamp, creating the list for that timestamp on first use.
    self.schedule.setdefault(time, []).append((target, arg))
    return time
def find_resource(self, resource_id):
    """Look up the state entry of a resource, adding one if there is none.

    Only valid on the master. When the state has no entry for the resource,
    one is built from the matching template in the global configuration and
    its learning mode expiry time is calculated.

    Args:
      resource_id: Identifier of the wanted resource.

    Returns:
      The resource entry, or None if the configuration has no template for
      this resource.
    """
    # This can only happen in the master!
    assert self.server.is_master()

    # Walks all resources in the state looking for a matching id.
    for candidate in self.wrapped_state.resource:
        if candidate.resource_id == resource_id:
            return candidate

    # Nothing in the state yet, so a template is needed to create the entry.
    template = global_config.find_resource_template(resource_id)
    if not template:
        logger.error(
            'Cannot create server state entry for resource %s (no template)' %
            resource_id)
        return None

    logger.info('%s creating new resource %s' %
                (self.get_server_id(), resource_id))
    created = self.wrapped_state.resource.add()
    created.template.CopyFrom(template)
    created.resource_id = resource_id

    # Calculates when this resource leaves learning mode: the election
    # victory time plus the maximum lease duration. May lie in the past.
    max_lease = AlgorithmImpl.create(
        template, self.wrapped_state.server_level).get_max_lease_duration()
    created.learning_mode_expiry_time = (
        self.wrapped_state.election_victory_time + max_lease)

    still_learning = created.learning_mode_expiry_time > clock.get_time()
    if still_learning:
        logger.info('%s putting resource %s in learning mode until T=%d' %
                    (self.get_server_id(), resource_id,
                     created.learning_mode_expiry_time))

        # Arranges for a log message right after learning mode has ended.
        scheduler.add_absolute(created.learning_mode_expiry_time + 1,
                               _LeaveLearningMode(),
                               (self.get_server_id(), created.resource_id))

    # Note: no capacity lease exists for this resource at this point. The
    # caller has to take care of that.
    assert created.IsInitialized()
    return created
def _get_capacity(self):
    """Obtain capacity for all resources in this server's state.

    A level 0 server gives itself a fresh lease per resource, using the
    capacity from the resource template. Any other level delegates to
    self._get_capacity_downstream().

    Returns:
      True for a level 0 server, otherwise whatever
      self._get_capacity_downstream() returned.
    """
    assert self.is_master()
    now = clock.get_time()

    if self.server_level != 0:
        # A server that is not the root gets its capacity from a
        # downstream server.
        success = self._get_capacity_downstream()
    else:
        # The root takes its capacity from the configuration.
        for resource in self.state.all_resources():
            algo = AlgorithmImpl.create(resource.template, self.server_level)
            resource.ClearField('has')
            resource.has.CopyFrom(
                algo.create_lease(resource, resource.template.capacity))

            # Capacity from the configuration lasts forever, yet a refresh
            # interval and fairly short leases are still used so that
            # configuration changes get picked up.
            resource.has.refresh_interval *= 2

        success = True

    logger.info('%s resource state after getting capacity:' % self.server_id)
    for resource in self.state.all_resources():
        lease = resource.has
        logger.info('resource: %s got: %lf lease: %d refresh: %d' %
                    (resource.resource_id, lease.capacity,
                     lease.expiry_time - now, lease.refresh_interval))

    return success
def _get_capacity(self):
    """Refresh the capacity this master holds for each of its resources.

    At server level 0 the capacity is read from each resource's template
    and wrapped in a newly created lease. At every other level the work is
    handed to self._get_capacity_downstream().

    Returns:
      Whether capacity was obtained: always True at level 0, otherwise the
      result of self._get_capacity_downstream().
    """
    assert self.is_master()
    now = clock.get_time()

    at_root = self.server_level == 0
    if at_root:
        # Level 0 gets its capacity from the configuration.
        for res in self.state.all_resources():
            algo = AlgorithmImpl.create(res.template, self.server_level)
            res.ClearField('has')
            res.has.CopyFrom(algo.create_lease(res, res.template.capacity))

            # Even though configured capacity never expires, a refresh
            # interval (with relatively short leases) makes sure that
            # configuration changes are noticed.
            res.has.refresh_interval *= 2
        success = True
    else:
        # Every other level asks a downstream server.
        success = self._get_capacity_downstream()

    logger.info('%s resource state after getting capacity:' % self.server_id)
    for res in self.state.all_resources():
        logger.info(
            'resource: %s got: %lf lease: %d refresh: %d' %
            (res.resource_id, res.has.capacity, res.has.expiry_time - now,
             res.has.refresh_interval))

    return success
def set_election_victory_time(self):
    """Store the current clock time as the election victory time."""
    victory_time = clock.get_time()
    self.wrapped_state.election_victory_time = victory_time
def in_learning_mode(resource):
    """Tell whether `resource` is still in learning mode.

    Learning mode lasts up to and including the resource's
    learning_mode_expiry_time.

    Args:
      resource: Object with a learning_mode_expiry_time attribute.

    Returns:
      True when the expiry time has not yet passed.
    """
    expiry = resource.learning_mode_expiry_time
    return expiry >= clock.get_time()
def cleanup(self):
    """Drop resources, clients and servers whose lease has expired.

    Runs at most once per clock value. Resources in learning mode are kept
    untouched. A resource with an expired lease is dropped entirely. Every
    other resource is kept, minus its clients and servers with an expired
    lease.
    """
    now = clock.get_time()

    # Nothing to do when the previous cleanup happened at this same time.
    if self.last_cleanup_time == now:
        return
    self.last_cleanup_time = now

    logger.info('%s cleanup' % self.get_server_id())

    # Collects the resources that survive this cleanup.
    kept_resources = []
    for res in self.wrapped_state.resource:
        # A resource in learning mode might not have obtained a lease yet
        # and is exempt from cleaning.
        if in_learning_mode(res):
            logger.info('Not cleaning %s (in learning mode)' % res.resource_id)
            kept_resources.append(res)
            continue

        if lease_expired(res):
            logger.info('Removing resource %s (lease expired)' %
                        res.resource_id)
            continue

        # This resource survives; now prune what hangs off it.
        kept_resources.append(res)

        # Clients that still hold an unexpired lease.
        kept_clients = []
        for client in res.client:
            if lease_expired(client):
                logger.info(
                    'Removing client %s from resource %s (lease expired)' %
                    (client.client_id, res.resource_id))
            else:
                kept_clients.append(client)

        # Servers that still hold an unexpired lease.
        kept_servers = []
        for server in res.server:
            if lease_expired(server):
                logger.info(
                    'Removing server %s from resource %s (lease expired)' %
                    (server.server_id, res.resource_id))
            else:
                kept_servers.append(server)

        # Replaces the client and server lists with the pruned ones.
        res.ClearField('client')
        res.client.extend(kept_clients)
        res.ClearField('server')
        res.server.extend(kept_servers)

    # Replaces the resource list in the server state with the pruned one.
    self.wrapped_state.ClearField('resource')
    self.wrapped_state.resource.extend(kept_resources)
def add_relative(self, duration, target, arg=None):
    """Schedule `target` to run `duration` after the current clock time.

    Args:
      duration: Offset from the current clock time.
      target: Passed through to add_absolute().
      arg: Passed through to add_absolute().

    Returns:
      Whatever add_absolute() returns.
    """
    when = clock.get_time() + duration
    return self.add_absolute(when, target, arg)
def update_thread(self, thread, interval):
    """Record that `thread` is next due `interval` after the current time.

    Args:
      thread: Key under which the timestamp is stored in self.thread_map.
      interval: Offset from the current clock time.
    """
    due = clock.get_time() + interval
    self.thread_map[thread] = due
def lease_expired(thing):
    """Tell whether `thing` holds a lease that has expired.

    Args:
      thing: Object with a 'has' lease field, or a falsy value.

    Returns:
      `thing` itself when it is falsy, the (falsy) result of HasField when
      it holds no lease, and otherwise whether the lease's expiry time is at
      or before the current clock time.
    """
    if not thing:
        return thing

    # Something without a lease cannot have an expired lease.
    holds_lease = thing.HasField('has')
    if not holds_lease:
        return holds_lease

    return thing.has.expiry_time <= clock.get_time()
def gather_reporting_data(self, resource_id):
    """Collect reporting data for `resource_id` at the current clock time.

    Stores the data of every client and of the master of every server job
    in self.data[now]. Totals are accumulated in self.summaries[now]: one
    record under 'clients' and one record per server level under
    'level <n>'.

    Args:
      resource_id: Resource to collect the reporting data for.
    """
    logger.info('Gathering reporting data')
    now = clock.get_time()

    # Adds a record to the data set for this timestamp.
    records = dict()
    summaries = dict()
    self.data[now] = records
    self.summaries[now] = summaries

    # Adds a summary record for the clients.
    client_totals = ReportingData()
    client_totals.total_wants = 0
    client_totals.total_has = 0
    summaries['clients'] = client_totals
    self.all_summaries.add('clients')

    # Step 1: Gets the reporting data of every client in the system.
    for client in Client.all_clients():
        client_id = client.get_client_id()
        self.all_clients.add(client_id)
        data = client.get_reporting_data(resource_id)

        if not data:
            logger.warning('No reporting data received from %s' % client_id)
            continue

        records[client_id] = data
        logger.debug('%s: %s' % (client_id, str(data)))
        client_totals.total_wants += data.wants
        client_totals.total_has += data.has

    # Step 2: Gets the reporting data of the master of every server job.
    for job in ServerJob.all_server_jobs():
        master = job.get_master()

        # A job without a master has nothing to report.
        if not master:
            continue

        job_name = job.get_job_name()
        self.all_server_jobs.add(job_name)
        data = master.get_reporting_data(resource_id)

        if not data:
            logger.warning(
                'No reporting data received from %s' % master.get_server_id())
            continue

        records[job_name] = data
        logger.debug('%s: %s' % (job_name, str(data)))

        # Finds, or starts, the summary record for this server level.
        key = 'level %d' % master.get_server_level()
        self.all_summaries.add(key)

        if key in summaries:
            level_totals = summaries[key]
        else:
            level_totals = ReportingData()
            level_totals.total_wants = 0
            level_totals.total_has = 0
            level_totals.total_leases = 0
            level_totals.total_outstanding = 0
            summaries[key] = level_totals

        level_totals.total_wants += data.wants
        level_totals.total_has += data.has
        level_totals.total_leases += data.leases
        level_totals.total_outstanding += data.outstanding
def GetCapacity_RPC(self, request):
    """Handle a GetCapacity request from a client.

    Works in two passes over the resources in the request. The first pass
    records what the client reports in the server state and marks resources
    the client asked about too recently. The second pass hands out capacity
    for every resource that was not marked.

    Args:
      request: The request; carries a client_id and a list of per-resource
        requests.

    Returns:
      A GetCapacityResponse, or None when this server is not the master.
    """
    assert request.IsInitialized()
    assert self.state.is_initialized()

    # Only the master can handle this request. The client should do a new
    # Discovery.
    if not self.is_master():
        self.state.assert_clean()
        logger.info('%s getting a GetCapacity request when not master' %
                    self.server_id)
        Counter.get('server.GetCapacity_RPC.not_master').inc()
        return None

    timer = Gauge.get('server.GetCapacity_RPC.latency')
    timer.start_timer()
    logger.debug(request)
    now = clock.get_time()

    # Removes resources and clients with expired leases from the state.
    self.state.cleanup()

    # Resources that step 2 (the actual handing out of capacity) must skip.
    throttled = set()

    # Step 1: Updates the state with the information from the request.
    for req in request.resource:
        (resource, cr) = self.state.find_client_resource(
            request.client_id, req.resource_id)

        # Nothing to record for a resource that does not exist.
        if not resource:
            continue
        assert cr

        # Requests must be at least _kMinimumInterval seconds apart.
        too_soon = (cr.HasField('last_request_time') and
                    now - cr.last_request_time < _kMinimumInterval)
        if too_soon:
            logger.warning(
                '%s GetCapacity request for resource %s within the %d second '
                'threshold' %
                (self.server_id, req.resource_id, _kMinimumInterval))
            throttled.add(req.resource_id)
            continue

        # Copies the information from the request into the client state.
        cr.last_request_time = now
        cr.priority = req.priority
        cr.wants = req.wants
        if req.HasField('has'):
            cr.has.CopyFrom(req.has)
        else:
            cr.ClearField('has')

    # The response collects one entry per resource handled in step 2.
    response = GetCapacityResponse()

    # Step 2: Hands out capacity for each resource in the request.
    for req in request.resource:
        if req.resource_id in throttled:
            continue

        (resource, cr) = self.state.find_client_resource(
            request.client_id, req.resource_id)

        resp = response.response.add()
        resp.resource_id = req.resource_id

        if not resource:
            # An unknown resource: the client simply gets what it asks for.
            assert not cr
            logger.warning(
                '%s GetCapacity request for unmanaged resource %s' %
                (self.server_id, req.resource_id))
            resp.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
            resp.gets.capacity = req.wants
        else:
            # Passes on the safe capacity if one is configured.
            if resource.template.HasField('safe_capacity'):
                resp.safe_capacity = resource.template.safe_capacity

            algo = AlgorithmImpl.create(resource.template, self.server_level)

            if resource.learning_mode_expiry_time >= now:
                # In learning mode the client keeps whatever it has now,
                # under a default lease.
                has_now = cr.has.capacity if cr.HasField('has') else 0
                cr.has.CopyFrom(algo.create_lease(resource, has_now))
                Counter.get('server.learning_mode_response').inc()
            else:
                # Otherwise the algorithm runs and updates the client state.
                algo.run_client(resource, cr)
                Counter.get('server.algorithm_runs').inc()

            # Copies the resulting lease into the response.
            resp.gets.CopyFrom(cr.has)

        assert resp.IsInitialized()
        logger.info(
            '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d' %
            (self.server_id, request.client_id, req.resource_id, req.wants,
             resp.gets.capacity, resp.gets.expiry_time - now,
             resp.gets.refresh_interval))

    assert response.IsInitialized()
    timer.stop_timer()
    return response
def lease_expired(thing):
    """Check whether the lease held by `thing` has run out.

    Args:
      thing: Object with a 'has' lease field, or a falsy value.

    Returns:
      The falsy `thing` itself, or the falsy HasField result when there is
      no lease; otherwise True when the lease's expiry time is not later
      than the current clock time.
    """
    if not thing:
        return thing

    lease_present = thing.HasField('has')
    if lease_present:
        return thing.has.expiry_time <= clock.get_time()

    # Without a lease there is nothing that could have expired.
    return lease_present
def GetCapacity_RPC(self, request):
    """Serve a client's GetCapacity request.

    The first pass over the requested resources stores the client's
    reported state and notes the resources requested again within
    _kMinimumInterval seconds. The second pass produces a response entry
    with a lease for every resource that was not noted.

    Args:
      request: The request; provides client_id and the per-resource
        requests in its 'resource' list.

    Returns:
      The GetCapacityResponse, or None if this server is not the master.
    """
    assert request.IsInitialized()
    assert self.state.is_initialized()

    # A server that is not the master cannot handle this request; the
    # client should do a new Discovery.
    if not self.is_master():
        self.state.assert_clean()
        logger.info('%s getting a GetCapacity request when not master' %
                    self.server_id)
        Counter.get('server.GetCapacity_RPC.not_master').inc()
        return None

    timer = Gauge.get('server.GetCapacity_RPC.latency')
    timer.start_timer()
    logger.debug(request)
    now = clock.get_time()
    client_id = request.client_id

    # Cleans up the state: removes expired resources, clients and such.
    self.state.cleanup()

    # Ids of the resources that are left out of step 2.
    skipped_ids = set()

    # Step 1: Stores the information from the request in the state.
    for req in request.resource:
        (resource, cr) = self.state.find_client_resource(
            client_id, req.resource_id)

        # A resource that does not exist needs no bookkeeping.
        if not resource:
            continue
        assert cr

        # Was the previous request less than _kMinimumInterval seconds ago?
        if (cr.HasField('last_request_time') and
                now - cr.last_request_time < _kMinimumInterval):
            logger.warning(
                '%s GetCapacity request for resource %s within the %d second '
                'threshold' %
                (self.server_id, req.resource_id, _kMinimumInterval))
            skipped_ids.add(req.resource_id)
        else:
            cr.last_request_time = now
            cr.priority = req.priority
            cr.wants = req.wants
            if not req.HasField('has'):
                cr.ClearField('has')
            else:
                cr.has.CopyFrom(req.has)

    # The per-resource responses are inserted into this object.
    response = GetCapacityResponse()

    # Step 2: Hands out capacity, one resource request at a time.
    for req in request.resource:
        if req.resource_id in skipped_ids:
            continue

        (resource, cr) = self.state.find_client_resource(
            client_id, req.resource_id)

        resp = response.response.add()
        resp.resource_id = req.resource_id

        if resource:
            # Reports the safe capacity when the template configures one.
            template = resource.template
            if template.HasField('safe_capacity'):
                resp.safe_capacity = template.safe_capacity

            algo = AlgorithmImpl.create(resource.template, self.server_level)

            learning = resource.learning_mode_expiry_time >= now
            if not learning:
                # Runs the algorithm, which updates the client state.
                algo.run_client(resource, cr)
                Counter.get('server.algorithm_runs').inc()
            else:
                # Learning mode: return what the client has now, wrapped
                # in a default lease.
                if cr.HasField('has'):
                    has_now = cr.has.capacity
                else:
                    has_now = 0
                cr.has.CopyFrom(algo.create_lease(resource, has_now))
                Counter.get('server.learning_mode_response').inc()

            # The client state now holds the lease to respond with.
            resp.gets.CopyFrom(cr.has)
        else:
            # Unmanaged resource: give the client whatever it asks for.
            assert not cr
            logger.warning(
                '%s GetCapacity request for unmanaged resource %s' %
                (self.server_id, req.resource_id))
            resp.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
            resp.gets.capacity = req.wants

        assert resp.IsInitialized()
        logger.info(
            '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d' %
            (self.server_id, request.client_id, req.resource_id, req.wants,
             resp.gets.capacity, resp.gets.expiry_time - now,
             resp.gets.refresh_interval))

    assert response.IsInitialized()
    timer.stop_timer()
    return response