def execute(self, dummy):
    """Randomly perturbs the wants of this client's resource.

    The current wants is moved up or down by a random share of itself,
    scaled by self.fraction, and is never allowed to drop below zero. The
    new value is stored on the resource, logged, and published on the
    'client.<client_id>.wants' gauge. This object also hands itself back to
    the scheduler through add_relative with self.interval.

    Args:
      dummy: Not used by this method.
    """
    previous = self.resource.wants

    # random.random() lies in [0, 1), so the swing factor lies in (-1, 1]
    # before it is scaled by self.fraction.
    swing = self.fraction * (1 - 2 * random.random())
    updated = previous + swing * previous

    # The wants can never be negative.
    updated = 0 if updated < 0 else updated
    self.resource.wants = updated

    message = '%s changing wants from %lf to %lf' % (
        self.client_id, previous, updated)
    logger.debug(message)

    # Registers this object with the scheduler again before publishing the
    # new value.
    scheduler.add_relative(self.interval, self)
    gauge_name = 'client.%s.wants' % self.client_id
    Gauge.get(gauge_name).set(updated)
def Discovery_RPC(self, request):
    """Handles a Discovery RPC and builds the DiscoveryResponse for it.

    The response carries the server id of the current master (when the job
    knows one) and a safe capacity entry for every requested resource whose
    template has a safe capacity configured. The time spent in this method
    is recorded on the 'server.DiscoveryRPC.latency' gauge.

    Args:
      request: The discovery request; it must be initialized.

    Returns:
      The populated DiscoveryResponse.
    """
    assert request.IsInitialized()
    latency = Gauge.get('server.DiscoveryRPC.latency')
    latency.start_timer()
    logger.info('%s handling Discovery RPC from %s' %
                (self.server_id, request.client_id))
    reply = DiscoveryResponse()

    # Reports the master if there is one; otherwise the field stays unset
    # and the incomplete response is counted.
    current_master = self.job.get_master()
    if not current_master:
        Counter.get('server.incomplete_discovery_response').inc()
    else:
        reply.master_bns = current_master.get_server_id()

    # Adds a safe capacity entry for every requested resource that has one
    # configured in its template.
    for resource_id in request.resource_id:
        template = global_config.find_resource_template(resource_id)
        if not template or not template.HasField('safe_capacity'):
            continue
        entry = reply.safe_capacity.add()
        entry.resource_id = resource_id
        entry.safe_capacity = template.safe_capacity

    assert reply.IsInitialized()
    latency.stop_timer()
    return reply
def process_capacity_response(self, response):
    """Applies the leases in a capacity response to the local resources.

    For every entry in response.resource this method looks up the matching
    resource, reports a shortfall when the granted capacity is lower than
    the value returned by sum_leases() for that resource, copies the grant
    into resource.has, and schedules a call to self._maybe_lease_expired at
    the expiry time of the new lease.

    Args:
      response: A response whose `resource` entries each provide a
        resource_id and a `gets` lease (capacity and expiry_time are read).
    """
    for resp in response.resource:
        assert resp.gets.capacity >= 0
        resource = self.find_resource(resp.resource_id)
        n = sum_leases(resource)

        # We got less than what sum_leases() reports for this resource:
        # log it and export the (negative) difference.
        if resp.gets.capacity < n:
            logger.warning(
                '%s shortfall for %s: getting %lf, but has %lf outstanding leases'
                % (self.get_server_id(), resource.resource_id,
                   resp.gets.capacity, n))
            Counter.get('server_capacity_shortfall').inc()
            Gauge.get('server.%s.shortfall' % self.get_server_id()).set(
                resp.gets.capacity - n)

        resource.has.CopyFrom(resp.gets)

        # Schedules an action at the expiry time to clear out the lease.
        # The resource id is bound as a default argument so that each
        # callback keeps the id of *its own* resource; a plain closure over
        # the loop variable would make every callback see the last resource
        # of the loop by the time it runs. It stays a lambda (callable with
        # no arguments) so the scheduler can invoke it as before.
        scheduler.add_absolute(
            resource.has.expiry_time,
            lambda resource_id=resource.resource_id:
            self._maybe_lease_expired(resource_id))
def loop(self, duration):
    """Runs the scheduler for `duration` units of clock time.

    The clock is repeatedly advanced to the earlier of the next scheduled
    time (self._first_time()) and the end time. The actions scheduled for
    that time are executed and the threads in self.thread_map whose
    timestamp has been reached are continued. Once the end time is reached
    the finalizers are run. The elapsed time is measured with the
    'scheduler.latency' gauge and logged at the end.

    Args:
      duration: How long to run, added to the clock time at entry.
    """
    gauge = Gauge.get('scheduler.latency')
    gauge.start_timer()
    until = duration + clock.get_time()

    # Runs for the specified amount of time.
    while clock.get_time() < until:
        # Figures out when the first scheduled action is and advances the
        # clock until then (but never past the end time).
        t = min(self._first_time(), until)
        clock.set_time(t)

        # If there are scheduled items to be done right now, do them.
        if t in self.schedule:
            # Makes a copy of the list of actions that we need to execute
            # and clears out the schedule for this timestamp. This allows
            # actions to schedule something for the current time.
            actions = list(self.schedule[t])
            del self.schedule[t]

            # Executes all the actions we need to execute now. A task is a
            # sequence: a function is called without arguments, anything
            # else gets execute() called with the task's second element.
            for task in actions:
                target = task[0]
                if isinstance(target, types.LambdaType):
                    target()
                else:
                    target.execute(task[1])

        # And continue any threads which need to continue. We walk a
        # snapshot of the map so that update_thread() is free to modify
        # thread_map while we are iterating.
        for thread, timestamp in list(self.thread_map.items()):
            if timestamp <= t:
                self.update_thread(thread, thread.thread_continue())

    # Stop the timer.
    gauge.stop_timer()

    # Runs the finalizers: functions are called directly, other objects
    # get their finalize() method called.
    logger.info('Running finalizers')
    for target in self.finalizers:
        if isinstance(target, types.LambdaType):
            target()
        else:
            target.finalize()

    # Ends the simulation.
    logger.info('End of simulation. It took %lf seconds' % gauge.get_value())
def Discovery_RPC(self, request):
    """Answers a Discovery RPC from a client.

    Fills in the server id of the current master when self.job knows one
    (otherwise the 'server.incomplete_discovery_response' counter is
    incremented) and copies the configured safe capacity of every requested
    resource that has one. The call is timed on the
    'server.DiscoveryRPC.latency' gauge.

    Args:
      request: An initialized discovery request.

    Returns:
      The DiscoveryResponse that was built.
    """
    assert request.IsInitialized()
    rpc_timer = Gauge.get('server.DiscoveryRPC.latency')
    rpc_timer.start_timer()
    banner = '%s handling Discovery RPC from %s' % (
        self.server_id, request.client_id)
    logger.info(banner)
    result = DiscoveryResponse()

    # Sets the master_bns field only when a master is currently known.
    elected = self.job.get_master()
    if elected:
        result.master_bns = elected.get_server_id()
    else:
        # Nobody to point the client at; count the incomplete response.
        Counter.get('server.incomplete_discovery_response').inc()

    # Walks the requested resource ids and reports the safe capacity of
    # those that have a template with a safe capacity configured.
    for rid in request.resource_id:
        tmpl = global_config.find_resource_template(rid)
        if tmpl and tmpl.HasField('safe_capacity'):
            slot = result.safe_capacity.add()
            slot.resource_id = rid
            slot.safe_capacity = tmpl.safe_capacity

    assert result.IsInitialized()
    rpc_timer.stop_timer()
    return result
def GetCapacity_RPC(self, request):
    """Handles a GetCapacity RPC from a client.

    A server that is not the master returns None. Otherwise the request is
    processed in two passes over request.resource: the first pass records
    what the client reported in the server state (or marks the resource to
    be skipped when the previous request was less than _kMinimumInterval
    ago), the second pass produces a lease for every resource that was not
    skipped. The call is timed on the 'server.GetCapacity_RPC.latency'
    gauge.

    Args:
      request: An initialized GetCapacity request.

    Returns:
      A GetCapacityResponse, or None when this server is not the master.
    """
    assert request.IsInitialized()
    assert self.state.is_initialized()

    # Only the master can serve this request; the client is expected to do
    # a new Discovery.
    if not self.is_master():
        self.state.assert_clean()
        logger.info('%s getting a GetCapacity request when not master' %
                    self.server_id)
        Counter.get('server.GetCapacity_RPC.not_master').inc()
        return None

    latency = Gauge.get('server.GetCapacity_RPC.latency')
    latency.start_timer()
    logger.debug(request)
    now = clock.get_time()

    # Cleans up the state before looking anything up in it.
    self.state.cleanup()

    # Resource ids that must not get a response in the second pass.
    throttled = set()

    # Pass 1: fold the information from the request into the state.
    for req in request.resource:
        resource, cr = self.state.find_client_resource(
            request.client_id, req.resource_id)

        # Nothing to record for a resource we do not know.
        if not resource:
            continue
        assert cr

        # Requests that come in too quickly after the previous one are
        # not processed.
        too_soon = (cr.HasField('last_request_time') and
                    now - cr.last_request_time < _kMinimumInterval)
        if too_soon:
            logger.warning(
                '%s GetCapacity request for resource %s within the %d second '
                'threshold' %
                (self.server_id, req.resource_id, _kMinimumInterval))
            throttled.add(req.resource_id)
            continue

        # Takes over what the client reported.
        cr.last_request_time = now
        cr.priority = req.priority
        cr.wants = req.wants
        if req.HasField('has'):
            cr.has.CopyFrom(req.has)
        else:
            cr.ClearField('has')

    # The response that collects the per-resource answers.
    response = GetCapacityResponse()

    # Pass 2: hand out capacity for every resource in the request.
    for req in request.resource:
        if req.resource_id in throttled:
            continue

        resource, cr = self.state.find_client_resource(
            request.client_id, req.resource_id)

        resp = response.response.add()
        resp.resource_id = req.resource_id

        if resource:
            # Passes on the safe capacity if the template has one.
            if resource.template.HasField('safe_capacity'):
                resp.safe_capacity = resource.template.safe_capacity

            algo = AlgorithmImpl.create(resource.template, self.server_level)

            if resource.learning_mode_expiry_time >= now:
                # Learning mode: the client keeps what it has now (or
                # zero) under a lease created by the algorithm.
                has_now = cr.has.capacity if cr.HasField('has') else 0
                cr.has.CopyFrom(algo.create_lease(resource, has_now))
                Counter.get('server.learning_mode_response').inc()
            else:
                # Normal mode: the algorithm updates the client state.
                algo.run_client(resource, cr)
                Counter.get('server.algorithm_runs').inc()

            # Whatever ended up in the client state goes into the response.
            resp.gets.CopyFrom(cr.has)
        else:
            # Unmanaged resource: the client simply gets what it wants.
            assert not cr
            logger.warning(
                '%s GetCapacity request for unmanaged resource %s' %
                (self.server_id, req.resource_id))
            resp.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
            resp.gets.capacity = req.wants

        assert resp.IsInitialized()
        logger.info(
            '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d'
            % (self.server_id, request.client_id, req.resource_id, req.wants,
               resp.gets.capacity, resp.gets.expiry_time - now,
               resp.gets.refresh_interval))

    assert response.IsInitialized()
    latency.stop_timer()
    return response
def final_report(self):
    """Writes the final report as comma separated text.

    The report goes to the file named by self.filename, or to stdout when
    no filename is set. It consists of two header lines, one line for every
    timestamp in self.data (clients, then server jobs, then summaries),
    followed by a table of all counters and a table of all gauges.

    Clients, server jobs and summaries are always emitted in sorted order,
    in the headers as well as in the data lines, so that every column lines
    up with its header.
    """
    if self.filename:
        fout = open(self.filename, 'w')
    else:
        fout = sys.stdout

    try:
        # Prints the first header line.
        print >>fout, ',',
        for client in sorted(self.all_clients):
            print >>fout, '"%s"' % client, ',,',
        print >>fout, ',',
        for server in sorted(self.all_server_jobs):
            print >>fout, '"%s"' % server, ',,,,',
        print >>fout, ',',
        for s in sorted(self.all_summaries):
            print >>fout, '"%s"' % s,
            if s == 'clients':
                print >>fout, ',,',
            else:
                print >>fout, ',,,,',
        print >>fout

        # Prints the second header line.
        print >>fout, '"Time",',
        for client in sorted(self.all_clients):
            print >>fout, '"wants", "has",',
        print >>fout, ',',
        for server in sorted(self.all_server_jobs):
            print >>fout, '"wants", "has", "leases", "outstanding",',
        print >>fout, ',',
        # Sorted, like the first header line and the data lines below;
        # otherwise these column titles may not match the data under them.
        for s in sorted(self.all_summaries):
            if s == 'clients':
                print >>fout, '"total_wants", "total_has",',
            else:
                print >>fout, ('"total_wants", "total_has", "total_leases", '
                               '"total_outstanding",'),
        print >>fout

        # Goes through the data set in timestamp order.
        for timestamp in sorted(self.data.keys()):
            print >>fout, timestamp, ',',
            data = self.data[timestamp]

            # Prints the reporting data for every client and server that we
            # ever saw. If we have no data for a timestamp we print nothing.
            for client in sorted(self.all_clients):
                if client in data:
                    d = data[client]
                    print >>fout, d.wants, ',', d.has, ',',
                else:
                    print >>fout, ',,',
            print >>fout, ',',

            # Do the same for the servers.
            for server in sorted(self.all_server_jobs):
                if server in data:
                    d = data[server]
                    print >>fout, d.wants, ',', d.has, ',', d.leases, ',', d.outstanding, ',',
                else:
                    print >>fout, ',,,,',

            # Now for the summaries.
            print >>fout, ',',
            data = self.summaries[timestamp]
            for s in sorted(self.all_summaries):
                if s not in data:
                    if s == 'clients':
                        print >>fout, ',,',
                    else:
                        print >>fout, ',,,,',
                    continue
                d = data[s]
                if s == 'clients':
                    print >>fout, d.total_wants, ',', d.total_has, ',',
                else:
                    print >>fout, d.total_wants, ',', d.total_has, ',', d.total_leases, ',', d.total_outstanding, ',',
            print >>fout

        # Now we go and print the counters, sorted by name.
        print >>fout
        print >>fout, '"Name", "Value"'
        names = sorted(counter.get_name() for counter in Counter.all_counters())
        for name in names:
            counter = Counter.get(name)
            print >>fout, counter.get_name(), ',', counter.get_value()

        # And all the gauges, sorted by name.
        print >>fout
        print >>fout, '"Name", "N", "Min", "Average", "Max"'
        names = sorted(gauge.get_name() for gauge in Gauge.all_gauges())
        for name in names:
            gauge = Gauge.get(name)
            print >>fout, gauge.get_name(), ',', gauge.get_count(), ',', gauge.get_min_value(), ',', gauge.get_average(), ',', gauge.get_max_value()
    finally:
        # Closes the output file, also when writing the report failed.
        # stdout is never closed.
        if self.filename:
            fout.close()

    if self.filename:
        logger.info('Report written to %s' % self.filename)
def GetCapacity_RPC(self, request):
    """Serves a GetCapacity RPC.

    Returns None right away when this server is not the master. Otherwise
    the state is cleaned up, the client's reported priority/wants/has are
    stored for every known resource in the request (unless the previous
    request for that resource was less than _kMinimumInterval ago, in which
    case the resource is left out of the response), and a lease is returned
    for each remaining resource. Time spent is recorded on the
    'server.GetCapacity_RPC.latency' gauge.

    Args:
      request: An initialized GetCapacity request.

    Returns:
      A GetCapacityResponse, or None when this server is not the master.
    """
    assert request.IsInitialized()
    assert self.state.is_initialized()

    # A non-master cannot handle this request. The client should do a new
    # Discovery.
    if not self.is_master():
        self.state.assert_clean()
        not_master = '%s getting a GetCapacity request when not master' % (
            self.server_id)
        logger.info(not_master)
        Counter.get('server.GetCapacity_RPC.not_master').inc()
        return None

    rpc_timer = Gauge.get('server.GetCapacity_RPC.latency')
    rpc_timer.start_timer()
    logger.debug(request)
    now = clock.get_time()

    # Cleans up the state before it is consulted.
    self.state.cleanup()

    # Ids of the resources that step 2 (the actual handing out of capacity)
    # has to leave alone.
    skipped_ids = set()

    # Step 1: update the state with the information from the request.
    for item in request.resource:
        resource, client_state = self.state.find_client_resource(
            request.client_id, item.resource_id)

        # Unknown resources need no bookkeeping at this point.
        if not resource:
            continue
        assert client_state

        # Enforces the minimum interval between requests of a client for
        # the same resource.
        if (client_state.HasField('last_request_time') and
                now - client_state.last_request_time < _kMinimumInterval):
            logger.warning(
                '%s GetCapacity request for resource %s within the %d second '
                'threshold'
                % (self.server_id, item.resource_id, _kMinimumInterval))
            skipped_ids.add(item.resource_id)
            continue

        # Stores what the client told us.
        client_state.last_request_time = now
        client_state.priority = item.priority
        client_state.wants = item.wants
        if not item.HasField('has'):
            client_state.ClearField('has')
        else:
            client_state.has.CopyFrom(item.has)

    # Holds the answers for the resources contained in the request.
    response = GetCapacityResponse()

    # Step 2: hand out capacity, one response entry per resource.
    for item in request.resource:
        if item.resource_id in skipped_ids:
            continue

        resource, client_state = self.state.find_client_resource(
            request.client_id, item.resource_id)

        answer = response.response.add()
        answer.resource_id = item.resource_id

        if not resource:
            # An unknown resource: just give the client whatever it is
            # asking for, under the default lease time.
            assert not client_state
            logger.warning(
                '%s GetCapacity request for unmanaged resource %s'
                % (self.server_id, item.resource_id))
            answer.gets.expiry_time = (
                now + _kDefaultLeaseTimeForUnknownResources)
            answer.gets.capacity = item.wants
        else:
            template = resource.template

            # Reports the safe capacity when one is configured.
            if template.HasField('safe_capacity'):
                answer.safe_capacity = template.safe_capacity

            algorithm = AlgorithmImpl.create(template, self.server_level)

            in_learning_mode = resource.learning_mode_expiry_time >= now
            if in_learning_mode:
                # Returns whatever the client has now (zero if it did not
                # report anything) with a lease created by the algorithm.
                if client_state.HasField('has'):
                    current = client_state.has.capacity
                else:
                    current = 0
                lease = algorithm.create_lease(resource, current)
                client_state.has.CopyFrom(lease)
                Counter.get('server.learning_mode_response').inc()
            else:
                # Runs the algorithm, which updates the client state.
                algorithm.run_client(resource, client_state)
                Counter.get('server.algorithm_runs').inc()

            # The lease in the client state is what the client gets.
            answer.gets.CopyFrom(client_state.has)

        assert answer.IsInitialized()
        logger.info(
            '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d'
            % (self.server_id, request.client_id, item.resource_id,
               item.wants, answer.gets.capacity,
               answer.gets.expiry_time - now, answer.gets.refresh_interval))

    assert response.IsInitialized()
    rpc_timer.stop_timer()
    return response