Esempio n. 1
0
    def thread_continue(self):
        """Run one scheduling step and return the delay until the next one.

        Returns:
          _kTheEndOfTime when this server is not the master,
          _kDefaultDiscoveryInterval when a required discovery failed,
          0 when getting capacity failed (the master is forgotten as well),
          and the result of _renew_capacity_interval() otherwise.
        """
        # Only the master of the job has work to do. Everybody else has its
        # next action at the end of time.
        # Note: When we become the master we will update this interval.
        if not self.is_master():
            Counter.get('server.halt_thread').inc()
            return _kTheEndOfTime

        # A server that is not the root and does not know its master has to
        # run a discovery first. A failed discovery is retried in the near
        # future.
        needs_discovery = self.server_level > 0 and not self.master

        if needs_discovery and not self._discover():
            return _kDefaultDiscoveryInterval

        # Either the master is known or we are the root server and do not
        # need one. On success the next step is due when the capacity
        # leases have to be refreshed.
        if self._get_capacity():
            return self._renew_capacity_interval()

        # Getting capacity failed: forget the master so that a discovery
        # gets rescheduled.
        Counter.get('server.reschedule_discovery').inc()
        self.master = None

        return 0
Esempio n. 2
0
  def _discover(self):
    """Ask a random task in the downstream job who the master is.

    The request lists every resource in the client state. The master named
    in the response (or None when the response names none) is stored in
    self.master, and every safe capacity in the response is copied into
    the matching resource of the client state.

    Returns:
      The value stored in self.master.
    """
    assert self.state.IsInitialized()

    request = DiscoveryRequest()
    request.client_id = self.state.client_id

    # Lists all the resources we know about so that the response can carry
    # the safe capacities for them.
    request.resource_id.extend(
        [res.resource_id for res in self.state.resource])

    # Any task in the server job can answer a discovery request.
    task = self.downstream_job.get_random_task()
    response = task.Discovery_RPC(request)

    # Without a master_bns field in the response there is no known master.
    if not response.HasField('master_bns'):
      self.master = None
      logger.warning('%s doesn\'t know who the master is.' %
                     self.state.client_id)
      Counter.get('client.discovery_failure').inc()
    else:
      self.master = self.downstream_job.get_task_by_name(response.master_bns)

    # Stores all the safe capacities from the response in the client state.
    for entry in response.safe_capacity:
      target = self._find_resource(entry.resource_id)
      target.safe_capacity = entry.safe_capacity

    return self.master
Esempio n. 3
0
    def Discovery_RPC(self, request):
        """Handle a Discovery RPC.

        Args:
          request: an initialized discovery request with a client_id and
            a list of resource ids.

        Returns:
          A DiscoveryResponse. Its master_bns field is set when the job
          reports a current master, and it holds one safe_capacity entry
          for every requested resource whose template configures a safe
          capacity.
        """
        assert request.IsInitialized()

        latency = Gauge.get('server.DiscoveryRPC.latency')
        latency.start_timer()
        logger.info('%s handling Discovery RPC from %s' %
                    (self.server_id, request.client_id))
        response = DiscoveryResponse()

        # Only a known master ends up in the response; an unknown master
        # is merely counted.
        current_master = self.job.get_master()

        if not current_master:
            Counter.get('server.incomplete_discovery_response').inc()
        else:
            response.master_bns = current_master.get_server_id()

        # Reports the safe capacity of every requested resource that has
        # one configured in its template.
        for resource_id in request.resource_id:
            template = global_config.find_resource_template(resource_id)

            if not template or not template.HasField('safe_capacity'):
                continue

            entry = response.safe_capacity.add()
            entry.resource_id = resource_id
            entry.safe_capacity = template.safe_capacity

        assert response.IsInitialized()

        latency.stop_timer()

        return response
Esempio n. 4
0
  def thread_continue(self):
    """Run one scheduling step and return the delay until the next one.

    Returns:
      _kTheEndOfTime when this server is not the master,
      _kDefaultDiscoveryInterval when a required discovery failed,
      0 when getting capacity failed (the master is forgotten as well),
      and the result of _renew_capacity_interval() otherwise.
    """
    # Only the master of the job has work to do. Everybody else has its
    # next action at the end of time.
    # Note: When we become the master we will update this interval.
    if not self.is_master():
      Counter.get('server.halt_thread').inc()
      return _kTheEndOfTime

    # A server that is not the root and does not know its master has to
    # run a discovery first. A failed discovery is retried in the near
    # future.
    needs_discovery = self.server_level > 0 and not self.master

    if needs_discovery and not self._discover():
      return _kDefaultDiscoveryInterval

    # Either the master is known or we are the root server and do not
    # need one. On success the next step is due when the capacity leases
    # have to be refreshed.
    if self._get_capacity():
      return self._renew_capacity_interval()

    # Getting capacity failed: forget the master so that a discovery gets
    # rescheduled.
    Counter.get('server.reschedule_discovery').inc()
    self.master = None

    return 0
Esempio n. 5
0
  def _maybe_lease_expired(self, resource_id):
    """Clear the capacity lease of a resource if that lease has expired.

    Args:
      resource_id: id of the resource to look up in the client state.
    """
    resource = self._find_resource(resource_id)

    # Nothing to do while the lease is still valid.
    if not lease_expired(resource):
      return

    resource.ClearField('has')
    logger.info(
        '%s lease on capacity for resource %s expired' %
        (self.get_client_id(), resource.resource_id))
    Counter.get('client.lease_expired').inc()
Esempio n. 6
0
  def _get_capacity(self):
    assert self.state.IsInitialized()
    assert self.master

    # If there are no resources in the state we're done.
    if len(self.state.resource) == 0:
      return

    # Creates the RPC request object.
    request = GetCapacityRequest()
    request.client_id = self.state.client_id

    # Goes through all the resources in the state and adds a subrequest
    # for that resource to the request.
    for res in self.state.resource:
      req = request.resource.add()
      req.resource_id = res.resource_id
      req.priority = res.priority
      req.wants = res.wants

      if res.HasField('has'):
        req.has.CopyFrom(res.has)

      # Calls the GetCapacity RPC in the master.
      response = self.master.GetCapacity_RPC(request)

      # If that failed we did not get new capacities. Blimey! Most probably
      # the server we talked to is no longer the master, so schedule a discovery
      # asap. When that discovery succeeds it will call us again.
      if not response:
        logger.error('%s GetCapacity request failed!' % self.get_client_id())
        Counter.get('client.GetCapacity_RPC.failure').inc()
        return False
      else:
        # Goes through the response and copies the capacity information back into
        # the client state.
        for r in response.response:
          assert r.gets.capacity >= 0

          resource = self._find_resource(r.resource_id)
          resource.has.CopyFrom(r.gets)

          # Schedules an action at the expiry time to clear out the lease.
          scheduler.add_absolute(
              resource.has.expiry_time,
              lambda: self._maybe_lease_expired(r.resource_id))

          if r.HasField('safe_capacity'):
            resource.safe_capacity = r.safe_capacity
          else:
            resource.ClearField('safe_capacity')

    return True
    def _maybe_lease_expired(self, resource_id):
        # If we are no longer the master this action does not need to'
        # be executed anymore.
        if not self.server.is_master():
            return

        resource = self.find_resource(resource_id)

        if lease_expired(resource):
            resource.ClearField('has')
            logger.info('%s lease on capacity for resource %s expired' %
                        (self.get_server_id(), resource.resource_id))
            Counter.get('server.lease_expired').inc()
  def _maybe_lease_expired(self, resource_id):
    # If we are no longer the master this action does not need to'
    # be executed anymore.
    if not self.server.is_master():
      return

    resource = self.find_resource(resource_id)

    if lease_expired(resource):
      resource.ClearField('has')
      logger.info(
          '%s lease on capacity for resource %s expired' %
          (self.get_server_id(), resource.resource_id))
      Counter.get('server.lease_expired').inc()
Esempio n. 9
0
def random_mishap():
  """Run one randomly selected entry of _mishap_map and reschedule itself.

  A number is drawn between 0 and the largest key of _mishap_map minus one.
  The map is then walked while a running total of the keys is kept; the
  first entry reached once that total is at least the drawn number gets
  its counter incremented and its value called.
  """
  # Re-arms itself first, with a relative delay of 60.
  scheduler.add_relative(60, lambda: random_mishap())

  upper = max(_mishap_map.keys())
  drawn = random.randint(0, upper - 1)
  running_total = 0

  for (key, action) in _mishap_map.iteritems():
    if running_total < drawn:
      running_total += key
      continue

    Counter.get('mishap.%d' % key).inc()
    action()
    return

  # NOTE(review): this looks reachable. The running total is tested before
  # the current key is added, so e.g. a single-entry map with a nonzero
  # draw falls through the loop. Confirm the intended selection scheme.
  assert False
Esempio n. 10
0
def random_mishap():
    """Run one randomly selected entry of _mishap_map and reschedule itself.

    A number is drawn between 0 and the largest key of _mishap_map minus
    one. The map is then walked while a running total of the keys is kept;
    the first entry reached once that total is at least the drawn number
    gets its counter incremented and its value called.
    """
    # Re-arms itself first, with a relative delay of 60.
    scheduler.add_relative(60, lambda: random_mishap())

    upper = max(_mishap_map.keys())
    drawn = random.randint(0, upper - 1)
    running_total = 0

    for (key, action) in _mishap_map.iteritems():
        if running_total < drawn:
            running_total += key
            continue

        Counter.get('mishap.%d' % key).inc()
        action()
        return

    # NOTE(review): this looks reachable. The running total is tested
    # before the current key is added, so e.g. a single-entry map with a
    # nonzero draw falls through the loop. Confirm the intended selection
    # scheme.
    assert False
Esempio n. 11
0
    def _renew_capacity_interval(self):
        """Return the delay after which capacity leases must be refreshed.

        This is the smallest refresh_interval among the resources in the
        server state that have a 'has' field. When there is no such value,
        or it is not positive, an error is logged and
        _kDefaultRefreshInterval is returned instead.
        """
        # Collects the refresh intervals of all resources holding a lease;
        # sys.maxint acts as the "nothing found" marker.
        intervals = [res.has.refresh_interval
                     for res in self.state.all_resources()
                     if res.HasField('has')]
        delay = min([sys.maxint] + intervals)

        # A sane delay is used as is.
        if not (delay <= 0 or delay == sys.maxint):
            return delay

        # Otherwise we have some error and fall back to a default delay.
        # This might for instance happen if all resources have lost their
        # (or never gotten any) leases.
        logger.error('%s improbable delay %d, set to %d instead' %
                     (self.server_id, delay, _kDefaultRefreshInterval))
        delay = _kDefaultRefreshInterval
        Counter.get('server.improbable.delay').inc()

        return delay
Esempio n. 12
0
  def _renew_capacity_interval(self):
    """Return the delay after which capacity leases must be refreshed.

    This is the smallest refresh_interval among the resources in the
    client state that have a 'has' field. When there is no such value, or
    it is not positive, an error is logged and _kDefaultRefreshInterval is
    returned instead.
    """
    # Collects the refresh intervals of all resources holding a lease;
    # sys.maxint acts as the "nothing found" marker.
    intervals = [res.has.refresh_interval
                 for res in self.state.resource
                 if res.HasField('has')]
    delay = min([sys.maxint] + intervals)

    # A sane delay is used as is.
    if not (delay <= 0 or delay == sys.maxint):
      return delay

    # Otherwise we have some error and fall back to a default delay.
    logger.error(
        '%s improbable delay %d, set to %d instead' %
        (self.state.client_id, delay, _kDefaultRefreshInterval))
    delay = _kDefaultRefreshInterval
    Counter.get('client.improbable.delay').inc()

    return delay
Esempio n. 13
0
  def _discover(self):
    """Ask a random task in the downstream job who the master is.

    Must only be called on a server with server_level > 0. The master
    named in the response (or None when the response names none) is stored
    in self.master.

    Returns:
      The value stored in self.master.
    """
    assert self.server_level > 0

    request = DiscoveryRequest()
    request.client_id = self.server_id

    # Any task in the server job can answer a discovery request.
    task = self.downstream_job.get_random_task()
    response = task.Discovery_RPC(request)

    # Without a master_bns field in the response there is no known master.
    if not response.HasField('master_bns'):
      self.master = None
      logger.warning('%s doesn\'t know who the master is.' % self.server_id)
      Counter.get('server.discovery_failure').inc()
    else:
      self.master = self.downstream_job.get_task_by_name(response.master_bns)

    return self.master
Esempio n. 14
0
  def _renew_capacity_interval(self):
    """Return the delay after which capacity leases must be refreshed.

    This is the smallest refresh_interval among the resources in the
    server state that have a 'has' field. When there is no such value, or
    it is not positive, an error is logged and _kDefaultRefreshInterval is
    returned instead.
    """
    # Collects the refresh intervals of all resources holding a lease;
    # sys.maxint acts as the "nothing found" marker.
    intervals = [res.has.refresh_interval
                 for res in self.state.all_resources()
                 if res.HasField('has')]
    delay = min([sys.maxint] + intervals)

    # A sane delay is used as is.
    if not (delay <= 0 or delay == sys.maxint):
      return delay

    # Otherwise we have some error and fall back to a default delay. This
    # might for instance happen if all resources have lost their (or never
    # gotten any) leases.
    logger.error(
        '%s improbable delay %d, set to %d instead' %
        (self.server_id, delay, _kDefaultRefreshInterval))
    delay = _kDefaultRefreshInterval
    Counter.get('server.improbable.delay').inc()

    return delay
Esempio n. 15
0
    def _discover(self):
        """Ask a random task in the downstream job who the master is.

        Must only be called on a server with server_level > 0. The master
        named in the response (or None when the response names none) is
        stored in self.master.

        Returns:
          The value stored in self.master.
        """
        assert self.server_level > 0

        request = DiscoveryRequest()
        request.client_id = self.server_id

        # Any task in the server job can answer a discovery request.
        task = self.downstream_job.get_random_task()
        response = task.Discovery_RPC(request)

        # Without a master_bns field in the response there is no known
        # master.
        if not response.HasField('master_bns'):
            self.master = None
            logger.warning('%s doesn\'t know who the master is.' %
                           self.server_id)
            Counter.get('server.discovery_failure').inc()
        else:
            self.master = self.downstream_job.get_task_by_name(
                response.master_bns)

        return self.master
    def process_capacity_response(self, response):
        """Copy the capacity granted in a response into the server state.

        For every entry in response.resource the matching resource is
        looked up, a shortfall is logged and recorded when the granted
        capacity is below the sum of its leases (as computed by
        sum_leases), the grant is copied into resource.has, and an action
        is scheduled at the expiry time of the grant to clear it out.

        Args:
          response: a capacity response whose `resource` entries carry a
            resource_id and a `gets` lease.
        """
        for resp in response.resource:
            assert resp.gets.capacity >= 0

            resource = self.find_resource(resp.resource_id)
            outstanding = sum_leases(resource)

            if resp.gets.capacity < outstanding:
                logger.warning(
                    '%s shortfall for %s: getting %lf, but has %lf outstanding leases'
                    % (self.get_server_id(), resource.resource_id,
                       resp.gets.capacity, outstanding))
                Counter.get('server_capacity_shortfall').inc()
                Gauge.get('server.%s.shortfall' %
                          self.get_server_id()).set(
                              resp.gets.capacity - outstanding)

            resource.has.CopyFrom(resp.gets)

            # Schedules an action at the expiry time to clear out the
            # lease. The resource id is bound as a default argument: a
            # plain closure over `resource` would be evaluated when the
            # scheduler runs the action, by which time the loop variable
            # refers to the last resource of the response.
            scheduler.add_absolute(
                resource.has.expiry_time,
                lambda resource_id=resource.resource_id:
                self._maybe_lease_expired(resource_id))
Esempio n. 17
0
  def process_capacity_response(self, response):
    """Copy the capacity granted in a response into the server state.

    For every entry in response.resource the matching resource is looked
    up, a shortfall is logged and recorded when the granted capacity is
    below the sum of its leases (as computed by sum_leases), the grant is
    copied into resource.has, and an action is scheduled at the expiry
    time of the grant to clear it out.

    Args:
      response: a capacity response whose `resource` entries carry a
        resource_id and a `gets` lease.
    """
    for resp in response.resource:
      assert resp.gets.capacity >= 0

      resource = self.find_resource(resp.resource_id)
      outstanding = sum_leases(resource)

      if resp.gets.capacity < outstanding:
        logger.warning(
            '%s shortfall for %s: getting %lf, but has %lf outstanding leases' %
            (self.get_server_id(), resource.resource_id,
             resp.gets.capacity, outstanding))
        Counter.get('server_capacity_shortfall').inc()
        Gauge.get('server.%s.shortfall' %
                  self.get_server_id()).set(resp.gets.capacity - outstanding)

      resource.has.CopyFrom(resp.gets)

      # Schedules an action at the expiry time to clear out the lease.
      # The resource id is bound as a default argument: a plain closure
      # over `resource` would be evaluated when the scheduler runs the
      # action, by which time the loop variable refers to the last
      # resource of the response.
      scheduler.add_absolute(
          resource.has.expiry_time,
          lambda resource_id=resource.resource_id:
          self._maybe_lease_expired(resource_id))
Esempio n. 18
0
  def Discovery_RPC(self, request):
    """Handle a Discovery RPC.

    Args:
      request: an initialized discovery request with a client_id and a
        list of resource ids.

    Returns:
      A DiscoveryResponse. Its master_bns field is set when the job
      reports a current master, and it holds one safe_capacity entry for
      every requested resource whose template configures a safe capacity.
    """
    assert request.IsInitialized()

    latency = Gauge.get('server.DiscoveryRPC.latency')
    latency.start_timer()
    logger.info(
        '%s handling Discovery RPC from %s' %
        (self.server_id, request.client_id))
    response = DiscoveryResponse()

    # Only a known master ends up in the response; an unknown master is
    # merely counted.
    current_master = self.job.get_master()

    if not current_master:
      Counter.get('server.incomplete_discovery_response').inc()
    else:
      response.master_bns = current_master.get_server_id()

    # Reports the safe capacity of every requested resource that has one
    # configured in its template.
    for resource_id in request.resource_id:
      template = global_config.find_resource_template(resource_id)

      if not template or not template.HasField('safe_capacity'):
        continue

      entry = response.safe_capacity.add()
      entry.resource_id = resource_id
      entry.safe_capacity = template.safe_capacity

    assert response.IsInitialized()

    latency.stop_timer()

    return response
Esempio n. 19
0
    def GetCapacity_RPC(self, request):
        """Handle a GetCapacity RPC from a client.

        The request is processed in two passes over request.resource: the
        first pass records what the client reports (priority, wants, has)
        in the server state, the second pass hands out capacity and builds
        the response.

        Args:
          request: an initialized request carrying a client_id and one
            sub-request per resource.

        Returns:
          None when this server is not the master. Otherwise a
          GetCapacityResponse with one entry per requested resource,
          except for resources this client asked about less than
          _kMinimumInterval ago, which get no entry.
        """
        assert request.IsInitialized()
        assert self.state.is_initialized()

        # If this server is not the master it cannot handle this request.
        # The client should do a new Discovery.
        if not self.is_master():
            self.state.assert_clean()
            logger.info('%s getting a GetCapacity request when not master' %
                        self.server_id)
            Counter.get('server.GetCapacity_RPC.not_master').inc()

            return None

        # The latency timer only covers requests handled as master.
        timer = Gauge.get('server.GetCapacity_RPC.latency')
        timer.start_timer()
        logger.debug(request)
        now = clock.get_time()

        # Cleanup the state. This removes resources and clients with expired
        # leases and such.
        self.state.cleanup()

        # A set of resources that we need to skip in step 2 (the actual
        # handing out of capacity).
        resources_to_skip = set()

        # First step: Go through the request and update the state with the
        # information from the request.
        for req in request.resource:
            # Finds the resource and the client state for this resource.
            (resource,
             cr) = self.state.find_client_resource(request.client_id,
                                                   req.resource_id)

            # If this resource does not exist we don't need to do anything
            # right now.
            if resource:
                assert cr

                # Checks whether the last request from this client was at least
                # _kMinimumInterval seconds ago.
                if cr.HasField(
                        'last_request_time'
                ) and now - cr.last_request_time < _kMinimumInterval:
                    logger.warning(
                        '%s GetCapacity request for resource %s within the %d second '
                        'threshold' %
                        (self.server_id, req.resource_id, _kMinimumInterval))
                    # A throttled request leaves the stored client state
                    # untouched and gets no entry in the response.
                    resources_to_skip.add(req.resource_id)
                else:
                    # Updates the state with the information in the request.
                    cr.last_request_time = now
                    cr.priority = req.priority
                    cr.wants = req.wants

                    if req.HasField('has'):
                        cr.has.CopyFrom(req.has)
                    else:
                        cr.ClearField('has')

        # Creates a new response object in which we will insert the responses for
        # the resources contained in the request.
        response = GetCapacityResponse()

        # Step 2: Loop through all the individual resource requests in the request
        # and hand out capacity.
        for req in request.resource:
            # If this is a resource we need to skip, let's skip it.
            if req.resource_id in resources_to_skip:
                continue

            # Finds the resource and the client state for this resource.
            (resource,
             cr) = (self.state.find_client_resource(request.client_id,
                                                    req.resource_id))

            # Adds a response proto to the overall response.
            resp = response.response.add()
            resp.resource_id = req.resource_id

            # If this is an unknown resource just give the client whatever it
            # is asking for.
            if not resource:
                assert not cr

                logger.warning(
                    '%s GetCapacity request for unmanaged resource %s' %
                    (self.server_id, req.resource_id))
                # NOTE(review): only expiry_time and capacity are set on this
                # path; refresh_interval keeps its default although it is
                # logged below. Confirm that this is intended.
                resp.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
                resp.gets.capacity = req.wants
            else:
                # Sets the safe capacity in the response if there is one
                # configured for this resource.
                if resource.template.HasField('safe_capacity'):
                    resp.safe_capacity = resource.template.safe_capacity

                # Finds the algorithm implementation object for this resource.
                algo = AlgorithmImpl.create(resource.template,
                                            self.server_level)

                # If the resource is in learning mode we just return whatever the client
                # has now and create a default lease.
                if resource.learning_mode_expiry_time >= now:
                    if cr.HasField('has'):
                        has_now = cr.has.capacity
                    else:
                        has_now = 0

                    cr.has.CopyFrom(algo.create_lease(resource, has_now))
                    Counter.get('server.learning_mode_response').inc()
                else:
                    # Otherwise we just run the algorithm. This will update the
                    # client state object.
                    algo.run_client(resource, cr)
                    Counter.get('server.algorithm_runs').inc()

                # Copies the output from the algorithm run into the response.
                resp.gets.CopyFrom(cr.has)

            assert resp.IsInitialized()
            logger.info(
                '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d'
                % (self.server_id, request.client_id, req.resource_id,
                   req.wants, resp.gets.capacity, resp.gets.expiry_time - now,
                   resp.gets.refresh_interval))

        assert response.IsInitialized()

        timer.stop_timer()

        return response
Esempio n. 20
0
  def final_report(self):
    """Write the collected data, counters and gauges as comma separated text.

    The report goes to the file named by self.filename when that is set
    and to stdout otherwise. It consists of two header lines, one row per
    timestamp in self.data (clients, then server jobs, then summaries,
    each in sorted order), a table of all counters and a table of all
    gauges. A file opened here is closed again even when writing fails.
    """
    to_file = bool(self.filename)

    if to_file:
      fout = open(self.filename, 'w')
    else:
      fout = sys.stdout

    try:
      # Every section below must use the same column order, so the
      # orderings are computed once.
      clients = sorted(self.all_clients)
      server_jobs = sorted(self.all_server_jobs)
      summaries = sorted(self.all_summaries)

      # Prints the first header line.
      print >>fout, ',',

      for client in clients:
        print >>fout, '"%s"' % client, ',,',

      print >>fout, ',',

      for server in server_jobs:
        print >>fout, '"%s"' % server, ',,,,',

      print >>fout, ',',

      for s in summaries:
        print >>fout, '"%s"' % s,

        if s == 'clients':
          print >>fout, ',,',
        else:
          print >>fout, ',,,,',

      print >>fout

      # Prints the second header line.
      print >>fout, '"Time",',

      for client in clients:
        print >>fout, '"wants", "has",',

      print >>fout, ',',

      for server in server_jobs:
        print >>fout, '"wants", "has", "leases", "outstanding",',

      print >>fout, ',',

      # The summaries are walked in sorted order here as well. The
      # 'clients' summary is two columns wide and the others four, so an
      # order differing from the first header line and the data rows
      # would misalign the columns.
      for s in summaries:
        if s == 'clients':
          print >>fout, '"total_wants", "total_has",',
        else:
          print >>fout, ('"total_wants", "total_has", "total_leases", '
                         '"total_outstanding",'),

      print >>fout

      # Goes through the data set in timestamp order.
      for timestamp in sorted(self.data.keys()):
        print >>fout, timestamp, ',',
        data = self.data[timestamp]

        # Prints the reporting data for every client and server that we
        # ever saw. If we have no data for a timestamp we print nothing.
        for client in clients:
          if client in data:
            d = data[client]
            print >>fout, d.wants, ',', d.has, ',',
          else:
            print >>fout, ',,',

        print >>fout, ',',

        # Do the same for the servers.
        for server in server_jobs:
          if server in data:
            d = data[server]
            print >>fout, d.wants, ',', d.has, ',', d.leases, ',', \
                d.outstanding, ',',
          else:
            print >>fout, ',,,,',

        # Now for the summaries.
        print >>fout, ',',

        data = self.summaries[timestamp]

        for s in summaries:
          if s not in data:
            if s == 'clients':
              print >>fout, ',,',
            else:
              print >>fout, ',,,,',

            continue

          d = data[s]

          if s == 'clients':
            print >>fout, d.total_wants, ',', d.total_has, ',',
          else:
            print >>fout, d.total_wants, ',', d.total_has, ',', \
                d.total_leases, ',', d.total_outstanding, ',',

        print >>fout

      # Now we go and print the counters, sorted by name.
      print >>fout
      print >>fout, '"Name", "Value"'
      names = sorted([counter.get_name() for counter in Counter.all_counters()])

      for name in names:
        counter = Counter.get(name)
        print >>fout, counter.get_name(), ',', counter.get_value()

      # And all the gauges, sorted by name.
      print >>fout
      print >>fout, '"Name", "N", "Min", "Average", "Max"'
      names = sorted([gauge.get_name() for gauge in Gauge.all_gauges()])

      for name in names:
        gauge = Gauge.get(name)
        print >>fout, gauge.get_name(), ',', gauge.get_count(), ',', \
            gauge.get_min_value(), ',', gauge.get_average(), ',', \
            gauge.get_max_value()
    finally:
      # Closes the output file, but never stdout.
      if to_file:
        fout.close()

    if to_file:
      logger.info('Report written to %s' % self.filename)
Esempio n. 21
0
  def GetCapacity_RPC(self, request):
    """Handle a GetCapacity RPC from a client.

    The request is processed in two passes over request.resource: the
    first pass records what the client reports (priority, wants, has) in
    the server state, the second pass hands out capacity and builds the
    response.

    Args:
      request: an initialized request carrying a client_id and one
        sub-request per resource.

    Returns:
      None when this server is not the master. Otherwise a
      GetCapacityResponse with one entry per requested resource, except
      for resources this client asked about less than _kMinimumInterval
      ago, which get no entry.
    """
    assert request.IsInitialized()
    assert self.state.is_initialized()

    # If this server is not the master it cannot handle this request.
    # The client should do a new Discovery.
    if not self.is_master():
      self.state.assert_clean()
      logger.info('%s getting a GetCapacity request when not master' %
                  self.server_id)
      Counter.get('server.GetCapacity_RPC.not_master').inc()

      return None

    # The latency timer only covers requests handled as master.
    timer = Gauge.get('server.GetCapacity_RPC.latency')
    timer.start_timer()
    logger.debug(request)
    now = clock.get_time()

    # Cleanup the state. This removes resources and clients with expired
    # leases and such.
    self.state.cleanup()

    # A set of resources that we need to skip in step 2 (the actual
    # handing out of capacity).
    resources_to_skip = set()

    # First step: Go through the request and update the state with the
    # information from the request.
    for req in request.resource:
      # Finds the resource and the client state for this resource.
      (resource, cr) = self.state.find_client_resource(
          request.client_id,
          req.resource_id)

      # If this resource does not exist we don't need to do anything
      # right now.
      if resource:
        assert cr

        # Checks whether the last request from this client was at least
        # _kMinimumInterval seconds ago.
        if cr.HasField('last_request_time') and now - cr.last_request_time < _kMinimumInterval:
          logger.warning(
              '%s GetCapacity request for resource %s within the %d second '
              'threshold' %
              (self.server_id, req.resource_id, _kMinimumInterval))
          # A throttled request leaves the stored client state untouched
          # and gets no entry in the response.
          resources_to_skip.add(req.resource_id)
        else:
          # Updates the state with the information in the request.
          cr.last_request_time = now
          cr.priority = req.priority
          cr.wants = req.wants

          if req.HasField('has'):
            cr.has.CopyFrom(req.has)
          else:
            cr.ClearField('has')

    # Creates a new response object in which we will insert the responses for
    # the resources contained in the request.
    response = GetCapacityResponse()

    # Step 2: Loop through all the individual resource requests in the request
    # and hand out capacity.
    for req in request.resource:
      # If this is a resource we need to skip, let's skip it.
      if req.resource_id in resources_to_skip:
        continue

      # Finds the resource and the client state for this resource.
      (resource, cr) = (
          self.state.find_client_resource(
              request.client_id,
              req.resource_id))

      # Adds a response proto to the overall response.
      resp = response.response.add()
      resp.resource_id = req.resource_id

      # If this is an unknown resource just give the client whatever it
      # is asking for.
      if not resource:
        assert not cr

        logger.warning(
            '%s GetCapacity request for unmanaged resource %s' %
            (self.server_id, req.resource_id))
        # NOTE(review): only expiry_time and capacity are set on this path;
        # refresh_interval keeps its default although it is logged below.
        # Confirm that this is intended.
        resp.gets.expiry_time = now + _kDefaultLeaseTimeForUnknownResources
        resp.gets.capacity = req.wants
      else:
        # Sets the safe capacity in the response if there is one
        # configured for this resource.
        if resource.template.HasField('safe_capacity'):
          resp.safe_capacity = resource.template.safe_capacity

        # Finds the algorithm implementation object for this resource.
        algo = AlgorithmImpl.create(resource.template, self.server_level)

        # If the resource is in learning mode we just return whatever the client
        # has now and create a default lease.
        if resource.learning_mode_expiry_time >= now:
          if cr.HasField('has'):
            has_now = cr.has.capacity
          else:
            has_now = 0

          cr.has.CopyFrom(algo.create_lease(resource, has_now))
          Counter.get('server.learning_mode_response').inc()
        else:
          # Otherwise we just run the algorithm. This will update the
          # client state object.
          algo.run_client(resource, cr)
          Counter.get('server.algorithm_runs').inc()

        # Copies the output from the algorithm run into the response.
        resp.gets.CopyFrom(cr.has)

      assert resp.IsInitialized()
      logger.info(
          '%s for %s resource: %s wants: %lf gets: %lf lease: %d refresh: %d' %
          (self.server_id, request.client_id, req.resource_id, req.wants,
           resp.gets.capacity, resp.gets.expiry_time - now,
           resp.gets.refresh_interval))

    assert response.IsInitialized()

    timer.stop_timer()

    return response