Ejemplo n.º 1
0
        def mark_software_available(software, version):
            url = "{master_api}/agents/{agent}/software/".format(
                master_api=config.get("master_api"),
                agent=config.get("agent_id"))
            while True:
                try:
                    response = yield post_direct(url, data={
                            "software": software,
                            "version": version})
                except Exception as error:
                    delay = http_retry_delay()
                    logger.error(
                        "Failed to post availability of software %s, "
                        "version %s to master: %r. Will retry in %s "
                        "seconds.",
                        software, version, error, delay)
                    deferred = Deferred()
                    reactor.callLater(delay, deferred.callback, None)
                    yield deferred

                else:
                    data = yield treq.content(response)

                    if response.code == OK:
                        logger.info("Posted availability of software %s, "
                                    "version %s to master.",
                                    software, version)
                        break

                    elif response.code >= INTERNAL_SERVER_ERROR:
                        delay = http_retry_delay()
                        logger.warning(
                            "Could not post availability of software %s, "
                            "version %s. The master responded with "
                            "INTERNAL_SERVER_ERROR.  Retrying in %s "
                            "seconds.", software, version, delay)

                        deferred = Deferred()
                        reactor.callLater(delay, deferred.callback, None)
                        yield deferred

                    else:
                        logger.error(
                            "Failed to post availability of software %s, "
                            "version %s: "
                            "Unexpected status from server %s. Data: %s",
                            software, version, response.code, data)
                        break

            if self.testing:
                self.operation_deferred.callback(None)
Ejemplo n.º 2
0
    def post_agent_to_master(self):
        """
        Runs the POST request to contact the master.  Running this method
        multiple times should be considered safe but is generally something
        that should be avoided.
        """
        url = self.agents_endpoint()
        data = self.system_data()

        try:
            response = yield post_direct(url, data=data)
        except Exception as failure:
            delay = http_retry_delay()
            if isinstance(failure, ConnectionRefusedError):
                svclog.error(
                    "Failed to POST agent to master, the connection was "
                    "refused. Retrying in %s seconds", delay)
            else:  # pragma: no cover
                svclog.error(
                    "Unhandled error when trying to POST the agent to the "
                    "master. The error was %s.", failure)

            if not self.shutting_down:
                svclog.info(
                    "Retrying failed POST to master in %s seconds.", delay)
                yield deferLater(reactor, delay, self.post_agent_to_master)
            else:
                svclog.warning("Not retrying POST to master, shutting down.")

        else:
            # Master might be down or have some other internal problems
            # that might eventually be fixed.  Retry the request.
            if response.code >= INTERNAL_SERVER_ERROR:
                if not self.shutting_down:
                    delay = http_retry_delay()
                    svclog.warning(
                        "Failed to post to master due to a server side error "
                        "error %s, retrying in %s seconds",
                        response.code, delay)
                    yield deferLater(reactor, delay, self.post_agent_to_master)
                else:
                    svclog.warning(
                        "Failed to post to master due to a server side error "
                        "error %s. Not retrying, because the agent is "
                        "shutting down", response.code)

            # Master is up but is rejecting our request because there's
            # something wrong with it.  Do not retry the request.
            elif response.code >= BAD_REQUEST:
                text = yield response.text()
                svclog.error(
                    "%s accepted our POST request but responded with code %s "
                    "which is a client side error.  The message the server "
                    "responded with was %r.  Sorry, but we cannot retry this "
                    "request as it's an issue with the agent's request.",
                    url, response.code, text)

            else:
                data = yield treq.json_content(response)
                config["agent_id"] = data["id"]
                config.master_contacted()

                if response.code == OK:
                    svclog.info(
                        "POST to %s was successful. Agent %s was updated.",
                        url, config["agent_id"])

                elif response.code == CREATED:
                    svclog.info(
                        "POST to %s was successful.  A new agent "
                        "with an id of %s was created.",
                        url, config["agent_id"])

                returnValue(data)
Ejemplo n.º 3
0
    def post_shutdown_to_master(self):
        """
        This method is called before the reactor shuts down and lets the
        master know that the agent's state is now ``offline``
        """
        # We're under the assumption that something's wrong with
        # our code if we try to call this method before self.shutting_down
        # is set.
        assert self.shutting_down
        yield self.post_shutdown_lock.acquire()

        svclog.info("Informing master of shutdown")

        # Because post_shutdown_to_master is blocking and needs to
        # stop the reactor from finishing we perform the retry in-line
        data = None
        tries = 0
        num_retry_errors = 0
        response = None
        timed_out = False
        while True:
            tries += 1
            try:
                response = yield post_direct(
                    self.agent_api(),
                    data={
                        "state": AgentState.OFFLINE,
                        "free_ram": memory.free_ram(),
                        "current_assignments": config["current_assignments"]})

            except (ResponseNeverReceived, RequestTransmissionFailed) as error:
                num_retry_errors += 1
                if num_retry_errors > config["broken_connection_max_retry"]:
                    svclog.error(
                        "Failed to post shutdown to the master, "
                        "caught try-again errors %s times in a row.",
                        num_retry_errors)
                    break
                elif self.shutdown_timeout < datetime.utcnow():
                    svclog.error("While posting shutdown to master, caught "
                                 "%s. Shutdown timeout has been reached, not "
                                 "retrying.",
                                 error.__class__.__name__)
                    break
                else:
                    svclog.debug("While posting shutdown to master, caught "
                                 "%s. Retrying immediately.",
                                 error.__class__.__name__)
            # When we get a hard failure it could be an issue with the
            # server, although it's unlikely, so we retry.  Only retry
            # for a set period of time though since the shutdown as a timeout
            except Exception as failure:
                if self.shutdown_timeout > datetime.utcnow():
                    delay = http_retry_delay()
                    svclog.warning(
                        "State update failed due to unhandled error: %s.  "
                        "Retrying in %s seconds",
                        failure, delay)

                    # Wait for 'pause' to fire, introducing a delay
                    pause = Deferred()
                    reactor.callLater(delay, pause.callback, None)
                    yield pause

                else:
                    timed_out = True
                    svclog.warning(
                        "State update failed due to unhandled error: %s.  "
                        "Shutdown timeout reached, not retrying.",
                        failure)
                    break

            else:
                data = yield treq.json_content(response)
                if response.code == NOT_FOUND:
                    svclog.warning(
                        "Agent %r no longer exists, cannot update state.",
                        config["agent_id"])
                    break

                elif response.code == OK:
                    svclog.info(
                        "Agent %r has POSTed shutdown state change "
                        "successfully.",
                        config["agent_id"])
                    break

                elif response.code >= INTERNAL_SERVER_ERROR:
                    if self.shutdown_timeout > datetime.utcnow():
                        delay = http_retry_delay()
                        svclog.warning(
                            "State update failed due to server error: %s.  "
                            "Retrying in %s seconds.",
                            data, delay)

                        # Wait for 'pause' to fire, introducing a delay
                        pause = Deferred()
                        reactor.callLater(delay, pause.callback, None)
                        yield pause
                    else:
                        timed_out = True
                        svclog.warning(
                            "State update failed due to server error: %s.  "
                            "Shutdown timeout reached, not retrying.",
                            data)
                        break

        yield self.post_shutdown_lock.release()
        extra_data = {
            "response": response,
            "timed_out": timed_out,
            "tries": tries,
            "retry_errors": num_retry_errors
        }

        if isinstance(data, dict):
            data.update(extra_data)
        else:
            data = extra_data

        returnValue(data)
Ejemplo n.º 4
0
    def reannounce(self, force=False):
        """
        Method which is used to periodically contact the master.  This
        method is generally called as part of a scheduled task.
        """
        # Attempt to acquire the reannounce lock but fail after 70%
        # of the total time between reannouncements elapses.  This should
        # help prevent an accumulation of requests in the event the master
        # is having issues.
        try:
            yield self.reannounce_lock.acquire(
                config["agent_master_reannounce"] * .70
            )
        except utility.LockTimeoutError:
            svclog.debug("Timed out while waiting to acquire reannounce_lock")
            returnValue(None)

        if not self.should_reannounce() and not force:
            yield self.reannounce_lock.release()
            returnValue(None)

        svclog.debug("Announcing %s to master", config["agent_hostname"])
        data = None
        num_retry_errors = 0
        while True:  # for retries
            try:
                response = yield post_direct(
                    self.agent_api(),
                    data={
                        "state": config["state"],
                        "current_assignments": config.get(
                            "current_assignments", {} # may not be set yet
                        ),
                        "free_ram": memory.free_ram(),
                        "disks": disks.disks(as_dict=True)
                    }
                )

            except (ResponseNeverReceived, RequestTransmissionFailed) as error:
                num_retry_errors += 1
                if num_retry_errors > config["broken_connection_max_retry"]:
                    svclog.error(
                        "Failed to announce self to the master, "
                        "caught try-again type errors %s times in a row.",
                        num_retry_errors)
                    break
                else:
                    svclog.debug("While announcing self to master, caught "
                                 "%s. Retrying immediately.",
                                 error.__class__.__name__)
            except Exception as error:
                if force:
                    delay = http_retry_delay()
                    svclog.error(
                        "Failed to announce self to the master: %s.  Will "
                        "retry in %s seconds.", error, delay)
                    deferred = Deferred()
                    reactor.callLater(delay, deferred.callback, None)
                    yield deferred
                else:
                    # Don't retry because reannounce is called periodically
                    svclog.error(
                        "Failed to announce self to the master: %s.  This "
                        "request will not be retried.", error)
                    break

            else:
                data = yield treq.json_content(response)
                if response.code == OK:
                    config.master_contacted(announcement=True)
                    svclog.info("Announced self to the master server.")
                    break

                elif response.code >= INTERNAL_SERVER_ERROR:
                    if not self.shutting_down:
                        delay = http_retry_delay()
                        svclog.warning(
                            "Could not announce self to the master server, "
                            "internal server error: %s.  Retrying in %s "
                            "seconds.", data, delay)

                        deferred = Deferred()
                        reactor.callLater(delay, deferred.callback, None)
                        yield deferred
                    else:
                        svclog.warning(
                            "Could not announce to master. Not retrying "
                            "because of pending shutdown.")
                        break

                elif response.code == NOT_FOUND:
                    svclog.warning("The master says it does not know about our "
                                   "agent id. Posting as a new agent.")
                    yield self.post_agent_to_master()
                    break

                # If this is a client problem retrying the request
                # is unlikely to fix the issue so we stop here
                elif response.code >= BAD_REQUEST:
                    svclog.error(
                        "Failed to announce self to the master, bad "
                        "request: %s.  This request will not be retried.",
                        data)
                    break

                else:
                    svclog.error(
                        "Unhandled error when posting self to the "
                        "master: %s (code: %s).  This request will not be "
                        "retried.", data, response.code)
                    break

        yield self.reannounce_lock.release()
        returnValue(data)
Ejemplo n.º 5
0
 def post(self, url, **kwargs):
     return post_direct(self.get_url(url), **kwargs)