def test_config_defaults(self):
     config["agent_http_retry_delay_factor"] = 0
     config["agent_http_retry_delay_offset"] = 0
     self.assertEqual(http_retry_delay(), 0)
     config["agent_http_retry_delay_factor"] = 0
     config["agent_http_retry_delay_offset"] = 1
     self.assertEqual(http_retry_delay(), 1)
Exemple #2
0
        def mark_software_available(software, version):
            url = "{master_api}/agents/{agent}/software/".format(
                master_api=config.get("master_api"),
                agent=config.get("agent_id"))
            while True:
                try:
                    response = yield post_direct(url, data={
                            "software": software,
                            "version": version})
                except Exception as error:
                    delay = http_retry_delay()
                    logger.error(
                        "Failed to post availability of software %s, "
                        "version %s to master: %r. Will retry in %s "
                        "seconds.",
                        software, version, error, delay)
                    deferred = Deferred()
                    reactor.callLater(delay, deferred.callback, None)
                    yield deferred

                else:
                    data = yield treq.content(response)

                    if response.code == OK:
                        logger.info("Posted availability of software %s, "
                                    "version %s to master.",
                                    software, version)
                        break

                    elif response.code >= INTERNAL_SERVER_ERROR:
                        delay = http_retry_delay()
                        logger.warning(
                            "Could not post availability of software %s, "
                            "version %s. The master responded with "
                            "INTERNAL_SERVER_ERROR.  Retrying in %s "
                            "seconds.", software, version, delay)

                        deferred = Deferred()
                        reactor.callLater(delay, deferred.callback, None)
                        yield deferred

                    else:
                        logger.error(
                            "Failed to post availability of software %s, "
                            "version %s: "
                            "Unexpected status from server %s. Data: %s",
                            software, version, response.code, data)
                        break

            if self.testing:
                self.operation_deferred.callback(None)
Exemple #3
0
        def mark_software_not_available(software, version):
            url = ("{master_api}/agents/{agent}/software/{software}/"
                    "versions/{version}").format(
                        master_api=config.get("master_api"),
                        agent=config.get("agent_id"),
                        software=software,
                        version=version)
            while True:
                try:
                    response = yield delete_direct(url)
                except Exception as error:
                    delay = http_retry_delay()
                    logger.error(
                        "Failed to remove software %s, version %s from this "
                        "agent on master: %r. Will retry in %s seconds.",
                        software, version, error, delay)
                    deferred = Deferred()
                    reactor.callLater(delay, deferred.callback, None)
                    yield deferred

                else:
                    data = yield treq.content(response)

                    if response.code in [OK, ACCEPTED, NO_CONTENT]:
                        logger.info("Removed software %s, version %s from this "
                                    "agent on master.", software, version)
                        break

                    elif response.code >= INTERNAL_SERVER_ERROR:
                        delay = http_retry_delay()
                        logger.warning(
                            "Could not remove software %s, version %s from "
                            "this agent. The master responded with "
                            "INTERNAL_SERVER_ERROR.  Retrying in %s "
                            "seconds.", software, version, delay)

                        deferred = Deferred()
                        reactor.callLater(delay, deferred.callback, None)
                        yield deferred

                    else:
                        logger.error(
                            "Failed to remove software %s, version %s from "
                            "this agent: "
                            "Unexpected status from server %s. Data: %s",
                            software, version, response.code, data)
                        break

            if self.testing:
                self.operation_deferred.callback(None)
Exemple #4
0
 def update_failed(arg):
     logger.error("Downloading update failed: %r", arg)
     if isinstance(arg, twisted.python.failure.Failure):
         logger.error("Traceback: %s", arg.getTraceback())
     reactor.callLater(http_retry_delay(),
                       download_update,
                       data["version"])
Exemple #5
0
        def result_callback(url, log_identifier, response):
            if 500 <= response.code < 600:
                delay = http_retry_delay()
                logger.error(
                    "Server side error while uploading log file %s, " "status code: %s. Retrying. in %s seconds",
                    log_identifier,
                    response.code,
                    delay,
                )
                upload(url, log_identifier, delay=delay)

            elif response.code not in [OK, CREATED, ACCEPTED]:
                # Nothing else we could do about that, this is
                # a problem on our end.
                logger.error(
                    "Could not upload logfile %s status code: %s. " "This is a client side error, giving up.",
                    log_identifier,
                    response.code,
                )
                try:
                    upload_deferred.errback(ValueError("Bad return code on uploading logfile: %s" % response.code))
                except Exception as e:
                    logger.error("Caught exception calling upload_deferred.errback: %s", e)

            else:
                logger.info("Uploaded logfile %s for to master", log_identifier)
                try:
                    upload_deferred.callback(None)
                except Exception as e:
                    logger.error("Caught exception calling upload_deferred.callback: %s", e)
Exemple #6
0
 def error_callback(task, log_path, deferred, num_retry_errors, failure_reason):
     if num_retry_errors > config["broken_connection_max_retry"]:
         logger.error(
             "Error while registering logfile %s for task %s on master. "
             "Maximum number of retries reached. Not retrying the "
             "request.",
             log_path,
             task["id"],
         )
         deferred.errback(None)
     else:
         if failure_reason.type in (ResponseNeverReceived, RequestTransmissionFailed):
             logger.debug(
                 "Error while registering logfile %s for task %s on " "master: %s, retrying immediately",
                 log_path,
                 task["id"],
                 failure_reason.type.__name__,
             )
             post_logfile(task, log_path, post_deferred=deferred)
         else:
             delay = http_retry_delay()
             logger.error(
                 "Error while registering logfile %s for task %s on " "master: %r, retrying in %s seconds.",
                 log_path,
                 task["id"],
                 failure_reason,
                 delay,
             )
             post_logfile(task, log_path, post_deferred=deferred, delay=delay)
Exemple #7
0
            def result_callback(cburl, cbdata, task, response):
                if 500 <= response.code < 600:
                    logger.error(
                        "Error while marking task %s as failed on master, "
                        "retrying", task["id"])
                    post_update(cburl, cbdata, task, delay=http_retry_delay())

                elif response.code != OK:
                    logger.error(
                        "Could not mark task %s as failed, server response "
                        "code was %s", task["id"], response.code)

                else:
                    logger.info(
                        "Marked task %s as failed on master", task["id"])
Exemple #8
0
    def _download_jobtype(cls, name, version):
        """
        Downloads the job type specified in ``assignment``.  This
        method will pass the response it receives to :meth:`_cache_jobtype`
        however failures will be retried.
        """
        logger.debug("Downloading job type %r version %s", name, version)
        url = str(cls.JOBTYPE_VERSION_URL % {"master_api": config["master_api"], "name": name, "version": version})

        result = Deferred()
        download = lambda *_: get(
            url, callback=result.callback, errback=lambda *_: reactor.callLater(http_retry_delay(), download)
        )
        download()
        return result
Exemple #9
0
 def error_callback(url, log_identifier, failure_reason):
     if failure_reason.type in (ResponseNeverReceived, RequestTransmissionFailed):
         logger.debug(
             "Error while uploading logfile %s to master: " "%s, retrying immediately",
             log_identifier,
             failure_reason.type.__name__,
         )
         upload(url, log_identifier)
     else:
         delay = http_retry_delay()
         logger.error(
             "Error while uploading logfile %s to master: " "%r, retrying in %s seconds.",
             log_identifier,
             failure_reason,
             delay,
         )
         upload(url, log_identifier, delay=delay)
Exemple #10
0
    def _jobtype_download_complete(cls, response, cache_key):
        # Server is offline or experiencing issues right
        # now so we should retry the request.
        if response.code >= INTERNAL_SERVER_ERROR:
            logger.debug("Could not download jobtype because of internal server error.")
            return reactor.callLater(http_retry_delay(), response.request.retry)

        downloaded_data = response.json()

        if not config["jobtype_enable_cache"]:
            logger.debug("Jobtype cache is disabled, loading the jobtype.")
            return cls._load_jobtype(downloaded_data, None)

        else:
            # When the download is complete, cache the results
            logger.debug("Caching the jobtype locally.")
            caching = cls._cache_jobtype(cache_key, downloaded_data)
            caching.addCallback(lambda result: cls._load_jobtype(*result))
            return caching
Exemple #11
0
        def result_callback(task, log_path, deferred, response):
            if 500 <= response.code < 600:
                delay = http_retry_delay()
                logger.error(
                    "Server side error while registering log file %s for "
                    "task %s (frame %s) in job %s (id %s), status code: %s. "
                    "Retrying in %s seconds",
                    log_path,
                    task["id"],
                    task["frame"],
                    self.assignment["job"]["title"],
                    self.assignment["job"]["id"],
                    response.code,
                    delay,
                )
                post_logfile(task, log_path, post_deferred=deferred, delay=delay)

            # The server will return CONFLICT if we try to register a logfile
            # twice.
            elif response.code not in [OK, CONFLICT, CREATED]:
                # Nothing else we could do about that, this is
                # a problem on our end.
                logger.error(
                    "Could not register logfile %s for task %s (frame %s) in "
                    "job %s (id %s), status code: %s. This is a client side "
                    "error, giving up.",
                    log_path,
                    task["id"],
                    task["frame"],
                    self.assignment["job"]["title"],
                    self.assignment["job"]["id"],
                    response.code,
                )
                deferred.errback(None)

            else:
                logger.info("Registered logfile %s for task %s on master", log_path, task["id"])
                deferred.callback(None)
Exemple #12
0
    def post_agent_to_master(self):
        """
        Runs the POST request to contact the master.  Running this method
        multiple times should be considered safe but is generally something
        that should be avoided.
        """
        url = self.agents_endpoint()
        data = self.system_data()

        try:
            response = yield post_direct(url, data=data)
        except Exception as failure:
            delay = http_retry_delay()
            if isinstance(failure, ConnectionRefusedError):
                svclog.error(
                    "Failed to POST agent to master, the connection was "
                    "refused. Retrying in %s seconds", delay)
            else:  # pragma: no cover
                svclog.error(
                    "Unhandled error when trying to POST the agent to the "
                    "master. The error was %s.", failure)

            if not self.shutting_down:
                svclog.info(
                    "Retrying failed POST to master in %s seconds.", delay)
                yield deferLater(reactor, delay, self.post_agent_to_master)
            else:
                svclog.warning("Not retrying POST to master, shutting down.")

        else:
            # Master might be down or have some other internal problems
            # that might eventually be fixed.  Retry the request.
            if response.code >= INTERNAL_SERVER_ERROR:
                if not self.shutting_down:
                    delay = http_retry_delay()
                    svclog.warning(
                        "Failed to post to master due to a server side error "
                        "error %s, retrying in %s seconds",
                        response.code, delay)
                    yield deferLater(reactor, delay, self.post_agent_to_master)
                else:
                    svclog.warning(
                        "Failed to post to master due to a server side error "
                        "error %s. Not retrying, because the agent is "
                        "shutting down", response.code)

            # Master is up but is rejecting our request because there's
            # something wrong with it.  Do not retry the request.
            elif response.code >= BAD_REQUEST:
                text = yield response.text()
                svclog.error(
                    "%s accepted our POST request but responded with code %s "
                    "which is a client side error.  The message the server "
                    "responded with was %r.  Sorry, but we cannot retry this "
                    "request as it's an issue with the agent's request.",
                    url, response.code, text)

            else:
                data = yield treq.json_content(response)
                config["agent_id"] = data["id"]
                config.master_contacted()

                if response.code == OK:
                    svclog.info(
                        "POST to %s was successful. Agent %s was updated.",
                        url, config["agent_id"])

                elif response.code == CREATED:
                    svclog.info(
                        "POST to %s was successful.  A new agent "
                        "with an id of %s was created.",
                        url, config["agent_id"])

                returnValue(data)
Exemple #13
0
    def post_shutdown_to_master(self):
        """
        This method is called before the reactor shuts down and lets the
        master know that the agent's state is now ``offline``
        """
        # We're under the assumption that something's wrong with
        # our code if we try to call this method before self.shutting_down
        # is set.
        assert self.shutting_down
        yield self.post_shutdown_lock.acquire()

        svclog.info("Informing master of shutdown")

        # Because post_shutdown_to_master is blocking and needs to
        # stop the reactor from finishing we perform the retry in-line
        data = None
        tries = 0
        num_retry_errors = 0
        response = None
        timed_out = False
        while True:
            tries += 1
            try:
                response = yield post_direct(
                    self.agent_api(),
                    data={
                        "state": AgentState.OFFLINE,
                        "free_ram": memory.free_ram(),
                        "current_assignments": config["current_assignments"]})

            except (ResponseNeverReceived, RequestTransmissionFailed) as error:
                num_retry_errors += 1
                if num_retry_errors > config["broken_connection_max_retry"]:
                    svclog.error(
                        "Failed to post shutdown to the master, "
                        "caught try-again errors %s times in a row.",
                        num_retry_errors)
                    break
                elif self.shutdown_timeout < datetime.utcnow():
                    svclog.error("While posting shutdown to master, caught "
                                 "%s. Shutdown timeout has been reached, not "
                                 "retrying.",
                                 error.__class__.__name__)
                    break
                else:
                    svclog.debug("While posting shutdown to master, caught "
                                 "%s. Retrying immediately.",
                                 error.__class__.__name__)
            # When we get a hard failure it could be an issue with the
            # server, although it's unlikely, so we retry.  Only retry
            # for a set period of time though since the shutdown as a timeout
            except Exception as failure:
                if self.shutdown_timeout > datetime.utcnow():
                    delay = http_retry_delay()
                    svclog.warning(
                        "State update failed due to unhandled error: %s.  "
                        "Retrying in %s seconds",
                        failure, delay)

                    # Wait for 'pause' to fire, introducing a delay
                    pause = Deferred()
                    reactor.callLater(delay, pause.callback, None)
                    yield pause

                else:
                    timed_out = True
                    svclog.warning(
                        "State update failed due to unhandled error: %s.  "
                        "Shutdown timeout reached, not retrying.",
                        failure)
                    break

            else:
                data = yield treq.json_content(response)
                if response.code == NOT_FOUND:
                    svclog.warning(
                        "Agent %r no longer exists, cannot update state.",
                        config["agent_id"])
                    break

                elif response.code == OK:
                    svclog.info(
                        "Agent %r has POSTed shutdown state change "
                        "successfully.",
                        config["agent_id"])
                    break

                elif response.code >= INTERNAL_SERVER_ERROR:
                    if self.shutdown_timeout > datetime.utcnow():
                        delay = http_retry_delay()
                        svclog.warning(
                            "State update failed due to server error: %s.  "
                            "Retrying in %s seconds.",
                            data, delay)

                        # Wait for 'pause' to fire, introducing a delay
                        pause = Deferred()
                        reactor.callLater(delay, pause.callback, None)
                        yield pause
                    else:
                        timed_out = True
                        svclog.warning(
                            "State update failed due to server error: %s.  "
                            "Shutdown timeout reached, not retrying.",
                            data)
                        break

        yield self.post_shutdown_lock.release()
        extra_data = {
            "response": response,
            "timed_out": timed_out,
            "tries": tries,
            "retry_errors": num_retry_errors
        }

        if isinstance(data, dict):
            data.update(extra_data)
        else:
            data = extra_data

        returnValue(data)
Exemple #14
0
    def reannounce(self, force=False):
        """
        Method which is used to periodically contact the master.  This
        method is generally called as part of a scheduled task.
        """
        # Attempt to acquire the reannounce lock but fail after 70%
        # of the total time between reannouncements elapses.  This should
        # help prevent an accumulation of requests in the event the master
        # is having issues.
        try:
            yield self.reannounce_lock.acquire(
                config["agent_master_reannounce"] * .70
            )
        except utility.LockTimeoutError:
            svclog.debug("Timed out while waiting to acquire reannounce_lock")
            returnValue(None)

        if not self.should_reannounce() and not force:
            yield self.reannounce_lock.release()
            returnValue(None)

        svclog.debug("Announcing %s to master", config["agent_hostname"])
        data = None
        num_retry_errors = 0
        while True:  # for retries
            try:
                response = yield post_direct(
                    self.agent_api(),
                    data={
                        "state": config["state"],
                        "current_assignments": config.get(
                            "current_assignments", {} # may not be set yet
                        ),
                        "free_ram": memory.free_ram(),
                        "disks": disks.disks(as_dict=True)
                    }
                )

            except (ResponseNeverReceived, RequestTransmissionFailed) as error:
                num_retry_errors += 1
                if num_retry_errors > config["broken_connection_max_retry"]:
                    svclog.error(
                        "Failed to announce self to the master, "
                        "caught try-again type errors %s times in a row.",
                        num_retry_errors)
                    break
                else:
                    svclog.debug("While announcing self to master, caught "
                                 "%s. Retrying immediately.",
                                 error.__class__.__name__)
            except Exception as error:
                if force:
                    delay = http_retry_delay()
                    svclog.error(
                        "Failed to announce self to the master: %s.  Will "
                        "retry in %s seconds.", error, delay)
                    deferred = Deferred()
                    reactor.callLater(delay, deferred.callback, None)
                    yield deferred
                else:
                    # Don't retry because reannounce is called periodically
                    svclog.error(
                        "Failed to announce self to the master: %s.  This "
                        "request will not be retried.", error)
                    break

            else:
                data = yield treq.json_content(response)
                if response.code == OK:
                    config.master_contacted(announcement=True)
                    svclog.info("Announced self to the master server.")
                    break

                elif response.code >= INTERNAL_SERVER_ERROR:
                    if not self.shutting_down:
                        delay = http_retry_delay()
                        svclog.warning(
                            "Could not announce self to the master server, "
                            "internal server error: %s.  Retrying in %s "
                            "seconds.", data, delay)

                        deferred = Deferred()
                        reactor.callLater(delay, deferred.callback, None)
                        yield deferred
                    else:
                        svclog.warning(
                            "Could not announce to master. Not retrying "
                            "because of pending shutdown.")
                        break

                elif response.code == NOT_FOUND:
                    svclog.warning("The master says it does not know about our "
                                   "agent id. Posting as a new agent.")
                    yield self.post_agent_to_master()
                    break

                # If this is a client problem retrying the request
                # is unlikely to fix the issue so we stop here
                elif response.code >= BAD_REQUEST:
                    svclog.error(
                        "Failed to announce self to the master, bad "
                        "request: %s.  This request will not be retried.",
                        data)
                    break

                else:
                    svclog.error(
                        "Unhandled error when posting self to the "
                        "master: %s (code: %s).  This request will not be "
                        "retried.", data, response.code)
                    break

        yield self.reannounce_lock.release()
        returnValue(data)
 def test_invalid_type_factor(self):
     with self.assertRaises(AssertionError):
         http_retry_delay(factor="")
 def test_invalid_type_offset(self):
     with self.assertRaises(AssertionError):
         http_retry_delay(offset="")
 def test_custom_offset_and_factor(self):
     self.assertEqual(
         http_retry_delay(offset=1, factor=1, rand=lambda: 1), 2)
Exemple #18
0
def get_software_version_data(software, version):
    """
    Asynchronously fetches the known data about the given software version from
    the master.

    :param str software:
        The name of the software to get data for

    :param str version:
        The name of the version to get data for

    :return:
        Returns information about the given software version from
        the master
    """
    url = "{master_api}/software/{software}/versions/{version}".\
        format(master_api=config.get("master_api"),
               software=software, version=version)

    while True:
        try:
            response = yield get_direct(url)

        except Exception as error:
            delay = http_retry_delay()
            logger.error(
                "Failed to get data about software %s, version %s: %r.  Will "
                "retry in %s seconds.", software, version, error, delay)
            deferred = Deferred()
            reactor.callLater(delay, deferred.callback, None)
            yield deferred

        else:
            if response.code == OK:
                data = yield treq.json_content(response)
                returnValue(data)

            elif response.code >= INTERNAL_SERVER_ERROR:
                delay = http_retry_delay()
                logger.warning(
                    "Could not get data for software %s, version %s, server "
                    "responded with INTERNAL_SERVER_ERROR.  Retrying in %s "
                    "seconds.", software, version, delay)

                deferred = Deferred()
                reactor.callLater(delay, deferred.callback, None)
                yield deferred

            elif response.code == NOT_FOUND:
                logger.error("Got 404 NOT FOUND from server on getting data "
                             "for software %s, version %s", software, version)
                raise VersionNotFound("This software version was not found or "
                                      "has no discovery code.")

            else:
                logger.error(
                    "Failed to get data for software %s, version %s: "
                    "Unexpected status from server %s", software, version,
                    response.code)
                raise Exception("Unknown return code from master: %s" %
                                response.code)
Exemple #19
0
 def error_callback(cburl, cbdata, task, failure_reason):
     logger.error(
         "Error while marking task %s as failed, retrying",
         task["id"], failure_reason)
     post_update(cburl, cbdata, task, delay=http_retry_delay())