Esempio n. 1
0
    def prepare_config(self):
        for key in self._pop_config_keys:
            config.pop(key, None)

        config.update(
            {
                # Default to a uniform retry delay of one second
                "agent_http_retry_delay_offset": 0.1,
                "agent_http_retry_delay_factor": 0,
                "shutting_down": False,
                "jobtypes": {},
                "current_assignments": {},
                "agent_id": uuid.uuid4(),
                "agent_http_persistent_connections": False,
                "agent_shutdown_timeout": 3,
                "master": PYFARM_AGENT_MASTER,
                "agent_hostname": os.urandom(self.RAND_LENGTH).encode("hex"),
                "agent_ram": memory.total_ram(),
                "agent_cpus": cpu.total_cpus(),
                "agent_api_port": randint(10000, 50000),
                "free_ram": memory.free_ram(),
                "agent_time_offset": randint(-50, 50),
                "state": choice(AgentState),
                "start": time.time(),
                "agent_pretty_json": False,
                "agent_html_template_reload": True,
                "agent_master_reannounce": randint(5, 15),
            }
        )
Esempio n. 2
0
    def get(self, **_):
        # Get counts for child processes and grandchild processes
        process = psutil.Process()
        direct_child_processes = len(process.children(recursive=False))
        all_child_processes = len(process.children(recursive=True))
        grandchild_processes = all_child_processes - direct_child_processes

        # Determine the last time we talked to the master (if ever)
        contacted = config.master_contacted(update=False)
        if isinstance(contacted, datetime):  # pragma: no cover
            contacted = datetime.utcnow() - contacted

        # Determine the last time we announced ourselves to the
        # master (if ever)
        last_announce = config.get("last_announce", None)
        if isinstance(last_announce, datetime):  # pragma: no cover
            last_announce = datetime.utcnow() - last_announce

        data = {"state": config["state"],
                "agent_hostname": config["agent_hostname"],
                "free_ram": memory.free_ram(),
                "agent_process_ram": memory.process_memory(),
                "consumed_ram": memory.total_consumption(),
                "child_processes": direct_child_processes,
                "grandchild_processes": grandchild_processes,
                "pids": config["pids"],
                "agent_id": config["agent_id"],
                "last_master_contact": contacted,
                "last_announce": last_announce,
                "agent_lock_file": config["agent_lock_file"],
                "uptime": total_seconds(
                    timedelta(seconds=time.time() - config["start"])),
                "jobs": list(config["jobtypes"].keys())}

        if config["farm_name"]:
            data["farm_name"] = config["farm_name"]

        return dumps(data)
Esempio n. 3
0
    def callback_agent_id_set(
            self, change_type, key, new_value, old_value, shutdown_events=True):
        """
        When `agent_id` is created we need to:

            * Register a shutdown event so that when the agent is told to
              shutdown it will notify the master of a state change.
            * Star the scheduled task manager
        """
        if key == "agent_id" and change_type == config.CREATED \
                and not self.register_shutdown_events:
            if shutdown_events:
                self.register_shutdown_events = True

            # set the initial free_ram
            config["free_ram"] = memory.free_ram()

            config.master_contacted()
            svclog.debug(
                "`%s` was %s, adding system event trigger for shutdown",
                key, change_type)

            self.repeating_call(
                config["agent_master_reannounce"], self.reannounce)
Esempio n. 4
0
    def post_shutdown_to_master(self):
        """
        This method is called before the reactor shuts down and lets the
        master know that the agent's state is now ``offline``
        """
        # We're under the assumption that something's wrong with
        # our code if we try to call this method before self.shutting_down
        # is set.
        assert self.shutting_down
        yield self.post_shutdown_lock.acquire()

        svclog.info("Informing master of shutdown")

        # Because post_shutdown_to_master is blocking and needs to
        # stop the reactor from finishing we perform the retry in-line
        data = None
        tries = 0
        num_retry_errors = 0
        response = None
        timed_out = False
        while True:
            tries += 1
            try:
                response = yield post_direct(
                    self.agent_api(),
                    data={
                        "state": AgentState.OFFLINE,
                        "free_ram": memory.free_ram(),
                        "current_assignments": config["current_assignments"]})

            except (ResponseNeverReceived, RequestTransmissionFailed) as error:
                num_retry_errors += 1
                if num_retry_errors > config["broken_connection_max_retry"]:
                    svclog.error(
                        "Failed to post shutdown to the master, "
                        "caught try-again errors %s times in a row.",
                        num_retry_errors)
                    break
                elif self.shutdown_timeout < datetime.utcnow():
                    svclog.error("While posting shutdown to master, caught "
                                 "%s. Shutdown timeout has been reached, not "
                                 "retrying.",
                                 error.__class__.__name__)
                    break
                else:
                    svclog.debug("While posting shutdown to master, caught "
                                 "%s. Retrying immediately.",
                                 error.__class__.__name__)
            # When we get a hard failure it could be an issue with the
            # server, although it's unlikely, so we retry.  Only retry
            # for a set period of time though since the shutdown as a timeout
            except Exception as failure:
                if self.shutdown_timeout > datetime.utcnow():
                    delay = http_retry_delay()
                    svclog.warning(
                        "State update failed due to unhandled error: %s.  "
                        "Retrying in %s seconds",
                        failure, delay)

                    # Wait for 'pause' to fire, introducing a delay
                    pause = Deferred()
                    reactor.callLater(delay, pause.callback, None)
                    yield pause

                else:
                    timed_out = True
                    svclog.warning(
                        "State update failed due to unhandled error: %s.  "
                        "Shutdown timeout reached, not retrying.",
                        failure)
                    break

            else:
                data = yield treq.json_content(response)
                if response.code == NOT_FOUND:
                    svclog.warning(
                        "Agent %r no longer exists, cannot update state.",
                        config["agent_id"])
                    break

                elif response.code == OK:
                    svclog.info(
                        "Agent %r has POSTed shutdown state change "
                        "successfully.",
                        config["agent_id"])
                    break

                elif response.code >= INTERNAL_SERVER_ERROR:
                    if self.shutdown_timeout > datetime.utcnow():
                        delay = http_retry_delay()
                        svclog.warning(
                            "State update failed due to server error: %s.  "
                            "Retrying in %s seconds.",
                            data, delay)

                        # Wait for 'pause' to fire, introducing a delay
                        pause = Deferred()
                        reactor.callLater(delay, pause.callback, None)
                        yield pause
                    else:
                        timed_out = True
                        svclog.warning(
                            "State update failed due to server error: %s.  "
                            "Shutdown timeout reached, not retrying.",
                            data)
                        break

        yield self.post_shutdown_lock.release()
        extra_data = {
            "response": response,
            "timed_out": timed_out,
            "tries": tries,
            "retry_errors": num_retry_errors
        }

        if isinstance(data, dict):
            data.update(extra_data)
        else:
            data = extra_data

        returnValue(data)
Esempio n. 5
0
    def system_data(self, requery_timeoffset=False):
        """
        Returns a dictionary of data containing information about the
        agent.  This is the information that is also passed along to
        the master.
        """
        # query the time offset and then cache it since
        # this is typically a blocking operation
        if config["agent_time_offset"] == "auto":
            config["agent_time_offset"] = None

        if requery_timeoffset or config["agent_time_offset"] is None:
            ntplog.info(
                "Querying ntp server %r for current time",
                config["agent_ntp_server"])

            ntp_client = NTPClient()
            try:
                pool_time = ntp_client.request(
                    config["agent_ntp_server"],
                    version=config["agent_ntp_server_version"])

            except Exception as e:
                ntplog.warning("Failed to determine network time: %s", e)

            else:
                config["agent_time_offset"] = \
                    int(pool_time.tx_time - time.time())

                # format the offset for logging purposes
                utcoffset = datetime.utcfromtimestamp(pool_time.tx_time)
                iso_timestamp = utcoffset.isoformat()
                ntplog.debug(
                    "network time: %s (local offset: %r)",
                    iso_timestamp, config["agent_time_offset"])

                if config["agent_time_offset"] != 0:
                    ntplog.warning(
                        "Agent is %r second(s) off from ntp server at %r",
                        config["agent_time_offset"],
                        config["agent_ntp_server"])

        data = {
            "id": config["agent_id"],
            "hostname": config["agent_hostname"],
            "version": config.version,
            "os_class": system.operating_system(),
            "os_fullname": platform(),
            "ram": int(config["agent_ram"]),
            "cpus": config["agent_cpus"],
            "cpu_name": cpu.cpu_name(),
            "port": config["agent_api_port"],
            "free_ram": memory.free_ram(),
            "time_offset": config["agent_time_offset"] or 0,
            "state": config["state"],
            "mac_addresses": list(network.mac_addresses()),
            "current_assignments": config.get(
                "current_assignments", {}), # may not be set yet
            "disks": disks.disks(as_dict=True)
        }

        try:
            gpu_names = graphics.graphics_cards()
            data["gpus"] = gpu_names
        except graphics.GPULookupError:
            pass

        if "remote_ip" in config:
            data.update(remote_ip=config["remote_ip"])

        if config["farm_name"]:
            data["farm_name"] = config["farm_name"]

        return data
Esempio n. 6
0
    def reannounce(self, force=False):
        """
        Method which is used to periodically contact the master.  This
        method is generally called as part of a scheduled task.
        """
        # Attempt to acquire the reannounce lock but fail after 70%
        # of the total time between reannouncements elapses.  This should
        # help prevent an accumulation of requests in the event the master
        # is having issues.
        try:
            yield self.reannounce_lock.acquire(
                config["agent_master_reannounce"] * .70
            )
        except utility.LockTimeoutError:
            svclog.debug("Timed out while waiting to acquire reannounce_lock")
            returnValue(None)

        if not self.should_reannounce() and not force:
            yield self.reannounce_lock.release()
            returnValue(None)

        svclog.debug("Announcing %s to master", config["agent_hostname"])
        data = None
        num_retry_errors = 0
        while True:  # for retries
            try:
                response = yield post_direct(
                    self.agent_api(),
                    data={
                        "state": config["state"],
                        "current_assignments": config.get(
                            "current_assignments", {} # may not be set yet
                        ),
                        "free_ram": memory.free_ram(),
                        "disks": disks.disks(as_dict=True)
                    }
                )

            except (ResponseNeverReceived, RequestTransmissionFailed) as error:
                num_retry_errors += 1
                if num_retry_errors > config["broken_connection_max_retry"]:
                    svclog.error(
                        "Failed to announce self to the master, "
                        "caught try-again type errors %s times in a row.",
                        num_retry_errors)
                    break
                else:
                    svclog.debug("While announcing self to master, caught "
                                 "%s. Retrying immediately.",
                                 error.__class__.__name__)
            except Exception as error:
                if force:
                    delay = http_retry_delay()
                    svclog.error(
                        "Failed to announce self to the master: %s.  Will "
                        "retry in %s seconds.", error, delay)
                    deferred = Deferred()
                    reactor.callLater(delay, deferred.callback, None)
                    yield deferred
                else:
                    # Don't retry because reannounce is called periodically
                    svclog.error(
                        "Failed to announce self to the master: %s.  This "
                        "request will not be retried.", error)
                    break

            else:
                data = yield treq.json_content(response)
                if response.code == OK:
                    config.master_contacted(announcement=True)
                    svclog.info("Announced self to the master server.")
                    break

                elif response.code >= INTERNAL_SERVER_ERROR:
                    if not self.shutting_down:
                        delay = http_retry_delay()
                        svclog.warning(
                            "Could not announce self to the master server, "
                            "internal server error: %s.  Retrying in %s "
                            "seconds.", data, delay)

                        deferred = Deferred()
                        reactor.callLater(delay, deferred.callback, None)
                        yield deferred
                    else:
                        svclog.warning(
                            "Could not announce to master. Not retrying "
                            "because of pending shutdown.")
                        break

                elif response.code == NOT_FOUND:
                    svclog.warning("The master says it does not know about our "
                                   "agent id. Posting as a new agent.")
                    yield self.post_agent_to_master()
                    break

                # If this is a client problem retrying the request
                # is unlikely to fix the issue so we stop here
                elif response.code >= BAD_REQUEST:
                    svclog.error(
                        "Failed to announce self to the master, bad "
                        "request: %s.  This request will not be retried.",
                        data)
                    break

                else:
                    svclog.error(
                        "Unhandled error when posting self to the "
                        "master: %s (code: %s).  This request will not be "
                        "retried.", data, response.code)
                    break

        yield self.reannounce_lock.release()
        returnValue(data)
Esempio n. 7
0
 def test_ramfree(self):
     v1 = convert.bytetomb(psutil.virtual_memory().available)
     v2 = memory.free_ram()
     self.assertApproximates(v1, v2, 5)
Esempio n. 8
0
 def test_ramused(self):
     v1 = memory.total_ram() - memory.free_ram()
     v2 = memory.used_ram()
     self.assertApproximates(v1, v2, 5)
Esempio n. 9
0
    def post(self, **kwargs):
        if request_from_master(kwargs["request"]):
            config.master_contacted()

        request = kwargs["request"]
        request_data = kwargs["data"]

        # First, get the resources we have *right now*.  In some cases
        # this means using the functions in pyfarm.core.sysinfo because
        # entries in `config` could be slightly out of sync with the system.
        memory_free = free_ram()
        cpus = config["agent_cpus"]
        requires_ram = request_data["job"].get("ram")
        requires_cpus = request_data["job"].get("cpus")

        if ("agent_id" in request_data and
            request_data["agent_id"] != config["agent_id"]):
            logger.error("Wrong agent_id in assignment: %s. Our id is %s",
                         request_data["agent_id"], config["agent_id"])
            return (
                dumps({"error": "You have the wrong agent. "
                                "I am %s." % config["agent_id"],
                       "agent_id": config["agent_id"]}),
                BAD_REQUEST
            )

        elif self.agent.reannounce_lock.locked:
            logger.warning("Temporarily rejecting assignment because we "
                           "are in the middle of a reannounce.")
            return (
                dumps({"error": "Agent cannot accept assignments because of a "
                                "reannounce in progress. Try again shortly."}),
                SERVICE_UNAVAILABLE
            )

        elif self.agent.shutting_down:
            logger.error("Rejecting assignment because the agent is in the "
                         "process of shutting down.")
            return (
                dumps({"error": "Agent cannot accept assignments because it is "
                                "shutting down."}),
                SERVICE_UNAVAILABLE
            )

        elif "restart_requested" in config \
                and config["restart_requested"] is True:
            logger.error("Rejecting assignment because of scheduled restart.")
            return (
                dumps({"error": "Agent cannot accept assignments because of a "
                                "pending restart."}),
                SERVICE_UNAVAILABLE
            )

        elif "agent_id" not in config:
            logger.error(
                "Agent has not yet connected to the master or `agent_id` "
                "has not been set yet.")
            return (
                dumps({"error": "agent_id has not been set in the config"}),
                SERVICE_UNAVAILABLE
            )

        # Do we have enough ram?
        elif requires_ram is not None and requires_ram > memory_free:
            logger.error(
                "Task %s requires %sMB of ram, this agent has %sMB free.  "
                "Rejecting Task %s.",
                request_data["job"]["id"], requires_ram, memory_free,
                request_data["job"]["id"])
            config["free_ram"] = memory_free
            return (
                dumps({"error": "Not enough ram",
                       "agent_ram": memory_free,
                       "requires_ram": requires_ram}),
                BAD_REQUEST
            )

        # Do we have enough cpus (count wise)?
        elif requires_cpus is not None and requires_cpus > cpus:
            logger.error(
                "Task %s requires %s CPUs, this agent has %s CPUs.  "
                "Rejecting Task %s.",
                request_data["job"]["id"], requires_cpus, cpus,
                request_data["job"]["id"])
            return (
                dumps({"error": "Not enough cpus",
                       "agent_cpus": cpus,
                       "requires_cpus": requires_cpus}),
                BAD_REQUEST
            )

        new_task_ids = set(task["id"] for task in request_data["tasks"])

        for assignment in config["current_assignments"].itervalues():
            existing_task_ids = set(x["id"] for x in assignment["tasks"])

            # If the assignment is identical to one we already have
            if existing_task_ids == new_task_ids:
                logger.debug(
                    "Ignoring repeated assignment of the same batch")
                return dumps({"id": assignment["id"]}), ACCEPTED

            # If there is only a partial overlap
            elif existing_task_ids & new_task_ids:
                logger.error("Rejecting assignment with partial overlap with "
                             "existing assignment.")
                unknown_task_ids = new_task_ids - existing_task_ids
                return (
                    dumps({"error": "Partial overlap of tasks",
                           "rejected_task_ids": list(unknown_task_ids)}),
                    CONFLICT
                )

        if not config["agent_allow_sharing"]:
            for jobtype in config["jobtypes"].itervalues():
                num_finished_tasks = (len(jobtype.finished_tasks) +
                                      len(jobtype.failed_tasks))
                if len(jobtype.assignment["tasks"]) > num_finished_tasks:
                    logger.error("Rejecting an assignment that would require "
                                 "agent sharing")
                    return (
                        dumps({
                            "error": "Agent does not allow multiple "
                                     "assignments",
                            "rejected_task_ids": list(new_task_ids)}),
                        CONFLICT
                    )

        assignment_uuid = uuid4()
        request_data.update(id=assignment_uuid)
        config["current_assignments"][assignment_uuid] = request_data
        logger.debug("Accepted assignment %s: %r",
                     assignment_uuid, request_data)
        logger.info("Accept assignment from job %s with %s tasks",
                    request_data["job"]["title"], len(request_data["tasks"]))

        def assignment_failed(result, assign_id):
            logger.error(
                "Assignment %s failed, result: %r, removing.", assign_id, result)
            logger.error(result.getTraceback())
            if (len(config["current_assignments"]) <= 1 and
                not self.agent.shutting_down):
                config["state"] = AgentState.ONLINE
                self.agent.reannounce(force=True)
            # Do not mark the assignment as failed if the reason for failing
            # was that we ran out of disk space
            failed = not isinstance(result.value, InsufficientSpaceError)
            assignment = config["current_assignments"].pop(assign_id)
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    instance = config["jobtypes"].pop(jobtype_id, None)
                    instance.stop(
                        assignment_failed=failed,
                        avoid_reassignment=not failed,
                        error="Error in jobtype: %r. "
                              "Traceback: %s" % (result,
                                                 traceback.format_exc()))

        def assignment_started(_, assign_id):
            logger.debug("Assignment %s has started", assign_id)
            config["state"] = AgentState.RUNNING
            self.agent.reannounce(force=True)

        def remove_assignment(_, assign_id):
            assignment = config["current_assignments"].pop(assign_id)
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    config["jobtypes"].pop(jobtype_id, None)

        def assignment_stopped(_, assign_id):
            logger.debug("Assignment %s has stopped", assign_id)
            if (len(config["current_assignments"]) <= 1 and
                not self.agent.shutting_down):
                config["state"] = AgentState.ONLINE
                self.agent.reannounce(force=True)
            assignment = config["current_assignments"][assign_id]
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    jobtype = config["jobtypes"].pop(jobtype_id, None)
                    updates_deferred = DeferredList(
                        jobtype.task_update_deferreds)
                    updates_deferred.addBoth(remove_assignment, assign_id)
            else:
                config["current_assignments"].pop(assign_id)

        def restart_if_necessary(_):  # pragma: no cover
            if "restart_requested" in config and config["restart_requested"]:
                stopping = config["agent"].stop()
                stopping.addCallbacks(lambda _: reactor.stop(),
                                      lambda _: reactor.stop())

        def load_jobtype_failed(result, assign_id):
            logger.error(
                "Loading jobtype for assignment %s failed, removing.", assign_id)
            traceback = result.getTraceback()
            logger.debug("Got traceback")
            logger.error(traceback)
            assignment = config["current_assignments"].pop(assign_id)

            # Mark all tasks as failed on master and set an error message
            logger.debug("Marking tasks in assignment as failed")
            def post_update(post_url, post_data, task, delay=0):
                post_func = partial(post, post_url, data=post_data,
                    callback=lambda x: result_callback(
                        post_url, post_data, task, x),
                    errback=lambda x: error_callback(
                        post_url, post_data, task, x))
                reactor.callLater(delay, post_func)

            def result_callback(cburl, cbdata, task, response):
                if 500 <= response.code < 600:
                    logger.error(
                        "Error while marking task %s as failed on master, "
                        "retrying", task["id"])
                    post_update(cburl, cbdata, task, delay=http_retry_delay())

                elif response.code != OK:
                    logger.error(
                        "Could not mark task %s as failed, server response "
                        "code was %s", task["id"], response.code)

                else:
                    logger.info(
                        "Marked task %s as failed on master", task["id"])

            def error_callback(cburl, cbdata, task, failure_reason):
                logger.error(
                    "Error while marking task %s as failed, retrying",
                    task["id"], failure_reason)
                post_update(cburl, cbdata, task, delay=http_retry_delay())

            for task in assignment["tasks"]:
                url = "%s/jobs/%s/tasks/%s" % (
                    config["master_api"], assignment["job"]["id"], task["id"])
                data = {
                    "state": WorkState.FAILED,
                    "last_error": traceback}
                post_update(url, data, task)

            # If the loading was partially successful for some reason, there
            # might already be an entry for this jobtype in the config.
            # Remove it if it exists.
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    config["jobtypes"].pop(jobtype_id, None)

        def loaded_jobtype(jobtype_class, assign_id):
            # TODO: report error to master
            if hasattr(jobtype_class, "getTraceback"):
                logger.error(jobtype_class.getTraceback())
                return

            # TODO: add call to prepare_for_job
            # TODO: add call to spawn_persistent_process

            # Instance the job type and pass in the assignment data.
            instance = jobtype_class(request_data)

            if not isinstance(instance, JobType):
                raise TypeError(
                    "Expected a subclass of "
                    "pyfarm.jobtypes.core.jobtype.JobType")

            # TODO: add callback to cleanup_after_job
            # TODO: add callback to stop persistent process
            try:
                started_deferred, stopped_deferred = instance._start()
                started_deferred.addCallback(assignment_started, assign_id)
                started_deferred.addErrback(assignment_failed, assign_id)
                stopped_deferred.addCallback(assignment_stopped, assign_id)
                stopped_deferred.addErrback(assignment_failed, assign_id)
                stopped_deferred.addBoth(restart_if_necessary)
                stopped_deferred.addBoth(
                    lambda *args: instance._remove_tempdirs())
                stopped_deferred.addBoth(
                    lambda *args: instance._close_logs())
                stopped_deferred.addBoth(
                    lambda *args: instance._upload_logfile())
            except Exception as e:
                logger.error("Error on starting jobtype, stopping it now.  "
                             "Error was: %r. Traceback: %s", e,
                             traceback.format_exc())
                instance.stop(assignment_failed=True,
                              error="Error while loading jobtype: %r. "
                                    "Traceback: %s" %
                                    (e, traceback.format_exc()))
                assignment = config["current_assignments"].pop(assign_id)
                if "jobtype" in assignment:
                    jobtype_id = assignment["jobtype"].pop("id", None)
                    if jobtype_id:
                        config["jobtypes"].pop(jobtype_id, None)

        # Load the job type then pass the class along to the
        # callback.  No errback here because all the errors
        # are handled internally in this case.
        jobtype_loader = JobType.load(request_data)
        jobtype_loader.addCallback(loaded_jobtype, assignment_uuid)
        jobtype_loader.addErrback(load_jobtype_failed, assignment_uuid)

        return dumps({"id": assignment_uuid}), ACCEPTED