Ejemplo n.º 1
0
    def get(self, **kwargs):
        request = kwargs.get("request")

        if request is not None and request_from_master(request):
            config.master_contacted()

        return dumps(versions=[1]), OK
Ejemplo n.º 2
0
 def test_request_not_from_master(self):
     request = DummyRequest("")
     request.requestHeaders.setRawHeaders(
         "User-Agent", ["foobar"])
     self.assertFalse(request_from_master(request))
Ejemplo n.º 3
0
    def get(self, **kwargs):
        """
        Get the contents of the specified task log.  The log will be
        returned in CSV format with the following fields:

            * ISO8601 timestamp
            * Stream number (0 == stdout, 1 == stderr)
            * Line number
            * Parent process ID
            * Message the process produced

        The log file identifier is configurable and relies on the
        `jobtype_task_log_filename` configuration option.  See the
        configuration documentation for more information about the
        default value.

        .. http:get:: /api/v1/tasklogs/<identifier> HTTP/1.1

            **Request**

            .. sourcecode:: http

                GET /api/v1/tasklogs/<identifier> HTTP/1.1

            **Response**

            .. sourcecode:: http

                HTTP/1.1 200 OK
                Content-Type: text/csv

                2015-05-07T23:42:53.730975,0,15,42,Hello world

        :statuscode 200: The log file was found, it's content will be returned.
        """
        request = kwargs["request"]

        if request_from_master(request):
            config.master_contacted()

        if len(request.postpath) != 1:
            return (
                dumps({"error": "Did not specify a log identifier"}),
                BAD_REQUEST,
                {"Content-Type": ["application/json"]}
            )

        log_identifier = request.postpath[0]
        if "/" in log_identifier or "\\" in log_identifier:
            return (
                dumps({"error": "log_identifier must not contain "
                                "directory separators"}),
                BAD_REQUEST,
                {"Content-Type": ["application/json"]}
            )

        path = join(config["jobtype_task_logs"], log_identifier)

        try:
            logfile = self._open_file(path, "rb")
        except Exception as error:
            request.responseHeaders.setRawHeaders(
                "Content-Type", ["application/json"])

            if getattr(error, "errno", None) == ENOENT:
                return dumps({"error": "%s does not exist" % path}), NOT_FOUND

            logger.error("GET %s failed: %s", request.postpath, error)
            return dumps({"error": str(error)}), INTERNAL_SERVER_ERROR

        # TODO: deferToThread for open? (and possibly send)
        request.setResponseCode(OK)
        request.responseHeaders.setRawHeaders("Content-Type", ["text/csv"])
        deferred = FileSender().beginFileTransfer(logfile, request)
        deferred.addCallback(lambda *_: request.finish())
        deferred.addCallback(lambda *_: logfile.close())

        return NOT_DONE_YET
Ejemplo n.º 4
0
 def test_request_from_master(self):
     request = DummyRequest("")
     request.requestHeaders.setRawHeaders(
         "User-Agent", [config["master_user_agent"]])
     self.assertTrue(request_from_master(request))
Ejemplo n.º 5
0
    def post(self, **kwargs):
        if request_from_master(kwargs["request"]):
            config.master_contacted()

        request = kwargs["request"]
        request_data = kwargs["data"]

        # First, get the resources we have *right now*.  In some cases
        # this means using the functions in pyfarm.core.sysinfo because
        # entries in `config` could be slightly out of sync with the system.
        memory_free = free_ram()
        cpus = config["agent_cpus"]
        requires_ram = request_data["job"].get("ram")
        requires_cpus = request_data["job"].get("cpus")

        if ("agent_id" in request_data and
            request_data["agent_id"] != config["agent_id"]):
            logger.error("Wrong agent_id in assignment: %s. Our id is %s",
                         request_data["agent_id"], config["agent_id"])
            return (
                dumps({"error": "You have the wrong agent. "
                                "I am %s." % config["agent_id"],
                       "agent_id": config["agent_id"]}),
                BAD_REQUEST
            )

        elif self.agent.reannounce_lock.locked:
            logger.warning("Temporarily rejecting assignment because we "
                           "are in the middle of a reannounce.")
            return (
                dumps({"error": "Agent cannot accept assignments because of a "
                                "reannounce in progress. Try again shortly."}),
                SERVICE_UNAVAILABLE
            )

        elif self.agent.shutting_down:
            logger.error("Rejecting assignment because the agent is in the "
                         "process of shutting down.")
            return (
                dumps({"error": "Agent cannot accept assignments because it is "
                                "shutting down."}),
                SERVICE_UNAVAILABLE
            )

        elif "restart_requested" in config \
                and config["restart_requested"] is True:
            logger.error("Rejecting assignment because of scheduled restart.")
            return (
                dumps({"error": "Agent cannot accept assignments because of a "
                                "pending restart."}),
                SERVICE_UNAVAILABLE
            )

        elif "agent_id" not in config:
            logger.error(
                "Agent has not yet connected to the master or `agent_id` "
                "has not been set yet.")
            return (
                dumps({"error": "agent_id has not been set in the config"}),
                SERVICE_UNAVAILABLE
            )

        # Do we have enough ram?
        elif requires_ram is not None and requires_ram > memory_free:
            logger.error(
                "Task %s requires %sMB of ram, this agent has %sMB free.  "
                "Rejecting Task %s.",
                request_data["job"]["id"], requires_ram, memory_free,
                request_data["job"]["id"])
            config["free_ram"] = memory_free
            return (
                dumps({"error": "Not enough ram",
                       "agent_ram": memory_free,
                       "requires_ram": requires_ram}),
                BAD_REQUEST
            )

        # Do we have enough cpus (count wise)?
        elif requires_cpus is not None and requires_cpus > cpus:
            logger.error(
                "Task %s requires %s CPUs, this agent has %s CPUs.  "
                "Rejecting Task %s.",
                request_data["job"]["id"], requires_cpus, cpus,
                request_data["job"]["id"])
            return (
                dumps({"error": "Not enough cpus",
                       "agent_cpus": cpus,
                       "requires_cpus": requires_cpus}),
                BAD_REQUEST
            )

        new_task_ids = set(task["id"] for task in request_data["tasks"])

        for assignment in config["current_assignments"].itervalues():
            existing_task_ids = set(x["id"] for x in assignment["tasks"])

            # If the assignment is identical to one we already have
            if existing_task_ids == new_task_ids:
                logger.debug(
                    "Ignoring repeated assignment of the same batch")
                return dumps({"id": assignment["id"]}), ACCEPTED

            # If there is only a partial overlap
            elif existing_task_ids & new_task_ids:
                logger.error("Rejecting assignment with partial overlap with "
                             "existing assignment.")
                unknown_task_ids = new_task_ids - existing_task_ids
                return (
                    dumps({"error": "Partial overlap of tasks",
                           "rejected_task_ids": list(unknown_task_ids)}),
                    CONFLICT
                )

        if not config["agent_allow_sharing"]:
            for jobtype in config["jobtypes"].itervalues():
                num_finished_tasks = (len(jobtype.finished_tasks) +
                                      len(jobtype.failed_tasks))
                if len(jobtype.assignment["tasks"]) > num_finished_tasks:
                    logger.error("Rejecting an assignment that would require "
                                 "agent sharing")
                    return (
                        dumps({
                            "error": "Agent does not allow multiple "
                                     "assignments",
                            "rejected_task_ids": list(new_task_ids)}),
                        CONFLICT
                    )

        assignment_uuid = uuid4()
        request_data.update(id=assignment_uuid)
        config["current_assignments"][assignment_uuid] = request_data
        logger.debug("Accepted assignment %s: %r",
                     assignment_uuid, request_data)
        logger.info("Accept assignment from job %s with %s tasks",
                    request_data["job"]["title"], len(request_data["tasks"]))

        def assignment_failed(result, assign_id):
            logger.error(
                "Assignment %s failed, result: %r, removing.", assign_id, result)
            logger.error(result.getTraceback())
            if (len(config["current_assignments"]) <= 1 and
                not self.agent.shutting_down):
                config["state"] = AgentState.ONLINE
                self.agent.reannounce(force=True)
            # Do not mark the assignment as failed if the reason for failing
            # was that we ran out of disk space
            failed = not isinstance(result.value, InsufficientSpaceError)
            assignment = config["current_assignments"].pop(assign_id)
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    instance = config["jobtypes"].pop(jobtype_id, None)
                    instance.stop(
                        assignment_failed=failed,
                        avoid_reassignment=not failed,
                        error="Error in jobtype: %r. "
                              "Traceback: %s" % (result,
                                                 traceback.format_exc()))

        def assignment_started(_, assign_id):
            logger.debug("Assignment %s has started", assign_id)
            config["state"] = AgentState.RUNNING
            self.agent.reannounce(force=True)

        def remove_assignment(_, assign_id):
            assignment = config["current_assignments"].pop(assign_id)
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    config["jobtypes"].pop(jobtype_id, None)

        def assignment_stopped(_, assign_id):
            logger.debug("Assignment %s has stopped", assign_id)
            if (len(config["current_assignments"]) <= 1 and
                not self.agent.shutting_down):
                config["state"] = AgentState.ONLINE
                self.agent.reannounce(force=True)
            assignment = config["current_assignments"][assign_id]
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    jobtype = config["jobtypes"].pop(jobtype_id, None)
                    updates_deferred = DeferredList(
                        jobtype.task_update_deferreds)
                    updates_deferred.addBoth(remove_assignment, assign_id)
            else:
                config["current_assignments"].pop(assign_id)

        def restart_if_necessary(_):  # pragma: no cover
            if "restart_requested" in config and config["restart_requested"]:
                stopping = config["agent"].stop()
                stopping.addCallbacks(lambda _: reactor.stop(),
                                      lambda _: reactor.stop())

        def load_jobtype_failed(result, assign_id):
            logger.error(
                "Loading jobtype for assignment %s failed, removing.", assign_id)
            traceback = result.getTraceback()
            logger.debug("Got traceback")
            logger.error(traceback)
            assignment = config["current_assignments"].pop(assign_id)

            # Mark all tasks as failed on master and set an error message
            logger.debug("Marking tasks in assignment as failed")
            def post_update(post_url, post_data, task, delay=0):
                post_func = partial(post, post_url, data=post_data,
                    callback=lambda x: result_callback(
                        post_url, post_data, task, x),
                    errback=lambda x: error_callback(
                        post_url, post_data, task, x))
                reactor.callLater(delay, post_func)

            def result_callback(cburl, cbdata, task, response):
                if 500 <= response.code < 600:
                    logger.error(
                        "Error while marking task %s as failed on master, "
                        "retrying", task["id"])
                    post_update(cburl, cbdata, task, delay=http_retry_delay())

                elif response.code != OK:
                    logger.error(
                        "Could not mark task %s as failed, server response "
                        "code was %s", task["id"], response.code)

                else:
                    logger.info(
                        "Marked task %s as failed on master", task["id"])

            def error_callback(cburl, cbdata, task, failure_reason):
                logger.error(
                    "Error while marking task %s as failed, retrying",
                    task["id"], failure_reason)
                post_update(cburl, cbdata, task, delay=http_retry_delay())

            for task in assignment["tasks"]:
                url = "%s/jobs/%s/tasks/%s" % (
                    config["master_api"], assignment["job"]["id"], task["id"])
                data = {
                    "state": WorkState.FAILED,
                    "last_error": traceback}
                post_update(url, data, task)

            # If the loading was partially successful for some reason, there
            # might already be an entry for this jobtype in the config.
            # Remove it if it exists.
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    config["jobtypes"].pop(jobtype_id, None)

        def loaded_jobtype(jobtype_class, assign_id):
            # TODO: report error to master
            if hasattr(jobtype_class, "getTraceback"):
                logger.error(jobtype_class.getTraceback())
                return

            # TODO: add call to prepare_for_job
            # TODO: add call to spawn_persistent_process

            # Instance the job type and pass in the assignment data.
            instance = jobtype_class(request_data)

            if not isinstance(instance, JobType):
                raise TypeError(
                    "Expected a subclass of "
                    "pyfarm.jobtypes.core.jobtype.JobType")

            # TODO: add callback to cleanup_after_job
            # TODO: add callback to stop persistent process
            try:
                started_deferred, stopped_deferred = instance._start()
                started_deferred.addCallback(assignment_started, assign_id)
                started_deferred.addErrback(assignment_failed, assign_id)
                stopped_deferred.addCallback(assignment_stopped, assign_id)
                stopped_deferred.addErrback(assignment_failed, assign_id)
                stopped_deferred.addBoth(restart_if_necessary)
                stopped_deferred.addBoth(
                    lambda *args: instance._remove_tempdirs())
                stopped_deferred.addBoth(
                    lambda *args: instance._close_logs())
                stopped_deferred.addBoth(
                    lambda *args: instance._upload_logfile())
            except Exception as e:
                logger.error("Error on starting jobtype, stopping it now.  "
                             "Error was: %r. Traceback: %s", e,
                             traceback.format_exc())
                instance.stop(assignment_failed=True,
                              error="Error while loading jobtype: %r. "
                                    "Traceback: %s" %
                                    (e, traceback.format_exc()))
                assignment = config["current_assignments"].pop(assign_id)
                if "jobtype" in assignment:
                    jobtype_id = assignment["jobtype"].pop("id", None)
                    if jobtype_id:
                        config["jobtypes"].pop(jobtype_id, None)

        # Load the job type then pass the class along to the
        # callback.  No errback here because all the errors
        # are handled internally in this case.
        jobtype_loader = JobType.load(request_data)
        jobtype_loader.addCallback(loaded_jobtype, assignment_uuid)
        jobtype_loader.addErrback(load_jobtype_failed, assignment_uuid)

        return dumps({"id": assignment_uuid}), ACCEPTED