Esempio n. 1
0
    def error(self, request, code, message):
        """
        Log ``message`` and send it to the client as an error response.

        The body format is picked from the response types returned by
        ``self.get_accept(request)``:

        * ``text/html`` - the ``error.html`` template rendered with the
          code, its entry in ``responses`` and the message
        * ``application/json`` - a JSON document with an ``error`` key
        * anything else - a JSON error listing ``self.ALLOWED_ACCEPT``,
          sent with ``UNSUPPORTED_MEDIA_TYPE`` instead of ``code``

        The request is finished once the body has been written.
        """
        accepted = self.get_accept(request)
        logger.error(message)

        wants_html = "text/html" in accepted
        if wants_html or "application/json" in accepted:
            # Both supported formats report the caller's status code
            request.setResponseCode(code)
            if wants_html:
                page = template.load("error.html").render(
                    code=code, code_msg=responses[code], message=message)
                body = page.encode()
            else:
                body = dumps({"error": message})
        else:
            request.setResponseCode(UNSUPPORTED_MEDIA_TYPE)
            body = dumps(
                {"error":
                     "Can only handle one of %s here" % self.ALLOWED_ACCEPT})

        request.write(body)
        request.finish()
Esempio n. 2
0
    def get(self, **kwargs):
        """
        Return the JSON encoded ``versions`` payload together with ``OK``.

        When the keyword arguments carry a ``request`` for which
        ``request_from_master`` is true, the contact is recorded through
        ``config.master_contacted()`` first.
        """
        request = kwargs.get("request")
        from_master = request is not None and request_from_master(request)

        if from_master:
            config.master_contacted()

        body = dumps(versions=[1])
        return body, OK
Esempio n. 3
0
    def request(self, method, **kwargs):
        """
        Build a ``DummyRequest`` populated from the given keywords.

        :param method:
            The request method, stored upper-cased on the request.

        :keyword data:
            Optional payload.  When not None it is serialized with
            ``dumps`` and exposed as ``request.content``, rewound to the
            start.

        :keyword headers:
            Optional dictionary of headers.  String values are wrapped
            in a list before being set as raw headers.  The dictionary
            passed in is never modified.

        :keyword uri:
            The uri of the request, defaults to ``self.URI``.

        Any other keyword fails the test.
        """
        data = kwargs.pop("data", None)
        headers = kwargs.pop("headers", {})
        uri = kwargs.pop("uri", self.URI)

        request = DummyRequest(uri)
        request.method = method.upper()

        if data is not None:
            request.content = StringIO()
            request.content.write(dumps(data))
            request.content.seek(0)

        if self.DEFAULT_HEADERS is not NotImplemented:
            # Validate before merging, and merge into a copy so the
            # dictionary supplied by the caller is left untouched.
            self.failUnlessIsInstance(headers, dict)
            headers = dict(headers)
            headers.update(self.DEFAULT_HEADERS)

        if headers:
            self.failUnlessIsInstance(headers, dict)
            for key, value in headers.items():
                if isinstance(value, STRING_TYPES):
                    value = [value]

                self.failUnlessIsInstance(value, list)
                request.requestHeaders.setRawHeaders(key, value)

        self.failUnlessEqual(kwargs, {}, "Unknown keywords %s" % kwargs.keys())
        return request
Esempio n. 4
0
    def get(self, **_):
        """
        Return a JSON encoded snapshot of the agent's current status.

        The snapshot combines values read from ``config``, memory figures
        from the ``memory`` module and process counts from ``psutil``.
        ``farm_name`` is only included when it is set to a truthy value.
        """
        # Direct children first, then every descendant of this process
        this_process = psutil.Process()
        num_children = len(this_process.children(recursive=False))
        num_descendants = len(this_process.children(recursive=True))

        # Time elapsed since the master last contacted us; left as-is
        # when the stored value is not a datetime
        master_contact = config.master_contacted(update=False)
        if isinstance(master_contact, datetime):  # pragma: no cover
            master_contact = datetime.utcnow() - master_contact

        # Same treatment for the last time we announced ourselves
        announced = config.get("last_announce", None)
        if isinstance(announced, datetime):  # pragma: no cover
            announced = datetime.utcnow() - announced

        data = {
            "state": config["state"],
            "agent_hostname": config["agent_hostname"],
            "free_ram": memory.free_ram(),
            "agent_process_ram": memory.process_memory(),
            "consumed_ram": memory.total_consumption(),
            "child_processes": num_children,
            "grandchild_processes": num_descendants - num_children,
            "pids": config["pids"],
            "agent_id": config["agent_id"],
            "last_master_contact": master_contact,
            "last_announce": announced,
            "agent_lock_file": config["agent_lock_file"],
            "uptime": total_seconds(
                timedelta(seconds=time.time() - config["start"])),
            "jobs": list(config["jobtypes"].keys())}

        farm_name = config["farm_name"]
        if farm_name:
            data["farm_name"] = farm_name

        return dumps(data)
Esempio n. 5
0
def fake_work():
    """
    Command line entry point which posts fake work directly to an agent.

    The steps performed are:

        * parse the command line arguments
        * create the requested job type on the master unless it exists
        * create a new job on the master, or look up the one given
          with ``--job``
        * collect the job's tasks which are 'queued' or 'failed' and
          have no agent assigned
        * post the resulting assignment to the agent's ``/assign``
          endpoint

    The function returns early, after logging an error, when the
    requested job does not exist or no usable tasks were found.
    """
    parser = argparse.ArgumentParser(
        description="Quick and dirty script to create a job type, a job, and "
                    "some tasks which are then posted directly to the "
                    "agent.  The primary purpose of this script is to test "
                    "the internal of the job types")
    parser.add_argument(
        "--master-api", default="http://127.0.0.1/api/v1",
        help="The url to the master's api [default: %(default)s]")
    parser.add_argument(
        "--agent-api", default="http://127.0.0.1:50000/api/v1",
        help="The url to the agent's api [default: %(default)s]")
    parser.add_argument(
        "--jobtype", default="FakeRender",
        help="The job type to use [default: %(default)s]")
    parser.add_argument(
        "--job", type=int,
        help="If provided then this will be the job we pull tasks from "
             "and assign to the agent.  Please note we'll only be pulling "
             "tasks that aren't running or assigned.")
    args = parser.parse_args()
    logger.info("Master args.master_api: %s", args.master_api)
    logger.info("Agent args.agent_api: %s", args.agent_api)

    # Urls are built below by appending "/<endpoint>" so a trailing
    # slash on either api url would produce a double slash.
    assert not args.agent_api.endswith("/")
    assert not args.master_api.endswith("/")

    # session to make requests with
    session = requests.Session()
    session.headers.update({"content-type": "application/json"})

    existing_jobtype = session.get(
        args.master_api + "/jobtypes/%s" % args.jobtype)

    # Create a FakeRender job type if one does not exist
    if not existing_jobtype.ok:
        sourcecode = dedent("""
        from pyfarm.jobtypes.examples import %s as _%s
        class %s(_%s):
            pass""" % (args.jobtype, args.jobtype, args.jobtype, args.jobtype))
        response = session.post(
            args.master_api + "/jobtypes/",
            data=dumps({
                "name": args.jobtype,
                "classname": args.jobtype,
                "code": sourcecode,
                "max_batch": 1}))
        assert response.ok, response.json()
        jobtype_data = response.json()
        logger.info(
            "Created job type %r, id %r", args.jobtype, jobtype_data["id"])

    else:
        jobtype_data = existing_jobtype.json()
        logger.info(
            "Job type %r already exists, id %r",
            args.jobtype, jobtype_data["id"])

    jobtype_version = jobtype_data["version"]

    if args.job is None:
        job = session.post(
            args.master_api + "/jobs/",
            data=dumps({
                "start": 1,
                "end": 1,
                "title": "Fake Job - %s" % int(time.time()),
                "jobtype": args.jobtype}))
        assert job.ok, job.json()
        job = job.json()
        logger.info("Job %r created", job["id"])
    else:
        job = session.get(args.master_api + "/jobs/%s" % args.job)
        if not job.ok:
            logger.error("No such job with id %r", args.job)
            return
        else:
            job = job.json()
            logger.info("Job %r exists", job["id"])

    tasks = session.get(args.master_api + "/jobs/%s/tasks/" % job["id"])
    assert tasks.ok

    job_tasks = []
    for task in tasks.json():
        if task["state"] not in ("queued", "failed"):
            logger.info(
                "Can't use task %s, it's state is not 'queued' or 'failed'",
                task["id"])
            continue

        if task["agent_id"] is not None:
            logger.info(
                "Can't use task %s, it already has an agent assigned",
                task["id"])
            # Skip the task, otherwise it would be sent to the agent
            # even though it was just reported as unusable.
            continue

        job_tasks.append({"id": task["id"], "frame": task["frame"]})

    if not job_tasks:
        logger.error("Could not find any tasks to send for job %s", job["id"])
        return

    logger.info(
        "Found %s tasks from job %s to assign to %r",
        len(job_tasks), job["id"], args.agent_api)

    assignment_data = {
        "job": {
            "id": job["id"],
            "by": job["by"],
            "ram": job["ram"],
            "ram_warning": job["ram_warning"],
            "ram_max": job["ram_max"],
            "cpus": job["cpus"],
            "batch": job["batch"],
            "user": job["user"],
            "data": job["data"],
            "environ": job["environ"],
            "title": job["title"]},
        "jobtype": {
            "name": args.jobtype,
            "version": jobtype_version},
        "tasks": job_tasks}

    # Drop any keys which don't have values since this
    # would break the schema validation in the agent.  The keys are
    # copied into a list first because the dictionary is modified
    # while looping.
    for key in list(assignment_data["job"]):
        if assignment_data["job"][key] is None:
            del assignment_data["job"][key]

    response = session.post(
        args.agent_api + "/assign",
        data=dumps(assignment_data))
    assert response.ok, response.json()
    logger.info("Tasks posted to agent")
Esempio n. 6
0
 def test_dumps_uuid(self):
     """A UUID value is dumped the same way as its string form."""
     value = uuid4()
     actual = dumps({"uuid": value})
     expected = dumps({"uuid": str(value)})
     self.assertEqual(actual, expected)
Esempio n. 7
0
 def test_dumps_decimal(self):
     """A Decimal is dumped as ``dumps_`` does with the default encoder."""
     config["agent_pretty_json"] = False
     payload = {"decimal": Decimal("1.2")}
     actual = dumps(payload)
     expected = dumps_(payload, default=default_json_encoder)
     self.assertEqual(actual, expected)
Esempio n. 8
0
 def test_dumps_datetime(self):
     """A datetime is dumped as ``dumps_`` does with the default encoder."""
     config["agent_pretty_json"] = False
     payload = {"datetime": datetime.utcnow()}
     actual = dumps(payload)
     expected = dumps_(payload, default=default_json_encoder)
     self.assertEqual(actual, expected)
Esempio n. 9
0
 def test_dumps_single_argument(self):
     """A single non-dictionary argument is dumped like ``dumps_`` would."""
     config["agent_pretty_json"] = False
     # ``keys()`` is only indexable where it returns a list, so build
     # the list explicitly before taking the first key.
     data = list(self.data.keys())[0]
     self.assertEqual(dumps(data), dumps_(data))
Esempio n. 10
0
 def test_dumps_not_pretty(self):
     """With pretty printing off the output matches plain ``dumps_``."""
     config["agent_pretty_json"] = False
     actual = dumps(self.data)
     expected = dumps_(self.data)
     self.assertEqual(actual, expected)
Esempio n. 11
0
 def test_dumps_pretty(self):
     """With pretty printing on the output matches ``dumps_(indent=2)``."""
     config["agent_pretty_json"] = True
     actual = dumps(self.data)
     expected = dumps_(self.data, indent=2)
     self.assertEqual(actual, expected)
Esempio n. 12
0
    def post(self, **kwargs):
        """
        Accept or reject an assignment posted to this agent.

        :keyword request:
            The incoming request.  When ``request_from_master`` is true
            for it the contact with the master is recorded.

        :keyword data:
            The decoded assignment.  This method reads the ``job`` and
            ``tasks`` keys and, if present, ``agent_id``.

        :returns:
            A tuple of JSON encoded body and response code.  The
            assignment is rejected with an ``error`` body when:

                * ``agent_id`` does not match ours (BAD_REQUEST)
                * a reannounce is in progress, the agent is shutting
                  down, a restart was requested or ``agent_id`` is not
                  set in the config yet (SERVICE_UNAVAILABLE)
                * the job requires more ram or cpus than are available
                  (BAD_REQUEST)
                * the tasks partially overlap with an existing
                  assignment, or the agent does not allow sharing and
                  still has unfinished tasks (CONFLICT)

            Otherwise ACCEPTED is returned with the id of the
            assignment.  A repeat of an assignment we already hold
            returns the id of the existing assignment.

        An accepted assignment is stored in
        ``config["current_assignments"]`` and the job type is loaded
        asynchronously; the callbacks defined below react to the job
        type loading, starting, stopping or failing.
        """
        request = kwargs["request"]
        if request_from_master(request):
            config.master_contacted()

        request_data = kwargs["data"]

        # First, get the resources we have *right now*.  In some cases
        # this means using the functions in pyfarm.core.sysinfo because
        # entries in `config` could be slightly out of sync with the system.
        memory_free = free_ram()
        cpus = config["agent_cpus"]
        requires_ram = request_data["job"].get("ram")
        requires_cpus = request_data["job"].get("cpus")

        if ("agent_id" in request_data and
            request_data["agent_id"] != config["agent_id"]):
            logger.error("Wrong agent_id in assignment: %s. Our id is %s",
                         request_data["agent_id"], config["agent_id"])
            return (
                dumps({"error": "You have the wrong agent. "
                                "I am %s." % config["agent_id"],
                       "agent_id": config["agent_id"]}),
                BAD_REQUEST
            )

        elif self.agent.reannounce_lock.locked:
            logger.warning("Temporarily rejecting assignment because we "
                           "are in the middle of a reannounce.")
            return (
                dumps({"error": "Agent cannot accept assignments because of a "
                                "reannounce in progress. Try again shortly."}),
                SERVICE_UNAVAILABLE
            )

        elif self.agent.shutting_down:
            logger.error("Rejecting assignment because the agent is in the "
                         "process of shutting down.")
            return (
                dumps({"error": "Agent cannot accept assignments because it is "
                                "shutting down."}),
                SERVICE_UNAVAILABLE
            )

        elif "restart_requested" in config \
                and config["restart_requested"] is True:
            logger.error("Rejecting assignment because of scheduled restart.")
            return (
                dumps({"error": "Agent cannot accept assignments because of a "
                                "pending restart."}),
                SERVICE_UNAVAILABLE
            )

        elif "agent_id" not in config:
            logger.error(
                "Agent has not yet connected to the master or `agent_id` "
                "has not been set yet.")
            return (
                dumps({"error": "agent_id has not been set in the config"}),
                SERVICE_UNAVAILABLE
            )

        # Do we have enough ram?
        elif requires_ram is not None and requires_ram > memory_free:
            logger.error(
                "Task %s requires %sMB of ram, this agent has %sMB free.  "
                "Rejecting Task %s.",
                request_data["job"]["id"], requires_ram, memory_free,
                request_data["job"]["id"])
            config["free_ram"] = memory_free
            return (
                dumps({"error": "Not enough ram",
                       "agent_ram": memory_free,
                       "requires_ram": requires_ram}),
                BAD_REQUEST
            )

        # Do we have enough cpus (count wise)?
        elif requires_cpus is not None and requires_cpus > cpus:
            logger.error(
                "Task %s requires %s CPUs, this agent has %s CPUs.  "
                "Rejecting Task %s.",
                request_data["job"]["id"], requires_cpus, cpus,
                request_data["job"]["id"])
            return (
                dumps({"error": "Not enough cpus",
                       "agent_cpus": cpus,
                       "requires_cpus": requires_cpus}),
                BAD_REQUEST
            )

        new_task_ids = set(task["id"] for task in request_data["tasks"])

        for assignment in config["current_assignments"].itervalues():
            existing_task_ids = set(x["id"] for x in assignment["tasks"])

            # If the assignment is identical to one we already have
            if existing_task_ids == new_task_ids:
                logger.debug(
                    "Ignoring repeated assignment of the same batch")
                return dumps({"id": assignment["id"]}), ACCEPTED

            # If there is only a partial overlap
            elif existing_task_ids & new_task_ids:
                logger.error("Rejecting assignment with partial overlap with "
                             "existing assignment.")
                unknown_task_ids = new_task_ids - existing_task_ids
                return (
                    dumps({"error": "Partial overlap of tasks",
                           "rejected_task_ids": list(unknown_task_ids)}),
                    CONFLICT
                )

        if not config["agent_allow_sharing"]:
            for jobtype in config["jobtypes"].itervalues():
                num_finished_tasks = (len(jobtype.finished_tasks) +
                                      len(jobtype.failed_tasks))
                if len(jobtype.assignment["tasks"]) > num_finished_tasks:
                    logger.error("Rejecting an assignment that would require "
                                 "agent sharing")
                    return (
                        dumps({
                            "error": "Agent does not allow multiple "
                                     "assignments",
                            "rejected_task_ids": list(new_task_ids)}),
                        CONFLICT
                    )

        assignment_uuid = uuid4()
        request_data.update(id=assignment_uuid)
        config["current_assignments"][assignment_uuid] = request_data
        logger.debug("Accepted assignment %s: %r",
                     assignment_uuid, request_data)
        logger.info("Accept assignment from job %s with %s tasks",
                    request_data["job"]["title"], len(request_data["tasks"]))

        def assignment_failed(result, assign_id):
            # Errback for the started/stopped deferreds of a job type.
            logger.error(
                "Assignment %s failed, result: %r, removing.", assign_id, result)
            failure_traceback = result.getTraceback()
            logger.error(failure_traceback)
            if (len(config["current_assignments"]) <= 1 and
                not self.agent.shutting_down):
                config["state"] = AgentState.ONLINE
                self.agent.reannounce(force=True)
            # Do not mark the assignment as failed if the reason for failing
            # was that we ran out of disk space
            failed = not isinstance(result.value, InsufficientSpaceError)
            assignment = config["current_assignments"].pop(assign_id)
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    instance = config["jobtypes"].pop(jobtype_id, None)
                    # The job type may not be registered (anymore), in
                    # which case there is nothing left to stop.
                    if instance is not None:
                        # Report the traceback of the failure itself.
                        # No exception is being handled at this point
                        # so traceback.format_exc() has nothing to show.
                        instance.stop(
                            assignment_failed=failed,
                            avoid_reassignment=not failed,
                            error="Error in jobtype: %r. "
                                  "Traceback: %s" % (result,
                                                     failure_traceback))

        def assignment_started(_, assign_id):
            logger.debug("Assignment %s has started", assign_id)
            config["state"] = AgentState.RUNNING
            self.agent.reannounce(force=True)

        def remove_assignment(_, assign_id):
            # Forget the assignment and the job type instance tied to it.
            assignment = config["current_assignments"].pop(assign_id)
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    config["jobtypes"].pop(jobtype_id, None)

        def assignment_stopped(_, assign_id):
            logger.debug("Assignment %s has stopped", assign_id)
            if (len(config["current_assignments"]) <= 1 and
                not self.agent.shutting_down):
                config["state"] = AgentState.ONLINE
                self.agent.reannounce(force=True)
            assignment = config["current_assignments"][assign_id]
            jobtype = None
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    jobtype = config["jobtypes"].pop(jobtype_id, None)

            if jobtype is not None:
                # Keep the assignment around until the pending task
                # updates of the job type have completed.
                updates_deferred = DeferredList(
                    jobtype.task_update_deferreds)
                updates_deferred.addBoth(remove_assignment, assign_id)
            else:
                # No job type instance to wait for, remove the
                # assignment right away so it does not linger.
                config["current_assignments"].pop(assign_id)

        def restart_if_necessary(_):  # pragma: no cover
            if "restart_requested" in config and config["restart_requested"]:
                stopping = config["agent"].stop()
                stopping.addCallbacks(lambda _: reactor.stop(),
                                      lambda _: reactor.stop())

        def load_jobtype_failed(result, assign_id):
            logger.error(
                "Loading jobtype for assignment %s failed, removing.", assign_id)
            failure_traceback = result.getTraceback()
            logger.debug("Got traceback")
            logger.error(failure_traceback)
            assignment = config["current_assignments"].pop(assign_id)

            # Mark all tasks as failed on master and set an error message
            logger.debug("Marking tasks in assignment as failed")
            def post_update(post_url, post_data, task, delay=0):
                post_func = partial(post, post_url, data=post_data,
                    callback=lambda x: result_callback(
                        post_url, post_data, task, x),
                    errback=lambda x: error_callback(
                        post_url, post_data, task, x))
                reactor.callLater(delay, post_func)

            def result_callback(cburl, cbdata, task, response):
                if 500 <= response.code < 600:
                    logger.error(
                        "Error while marking task %s as failed on master, "
                        "retrying", task["id"])
                    post_update(cburl, cbdata, task, delay=http_retry_delay())

                elif response.code != OK:
                    logger.error(
                        "Could not mark task %s as failed, server response "
                        "code was %s", task["id"], response.code)

                else:
                    logger.info(
                        "Marked task %s as failed on master", task["id"])

            def error_callback(cburl, cbdata, task, failure_reason):
                # One placeholder per argument, the reason used to be
                # passed without a placeholder in the message.
                logger.error(
                    "Error while marking task %s as failed, retrying: %r",
                    task["id"], failure_reason)
                post_update(cburl, cbdata, task, delay=http_retry_delay())

            for task in assignment["tasks"]:
                url = "%s/jobs/%s/tasks/%s" % (
                    config["master_api"], assignment["job"]["id"], task["id"])
                data = {
                    "state": WorkState.FAILED,
                    "last_error": failure_traceback}
                post_update(url, data, task)

            # If the loading was partially successful for some reason, there
            # might already be an entry for this jobtype in the config.
            # Remove it if it exists.
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    config["jobtypes"].pop(jobtype_id, None)

        def loaded_jobtype(jobtype_class, assign_id):
            # TODO: report error to master
            if hasattr(jobtype_class, "getTraceback"):
                logger.error(jobtype_class.getTraceback())
                return

            # TODO: add call to prepare_for_job
            # TODO: add call to spawn_persistent_process

            # Instance the job type and pass in the assignment data.
            instance = jobtype_class(request_data)

            if not isinstance(instance, JobType):
                raise TypeError(
                    "Expected a subclass of "
                    "pyfarm.jobtypes.core.jobtype.JobType")

            # TODO: add callback to cleanup_after_job
            # TODO: add callback to stop persistent process
            try:
                started_deferred, stopped_deferred = instance._start()
                started_deferred.addCallback(assignment_started, assign_id)
                started_deferred.addErrback(assignment_failed, assign_id)
                stopped_deferred.addCallback(assignment_stopped, assign_id)
                stopped_deferred.addErrback(assignment_failed, assign_id)
                stopped_deferred.addBoth(restart_if_necessary)
                stopped_deferred.addBoth(
                    lambda *args: instance._remove_tempdirs())
                stopped_deferred.addBoth(
                    lambda *args: instance._close_logs())
                stopped_deferred.addBoth(
                    lambda *args: instance._upload_logfile())
            except Exception as e:
                logger.error("Error on starting jobtype, stopping it now.  "
                             "Error was: %r. Traceback: %s", e,
                             traceback.format_exc())
                instance.stop(assignment_failed=True,
                              error="Error while loading jobtype: %r. "
                                    "Traceback: %s" %
                                    (e, traceback.format_exc()))
                assignment = config["current_assignments"].pop(assign_id)
                if "jobtype" in assignment:
                    jobtype_id = assignment["jobtype"].pop("id", None)
                    if jobtype_id:
                        config["jobtypes"].pop(jobtype_id, None)

        # Load the job type then pass the class along to the
        # callback.  Failures while loading are routed to
        # load_jobtype_failed.
        jobtype_loader = JobType.load(request_data)
        jobtype_loader.addCallback(loaded_jobtype, assignment_uuid)
        jobtype_loader.addErrback(load_jobtype_failed, assignment_uuid)

        return dumps({"id": assignment_uuid}), ACCEPTED