# Example #1
# 0
class BuildDispatcher(multiprocessing.Process):
    """
    Dispatch build jobs to VM-backed worker processes.

    1) Fetch build tasks from frontend
    2) Loop through them and try to allocate VM for each
       - If VM can be allocated, spawn a worker and run it asynchronously
       - otherwise, check the next build task
    3) Go to 1
    """
    def __init__(self, opts):
        """
        :param opts: backend configuration object; provides (at least)
            ``build_groups``, ``frontend_base_url`` and ``sleeptime``.
        """
        multiprocessing.Process.__init__(self, name="build-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.build_dispatcher",
                                    "build_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
        self.vm_manager = VmManager(self.opts)
        # Worker processes we spawned and have not yet joined
        self.workers = []
        self.next_worker_id = 1

        # arch name => list of build-group IDs that can build that arch
        self.arch_to_groups = defaultdict(list)
        # PC => max N builders per user
        self.group_to_usermax = dict()
        # task IDs seen in the previous pending-jobs response; used to
        # log only the newly appearing jobs
        self.job_ids_previous_request = set()
        self.init_internal_structures()

    def get_vm_group_ids(self, arch):
        """
        Return the list of VM group IDs able to build ``arch``; with a
        falsy ``arch`` return all configured group IDs.

        :raises DispatchBuildError: when the architecture is unknown.
        """
        if not arch:
            return [group["id"] for group in self.opts.build_groups]
        # NOTE: arch_to_groups is a defaultdict(list), so a bare
        # subscript inside try/except KeyError could never raise -- an
        # unknown arch silently produced (and cached) an empty list.
        # Check membership explicitly so unknown arches are reported.
        if arch not in self.arch_to_groups:
            raise DispatchBuildError("Unknown architecture {0}".format(arch))
        return self.arch_to_groups[arch]

    def update_process_title(self, msg=None):
        """Expose dispatcher state in the process listing (ps/top)."""
        proc_title = "Build dispatcher"
        if msg:
            proc_title += " - " + msg
        setproctitle(proc_title)

    def init_internal_structures(self):
        """Build arch->groups and group->user-limit maps from config."""
        for group in self.opts.build_groups:
            group_id = group["id"]

            for arch in group["archs"]:
                self.arch_to_groups[arch].append(group_id)
                self.log.debug("mapping %s to %s group", arch, group_id)

            self.log.debug("user might use only %s VMs for %s group",
                           group["max_vm_per_user"], group_id)
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

    def load_jobs(self):
        """
        Retrieve the list of pending build jobs from frontend, blocking
        (and retrying every ``sleeptime`` seconds) until at least one
        job is available.

        :return: list of BuildJob instances
        """
        self.log.info("Waiting for a job from frontend...")
        get_task_init_time = time.time()
        tasks = None

        while not tasks:
            self.update_process_title(
                "Waiting for jobs from frontend for {} s".format(
                    int(time.time() - get_task_init_time)))
            try:
                tasks = self.frontend_client.get('pending-jobs').json()
            except (FrontendClientException, ValueError) as error:
                self.log.exception(
                    "Retrieving build jobs from %s failed with error: %s",
                    self.opts.frontend_base_url, error)
            finally:
                if not tasks:
                    time.sleep(self.opts.sleeptime)

        # Log only jobs we have not seen in the previous request
        job_ids = {task.get("task_id") for task in tasks if task}
        new_ids = job_ids - self.job_ids_previous_request
        if new_ids:
            self.log.info("Got new build jobs: %s", new_ids)
        self.job_ids_previous_request = job_ids

        return [BuildJob(task, self.opts) for task in tasks if task]

    def can_build_start(self, job):
        """
        Announce to the frontend that the build is starting. Frontend
        may reject build to start.

        Returns
        -------
        True if the build can start
        False if the build can not start (build is cancelled)
        """
        try:
            job.started_on = time.time()
            job.status = BuildStatus.STARTING
            can_build_start = self.frontend_client.starting_build(
                job.to_dict())
        except (FrontendClientException, ValueError) as error:
            self.log.exception(
                "Communication with Frontend to confirm build start failed with error: %s",
                error)
            return False

        if not can_build_start:
            # Not an exception context; log.exception here would emit a
            # bogus "NoneType: None" traceback.
            self.log.error("Frontend forbade to start the job %s",
                           job.task_id)

        return can_build_start

    def clean_finished_workers(self):
        """Join and forget workers whose process already finished."""
        # Iterate over a copy: removing from the list being iterated
        # would skip the element following each removed worker.
        for worker in list(self.workers):
            if not worker.is_alive():
                worker.join(5)
                self.workers.remove(worker)
                self.log.info("Removed finished worker %s for job %s",
                              worker.worker_id, worker.job.task_id)

    def start_worker(self, vm, job, reattach=False):
        """
        Spawn (and register) a Worker process handling ``job`` on ``vm``.

        :param reattach: True when re-attaching to a build already
            running on the VM (after backend restart).
        :return: the started Worker
        """
        worker = Worker(opts=self.opts,
                        vm_manager=self.vm_manager,
                        worker_id=self.next_worker_id,
                        vm=vm,
                        job=job,
                        reattach=reattach)
        self.workers.append(worker)
        # Wrap around to keep worker IDs bounded
        self.next_worker_id = (self.next_worker_id + 1) % 2**15

        worker.start()
        return worker

    def run(self):
        """
        Executes build dispatching process.
        """
        self.log.info("Build dispatching started.")
        self.update_process_title()

        first_backend_loop = True

        while True:
            self.clean_finished_workers()

            # owner/arch/sandbox combinations we already failed to get a
            # VM for in this pass; no point retrying them immediately
            skip_jobs_cache = {}

            for job in self.load_jobs():
                # first check if we do not have
                # worker already running for the job
                if any(job.task_id == w.job.task_id for w in self.workers):
                    self.log.debug("Skipping already running task '%s'",
                                   job.task_id)
                    continue

                if first_backend_loop:
                    # Server was restarted.  Some builds might be running on
                    # background on builders;  so search db builder records for
                    # the job and if we found it, spawn a worker to reattach.
                    vm = self.vm_manager.get_vm_by_task_id(job.task_id)
                    if vm and vm.state == 'in_use':
                        self.log.info("Reattaching to VM: %s", vm)
                        worker = self.start_worker(vm, job, reattach=True)
                        vm.store_field(self.vm_manager.rc, "used_by_worker",
                                       worker.worker_id)
                        self.log.info("Reattached new worker %s for job %s",
                                      worker.worker_id, worker.job.task_id)
                        continue

                cache_entry = '{owner}-{arch}-{sandbox}'.format(
                    owner=job.project_owner,
                    arch=job.arch or "noarch",
                    sandbox=job.sandbox,
                )

                if cache_entry in skip_jobs_cache:
                    self.log.debug("Skipped job %s, cached", job)
                    continue

                # ... and if the task is new to us,
                # allocate new vm and run full build
                try:
                    vm_group_ids = self.get_vm_group_ids(job.arch)
                    self.log.debug("Picking VM from groups %s for job %s",
                                   vm_group_ids, job)
                    vm = self.vm_manager.acquire_vm(vm_group_ids,
                                                    job.project_owner,
                                                    job.sandbox,
                                                    self.next_worker_id,
                                                    job.task_id, job.build_id,
                                                    job.chroot)
                except NoVmAvailable as error:
                    skip_jobs_cache[cache_entry] = True
                    self.log.debug(
                        "No available resources for task %s (Reason: %s). Deferring job.",
                        job.task_id, error)
                    continue
                else:
                    self.log.info("VM %s for job %s successfully acquired",
                                  vm.vm_name, job.task_id)

                if not self.can_build_start(job):
                    # Frontend rejected (e.g. cancelled build); give the
                    # VM back instead of leaking it.
                    self.vm_manager.release_vm(vm.vm_name)
                    continue

                worker = self.start_worker(vm, job)
                self.log.info("Started new worker %s for job %s",
                              worker.worker_id, worker.job.task_id)

            first_backend_loop = False
            time.sleep(self.opts.sleeptime)
# Example #2
# 0
class ActionDispatcher(multiprocessing.Process):
    """
    1) Fetch action task from frontend
    2) Run it synchronously
    3) Go to 1)
    """

    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="action-dispatcher")
        self.opts = opts
        self.log = get_redis_logger(
            self.opts, "backend.action_dispatcher", "action_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)

    def update_process_title(self, msg=None):
        # Expose the dispatcher state in the process listing (ps/top).
        title = "Action dispatcher"
        if msg:
            title = title + " - " + msg
        setproctitle(title)

    def get_frontend_actions(self):
        """
        Get unfiltered list of actions from frontend, both running and pending.
        """
        try:
            response = self.frontend_client.get('pending-actions')
            action_dicts = response.json()
        except (FrontendClientException, ValueError) as error:
            self.log.exception(
                "Retrieving an action tasks failed with error: %s",
                error)
            return []

        queue_tasks = []
        for action in action_dicts:
            queue_tasks.append(ActionQueueTask(action['id']))
        return queue_tasks

    def run(self):
        """
        Executes action dispatching process.
        """
        self.log.info("Action dispatching started.")
        self.update_process_title()

        worker_manager = ActionWorkerManager(
            redis_connection=get_redis_connection(self.opts),
            log=self.log,
            max_workers=self.opts.actions_max_workers)
        worker_manager.frontend_client = FrontendClient(self.opts, self.log)

        period = self.opts.sleeptime

        while True:
            self.log.info("getting actions from frontend")
            loop_start = time.time()
            for queue_task in self.get_frontend_actions():
                worker_manager.add_task(queue_task)

            # Execute the actions.
            worker_manager.run(timeout=period)

            # Sleep away whatever is left of the polling period.
            remaining = period - (time.time() - loop_start)
            if remaining > 0:
                time.sleep(remaining)