def __init__(self, opts, cmdline_opts=None):
    """Store the options and derive the pruning settings.

    :param opts: backend configuration; ``prune_days`` is read from it when
        present, otherwise ``DEF_DAYS`` is used
    :param cmdline_opts: optional command-line options; when truthy, its
        ``no_mtime_optimization`` flag decides whether the mtime
        optimization is enabled
    """
    self.opts = opts
    self.chroots = {}
    self.prune_days = getattr(self.opts, "prune_days", DEF_DAYS)
    self.frontend_client = FrontendClient(self.opts)

    # The optimization is on by default; only the command line can disable it.
    if cmdline_opts:
        self.mtime_optimization = not cmdline_opts.no_mtime_optimization
    else:
        self.mtime_optimization = True
def __init__(self, config_file=None, ext_opts=None):
    """Read the configuration and create the logger and frontend client.

    :param config_file: path to the backend configuration file (required)
    :param ext_opts: additional options handed to the config reader
    :raises CoprBackendError: when ``config_file`` is empty or missing
    """
    if not config_file:
        raise CoprBackendError("Must specify config_file")

    # ext_opts is kept to show our cli options for read_conf()
    self.config_file, self.ext_opts = config_file, ext_opts
    self.config_reader = BackendConfigReader(self.config_file, self.ext_opts)

    # update_conf() replaces this placeholder with the parsed configuration.
    self.opts = None
    self.update_conf()

    logger = get_redis_logger(self.opts, "backend.main", "backend")
    self.log = logger
    self.frontend_client = FrontendClient(self.opts, logger)
def setup_method(self, method):
    """Build a ``FrontendClient`` on stub options and set the shared sample values."""
    frontend_settings = {
        "frontend_base_url": "http://example.com/",
        "frontend_auth": "12345678",
    }
    self.opts = Munch(**frontend_settings)
    self.fc = FrontendClient(self.opts)

    # Sample payload and identifiers reused by the individual tests.
    self.data = {"foo": "bar", "bar": [1, 3, 5]}
    self.url_path = "sub_path"
    self.build_id = 12345
    self.chroot_name = "fedora-20-x86_64"
def __init__(self, opts):
    """Initialise the "build-dispatcher" process and its collaborators.

    :param opts: backend configuration shared with the logger, the frontend
        client and the VM manager
    """
    multiprocessing.Process.__init__(self, name="build-dispatcher")

    self.opts = opts
    logger = get_redis_logger(self.opts, "backend.build_dispatcher", "build_dispatcher")
    self.log = logger
    self.frontend_client = FrontendClient(self.opts, logger)
    self.vm_manager = VmManager(self.opts)

    # arch (e.g. x86_64, i386) => group (e.g. PC)
    self.arch_to_group = {}
    # group (e.g. PC) => max N builders per user
    self.group_to_usermax = {}

    self.init_internal_structures()
def __init__(self, opts):
    """Initialise the "build-dispatcher" process, its clients and worker bookkeeping.

    :param opts: backend configuration shared with the logger, the frontend
        client and the VM manager
    """
    multiprocessing.Process.__init__(self, name="build-dispatcher")

    self.opts = opts
    logger = get_redis_logger(self.opts, "backend.build_dispatcher", "build_dispatcher")
    self.log = logger
    self.frontend_client = FrontendClient(self.opts, logger)
    self.vm_manager = VmManager(self.opts)

    # Worker bookkeeping: no workers yet, ids are handed out starting at 1.
    self.next_worker_id = 1
    self.workers = []

    self.arch_to_groups = defaultdict(list)
    # group (e.g. PC) => max N builders per user
    self.group_to_usermax = {}

    self.init_internal_structures()
def run(self):
    """
    Executes action dispatching process.

    Loops forever: every round adds the actions reported by
    ``get_frontend_actions()`` to the worker manager, lets the manager run
    with ``opts.sleeptime`` as timeout and then sleeps for whatever is left
    of that period.
    """
    self.log.info("Action dispatching started.")
    self.update_process_title()

    manager = ActionWorkerManager(
        redis_connection=get_redis_connection(self.opts),
        log=self.log,
        max_workers=self.opts.actions_max_workers)
    manager.frontend_client = FrontendClient(self.opts, self.log)

    period = self.opts.sleeptime
    while True:
        self.log.info("getting actions from frontend")
        round_started = time.time()

        for task in self.get_frontend_actions():
            manager.add_task(task)

        # Execute the actions.
        manager.run(timeout=period)

        # Top the round up to a full period if it finished early.
        remaining = period - (time.time() - round_started)
        if remaining > 0:
            time.sleep(remaining)
def __init__(self, opts):
    """Initialise the "action-dispatcher" process with its logger and frontend client.

    :param opts: backend configuration shared with the logger and the client
    """
    multiprocessing.Process.__init__(self, name="action-dispatcher")

    self.opts = opts
    logger = get_redis_logger(self.opts, "backend.action_dispatcher", "action_dispatcher")
    self.log = logger
    self.frontend_client = FrontendClient(self.opts, logger)
class CoprBackend(object):
    """
    Core process - starts/stops dispatchers for actions and builds

    :param config_file: path to the backend configuration file
    :param ext_opts: additional options for backend
    """

    def __init__(self, config_file=None, ext_opts=None):
        """
        Read the configuration and create the logger and frontend client.

        :raises CoprBackendError: when ``config_file`` is empty or missing
        """
        if not config_file:
            raise CoprBackendError("Must specify config_file")

        # ext_opts is kept to show our cli options for read_conf()
        self.config_file, self.ext_opts = config_file, ext_opts
        self.config_reader = BackendConfigReader(self.config_file, self.ext_opts)

        # update_conf() replaces this placeholder with the parsed configuration.
        self.opts = None
        self.update_conf()

        logger = get_redis_logger(self.opts, "backend.main", "backend")
        self.log = logger
        self.frontend_client = FrontendClient(self.opts, logger)

    def update_conf(self):
        """
        Update backend config from config file
        """
        fresh_opts = self.config_reader.read()
        self.opts = fresh_opts

    def run(self):
        """
        Starts backend process. Control sub process start/stop.

        :raises CoprBackendError: when rescheduling the unfinished builds
            fails with a ``RequestException``
        """
        self.update_conf()
        self.log.info("Initial config: {}".format(self.opts))

        try:
            self.log.info("Rescheduling old unfinished builds")
            # NOTE(review): the original comment here said "10 minutes" for
            # the argument 120 - confirm what unit the client expects.
            self.frontend_client.reschedule_all_running(120)
        except RequestException as err:
            self.log.exception(err)
            raise CoprBackendError(err)

        # Create both dispatchers first, then start them in the same order.
        dispatchers = [BuildDispatcher(self.opts), ActionDispatcher(self.opts)]
        for dispatcher in dispatchers:
            dispatcher.start()
def __init__(self, opts):
    """Create empty routing tables, unset pubsub handles and the service clients.

    :param opts: backend configuration shared with the logger, the control
        channel and the frontend client
    """
    self.opts = opts

    # These start out unset.
    self.rc = self.channel = self.ps_thread = None

    # arch (e.g. x86_64, i386) => group id (e.g. PC)
    self.arch_to_group_id_map = {}
    # group id (e.g. PC) => max N builders per user
    self.group_to_usermax = {}
    # task_id -> task dict
    self.added_jobs_dict = {}

    logger = get_redis_logger(self.opts, "backend.job_grab", "job_grab")
    self.log = logger
    self.jg_control = jobgrabcontrol.Channel(self.opts, logger)
    self.frontend_client = FrontendClient(self.opts, logger)
class CoprBackend(object):
    """
    COPR backend head process.

    :param config_file: path to the backend configuration file
    :param ext_opts: additional options for backend
    """

    def __init__(self, config_file=None, ext_opts=None):
        """
        Read the configuration and create the logger and frontend client.

        :raises CoprBackendError: when ``config_file`` is empty or missing
        """
        if not config_file:
            raise CoprBackendError("Must specify config_file")

        # ext_opts is kept to show our cli options for read_conf()
        self.config_file, self.ext_opts = config_file, ext_opts
        self.config_reader = BackendConfigReader(self.config_file, self.ext_opts)

        # update_conf() replaces this placeholder with the parsed configuration.
        self.opts = None
        self.update_conf()

        logger = get_redis_logger(self.opts, "backend.main", "backend")
        self.log = logger
        self.frontend_client = FrontendClient(self.opts, logger)

    def update_conf(self):
        """
        Update backend config from config file
        """
        fresh_opts = self.config_reader.read()
        self.opts = fresh_opts

    def run(self):
        """
        Starts backend process. Control sub process start/stop.

        :raises CoprBackendError: when rescheduling the unfinished builds
            fails with a ``FrontendClientException``
        """
        self.update_conf()
        self.log.info("Initial config: %s", self.opts)

        client = self.frontend_client
        try:
            self.log.info("Rescheduling old unfinished builds")
            client.reschedule_all_running()
        except FrontendClientException as err:
            self.log.exception(err)
            raise CoprBackendError(err)
def __init__(self, config_file=None, ext_opts=None):
    """Read the config file into ``self.opts`` and set up empty worker state.

    :param config_file: path to the backend configuration file (required)
    :param ext_opts: additional options handed to the config reader
    :raises CoprBackendError: when ``config_file`` is empty or missing
    """
    if not config_file:
        raise CoprBackendError("Must specify config_file")

    # ext_opts is kept to stow our cli options for read_conf()
    self.config_file, self.ext_opts = config_file, ext_opts

    # Per-group bookkeeping, keyed by group id.
    self.workers_by_group_id = defaultdict(list)
    self.max_worker_num_by_group_id = defaultdict(int)

    # All config items end up in a single self.opts object.
    self.config_reader = BackendConfigReader(self.config_file, self.ext_opts)
    self.opts = None
    self.update_conf()

    self.task_queues = {}
    self.frontend_client = FrontendClient(self.opts)
    self.is_running = False
    self.log = get_redis_logger(self.opts, "backend.main", "backend")
class CoprJobGrab(object):
    """
    Fetch jobs from the Frontend

        - submit build task to the jobs queue for workers
        - run Action handler for action tasks

    :param Munch opts: backend config

    TODO: Not yet fully ready for config reload.
    """

    def __init__(self, opts):
        """ base class initialization """
        self.opts = opts

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group_id_map = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()
        # task_id -> task dict
        self.added_jobs_dict = dict()

        self.rc = None
        self.channel = None
        self.ps_thread = None

        self.log = get_redis_logger(self.opts, "backend.job_grab", "job_grab")
        self.jg_control = jobgrabcontrol.Channel(self.opts, self.log)
        self.frontend_client = FrontendClient(self.opts, self.log)

    def group(self, arch):
        """
        Return the group id configured for ``arch``.

        :raises CoprJobGrabError: when the architecture is not mapped
        """
        try:
            return self.arch_to_group_id_map[arch]
        except KeyError:
            raise CoprJobGrabError("Unknown architecture {0}".format(arch))

    def listen_to_pubsub(self):
        """
        Listens for job reschedule queries. Spawns self.ps_thread, don't forget to stop it.
        """
        self.rc = get_redis_connection(self.opts)
        self.channel = self.rc.pubsub(ignore_subscribe_messages=True)
        self.channel.subscribe(**{JOB_GRAB_TASK_END_PUBSUB: self.on_pubsub_event})
        self.ps_thread = self.channel.run_in_thread(sleep_time=0.05)
        self.log.info("Subscribed to {} channel".format(JOB_GRAB_TASK_END_PUBSUB))

    def route_build_task(self, task):
        """
        Route build task to the appropriate queue.

        :param task: dict-like object which represent build task

        Utilized **task** keys:

            - ``task_id``
            - ``chroot`` (the architecture is its third ``-`` separated field)
            - ``project_owner``

        :return int: Count of the successfully routed tasks
        :raises CoprJobGrabError: when the task architecture is not mapped
        """
        if "task_id" not in task:
            self.log.info("Task missing field `task_id`, raw task: {}".format(task))
            return 0

        task_id = task["task_id"]
        if task_id in self.added_jobs_dict:
            # Already enqueued earlier, nothing to do.
            return 0

        arch = task["chroot"].split("-")[2]
        group = self.group(arch)
        username = task["project_owner"]

        # Count over a snapshot: on_pubsub_event() pops entries from the
        # thread started by listen_to_pubsub().
        active_jobs_count = sum(
            1 for queued in list(self.added_jobs_dict.values())
            if queued["project_owner"] == username)

        # NOTE(review): the strict `>` admits one task beyond the configured
        # maximum - confirm whether `>=` was intended.
        if active_jobs_count > self.group_to_usermax[group]:
            self.log.debug("User can not acquire more VM (active builds #{0}), "
                           "don't schedule more tasks".format(active_jobs_count))
            return 0

        msg = "enqueue task for user {0}: id={1}, arch={2}, group={3}, active={4}"
        self.log.debug(msg.format(username, task_id, arch, group, active_jobs_count))

        # Add both to local list and control channel queue.
        self.added_jobs_dict[task_id] = task
        self.jg_control.add_build(group, task)
        return 1

    def process_action(self, action):
        """
        Run action task handler, see :py:class:`~backend.action.Action`

        :param action: dict-like object with action task
        """
        ao = Action(self.opts, action, frontend_client=self.frontend_client)
        ao.run()

    def load_tasks(self):
        """
        Retrieve tasks from frontend and runs appropriate handlers
        """
        try:
            r = get("{0}/backend/waiting/".format(self.opts.frontend_base_url),
                    auth=("user", self.opts.frontend_auth))
        except RequestException as e:
            self.log.exception("Error retrieving jobs from {}: {}"
                               .format(self.opts.frontend_base_url, e))
            return

        try:
            r_json = r.json()
        except ValueError as e:
            self.log.exception("Error getting JSON build list from FE {0}".format(e))
            return

        if r_json.get("builds"):
            self.log.debug("{0} jobs returned".format(len(r_json["builds"])))
            count = 0
            for task in r_json["builds"]:
                try:
                    count += self.route_build_task(task)
                except CoprJobGrabError as err:
                    self.log.exception("Failed to enqueue new job: {} with error: {}".format(task, err))

            if count:
                self.log.info("New build jobs: %s" % count)

        if r_json.get("actions"):
            self.log.info("{0} actions returned".format(len(r_json["actions"])))

            for action in r_json["actions"]:
                start = time.time()
                try:
                    self.process_action(action)
                except Exception as error:
                    # One failing action must not block the remaining ones.
                    self.log.exception("Error during processing action `{}`: {}".format(action, error))

                if time.time() - start > 2 * self.opts.sleeptime:
                    # we are processing actions for too long, stop and fetch everything again (including new builds)
                    break

    def on_pubsub_event(self, raw):
        """
        Handle one pubsub message that asks to remove or reschedule a task.

        Expected payload in ``raw["data"]`` (JSON)::

            {"action": ("remove"|"reschedule"), "task_id": ..., "build_id"..., "chroot": ...}

        "remove" simply removes ``task_id`` from ``self.added_jobs_dict``;
        "reschedule" additionally calls the frontend before the removal.
        Malformed messages are logged and ignored.
        """
        if raw is None:
            return
        if "type" not in raw or raw["type"] != "message":
            self.log.warning("Missing type or wrong type in pubsub msg: {}, ignored".format(raw))
            return

        try:
            msg = json.loads(raw["data"])

            if "action" not in msg:
                self.log.warning("Missing required field `action`, msg ignored: {}".format(msg))
                return
            action = msg["action"]
            if action not in ["remove", "reschedule"]:
                self.log.warning("Action `{}` not allowed, msg ignored: {} ".format(action, msg))
                return

            if "task_id" not in msg:
                self.log.warning("Missing required field `task_id`, msg ignored: {}".format(msg))
                return
            task_id = msg["task_id"]

            if action == "reschedule" and "build_id" in msg and "chroot" in msg:
                # TODO: dirty dependency to frontend, Job management should be re-done
                self.log.info("Rescheduling task `{}`".format(task_id))
                self.frontend_client.reschedule_build(msg["build_id"], msg["chroot"])

            if task_id not in self.added_jobs_dict:
                self.log.debug("Task `{}` not present in added jobs, msg ignored: {}".format(task_id, msg))
                return

            # Only "remove" and "reschedule" get this far; both drop the task.
            self.added_jobs_dict.pop(task_id)
            self.log.info("Removed task `{}` from added_jobs".format(task_id))

        except Exception as err:
            # The handler must never raise into its caller, so log instead.
            self.log.exception("Error receiving message from remove pubsub: raw msg: {}, error: {}"
                               .format(raw, err))

    def log_queue_info(self):
        """Log the tracked tasks and their number when there are any."""
        if self.added_jobs_dict:
            self.log.debug("Added jobs after remove and load: {}".format(self.added_jobs_dict))
            self.log.debug("# of executed jobs: {}".format(len(self.added_jobs_dict)))

    def init_internal_structures(self):
        """Rebuild the arch/group tables from ``opts.build_groups`` and forget tracked tasks."""
        self.arch_to_group_id_map = dict()
        self.group_to_usermax = dict()
        for group in self.opts.build_groups:
            group_id = group["id"]
            for arch in group["archs"]:
                self.arch_to_group_id_map[arch] = group_id
                self.log.debug("mapping {0} to {1} group".format(arch, group_id))

            self.log.debug("user might use only {0}VMs for {1} group".format(group["max_vm_per_user"], group_id))
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

        self.added_jobs_dict = dict()

    def handle_control_channel(self):
        """Re-initialise and clear queued builds once the backend signals that it started."""
        if not self.jg_control.backend_started():
            return
        self.log.info("backend gave us signal to start")
        self.init_internal_structures()
        self.jg_control.remove_all_builds()
        self.jg_control.job_graber_initialized()

    def run(self):
        """
        Starts job grabber process
        """
        setproctitle("CoprJobGrab")
        self.listen_to_pubsub()

        self.log.info("JobGrub started.")

        self.init_internal_structures()
        try:
            while True:
                try:
                    # This effectively delays job_grabbing until backend
                    # gives as signal to start.
                    self.handle_control_channel()
                    self.load_tasks()
                    self.log_queue_info()
                    time.sleep(self.opts.sleeptime)
                except Exception as err:
                    self.log.exception("Job Grab unhandled exception: {}".format(err))

        except KeyboardInterrupt:
            return

    def terminate(self):
        """
        Stop the pubsub listener thread, then defer to a parent ``terminate``.

        The class derives from ``object``, which has no ``terminate``, so the
        parent hook is only called when one exists further up the MRO.
        """
        if self.ps_thread:
            self.ps_thread.stop()
            self.ps_thread.join()

        parent_terminate = getattr(super(CoprJobGrab, self), "terminate", None)
        if parent_terminate is not None:
            parent_terminate()
class TestFrontendClient(object):
    """Tests for ``FrontendClient`` posting helpers and the calls built on them."""

    def setup_method(self, method):
        """Build a client on stub options and set the shared sample values."""
        frontend_settings = {
            "frontend_base_url": "http://example.com/",
            "frontend_auth": "12345678",
        }
        self.opts = Munch(**frontend_settings)
        self.fc = FrontendClient(self.opts)

        self.data = {"foo": "bar", "bar": [1, 3, 5]}
        self.url_path = "sub_path"
        self.build_id = 12345
        self.chroot_name = "fedora-20-x86_64"

    @pytest.fixture
    def mask_post_to_fe(self):
        """Replace the client's single-shot post with a ``MagicMock``."""
        self.ptf = MagicMock()
        self.fc._post_to_frontend = self.ptf

    # --- _post_to_frontend ---

    def test_post_to_frontend(self, post_req):
        post_req.return_value.status_code = 200
        self.fc._post_to_frontend(self.data, self.url_path)
        assert post_req.called

    def test_post_to_frontend_not_200(self, post_req):
        post_req.return_value.status_code = 501
        with pytest.raises(RequestException):
            self.fc._post_to_frontend(self.data, self.url_path)

        # Checked after the context manager so the assertion is reachable.
        assert post_req.called

    def test_post_to_frontend_post_error(self, post_req):
        post_req.side_effect = RequestException()
        with pytest.raises(RequestException):
            self.fc._post_to_frontend(self.data, self.url_path)

        assert post_req.called

    # --- _post_to_frontend_repeatedly ---

    def test_post_to_frontend_repeated_first_try_ok(self, mask_post_to_fe, mc_time):
        expected = "ok\n"
        self.ptf.return_value = expected

        result = self.fc._post_to_frontend_repeatedly(self.data, self.url_path)

        assert result == expected
        assert not mc_time.sleep.called

    def test_post_to_frontend_repeated_second_try_ok(self, mask_post_to_fe, mc_time):
        expected = "ok\n"
        # First attempt fails, second one succeeds.
        self.ptf.side_effect = [RequestException(), expected]

        result = self.fc._post_to_frontend_repeatedly(self.data, self.url_path)

        assert result == expected
        assert mc_time.sleep.called

    def test_post_to_frontend_repeated_all_attempts_failed(self, mask_post_to_fe, mc_time):
        self.ptf.side_effect = RequestException()
        with pytest.raises(RequestException):
            self.fc._post_to_frontend_repeatedly(self.data, self.url_path)

        assert mc_time.sleep.called

    # --- public calls built on _post_to_frontend_repeatedly ---

    def test_update(self):
        repeat_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeat_post

        self.fc.update(self.data)

        assert repeat_post.call_args == mock.call(self.data, "update")

    def test_starting_build(self):
        repeat_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeat_post

        for answer in [True, False]:
            repeat_post.return_value.json.return_value = {"can_start": answer}
            assert self.fc.starting_build(self.build_id, self.chroot_name) == answer

    def test_starting_build_err(self):
        repeat_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeat_post

        with pytest.raises(RequestException):
            self.fc.starting_build(self.build_id, self.chroot_name)

    def test_starting_build_err_2(self):
        repeat_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeat_post
        # A response without the "can_start" key.
        repeat_post.return_value.json.return_value = {}

        with pytest.raises(RequestException):
            self.fc.starting_build(self.build_id, self.chroot_name)

    def test_reschedule_build(self):
        repeat_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeat_post

        self.fc.reschedule_build(self.build_id, self.chroot_name)

        expected = mock.call({'build_id': self.build_id, 'chroot': self.chroot_name},
                             'reschedule_build_chroot')
        assert repeat_post.call_args == expected
class BuildDispatcher(multiprocessing.Process):
    """
    1) Fetch build task from frontend
    2) Get a free VM for it
    3) Create a worker for the job
    4) Start it asynchronously and go to 1)
    """

    def __init__(self, opts):
        """
        :param opts: backend configuration shared with the logger, the
            frontend client and the VM manager
        """
        multiprocessing.Process.__init__(self, name="build-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.build_dispatcher", "build_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
        self.vm_manager = VmManager(self.opts)

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()

        self.init_internal_structures()

    def get_vm_group_id(self, arch):
        """
        Return the VM group id configured for ``arch``.

        :raises DispatchBuildError: when the architecture is not mapped
        """
        try:
            return self.arch_to_group[arch]
        except KeyError:
            raise DispatchBuildError("Unknown architecture {0}".format(arch))

    def update_process_title(self, msg=None):
        """Set the process title to "Build dispatcher", optionally followed by ``msg``."""
        proc_title = "Build dispatcher"
        if msg:
            proc_title += " - " + msg
        setproctitle(proc_title)

    def init_internal_structures(self):
        """Rebuild the arch/group tables from ``opts.build_groups``."""
        self.arch_to_group = dict()
        self.group_to_usermax = dict()
        for group in self.opts.build_groups:
            group_id = group["id"]
            for arch in group["archs"]:
                self.arch_to_group[arch] = group_id
                self.log.debug("mapping {0} to {1} group".format(arch, group_id))

            self.log.debug("user might use only {0}VMs for {1} group".format(group["max_vm_per_user"], group_id))
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

    def load_job(self):
        """
        Retrieve a single build job from frontend.

        Polls the frontend until it returns a build, sleeping
        ``opts.sleeptime`` after every unsuccessful attempt.

        :return: ``BuildJob`` built from the received task
        """
        self.log.info("Waiting for a job from frontend...")
        get_task_init_time = time.time()
        task = None

        while not task:
            self.update_process_title("Waiting for a job from frontend for {} s"
                                      .format(int(time.time() - get_task_init_time)))
            try:
                r = get("{0}/backend/waiting/".format(self.opts.frontend_base_url),
                        auth=("user", self.opts.frontend_auth))
                task = r.json().get("build")
            except (RequestException, ValueError) as error:
                self.log.exception("Retrieving build job from {} failed with error: {}"
                                   .format(self.opts.frontend_base_url, error))
            finally:
                # Back off whenever this attempt produced no task.
                if not task:
                    time.sleep(self.opts.sleeptime)

        self.log.info("Got new build job {}".format(task['task_id']))
        return BuildJob(task, self.opts)

    def acquire_vm_for_job(self, job, vm_group_id):
        """
        Ask the VM manager for a VM in ``vm_group_id`` on behalf of ``job``.

        :return: whatever ``vm_manager.acquire_vm`` returns
        :raises NoVmAvailable: propagated from the VM manager; ``run()``
            handles it by deferring the build
        """
        return self.vm_manager.acquire_vm(vm_group_id, job.project_owner, os.getpid(),
                                          job.task_id, job.build_id, job.chroot)

    def can_build_start(self, job):
        """
        Announce to the frontend that the build is going to start so that
        it can confirm that and draw out another job for building.

        Returns
        -------
        True if the build can start
        False if the build can not start (build is cancelled)
        """
        try:
            can_build_start = self.frontend_client.starting_build(job.build_id, job.chroot)
        except (RequestException, ValueError) as error:
            self.log.exception("Communication with Frontend to confirm build start failed with error: {}".format(error))
            return False

        if not can_build_start:
            # No exception is active here, so this is a plain log record.
            self.log.info("Frontend forbade to start the job {}".format(job.task_id))

        return can_build_start

    def join_finished_workers(self, workers):
        """
        Join every worker that is no longer alive and drop it from ``workers``.

        ``workers`` is modified in place; the loop walks a copy so that
        removing an entry cannot make it skip the following one.
        """
        for worker in list(workers):
            if worker.is_alive():
                continue
            worker.join(5)
            workers.remove(worker)
            self.log.info("Removed finished worker {} for job {}"
                          .format(worker.worker_id, worker.job.task_id))

    def run(self):
        """
        Executes build dispatching process.
        """
        self.log.info("Build dispatching started.")
        self.update_process_title()

        workers = []
        next_worker_id = 1
        while True:
            self.join_finished_workers(workers)

            job = self.load_job()

            try:
                self.log.info("Acquiring VM for job {}...".format(str(job)))
                vm_group_id = self.get_vm_group_id(job.arch)
                vm = self.acquire_vm_for_job(job, vm_group_id)
            except NoVmAvailable as error:
                self.log.info("No available resources for task {} (Reason: {}). Deferring job."
                              .format(job.task_id, error))
                self.frontend_client.defer_build(job.build_id, job.chroot)
                continue
            else:
                self.log.info("VM {} for job {} successfully acquired".format(vm.vm_name, job.task_id))

            if not self.can_build_start(job):
                # The build will not run, hand the VM back.
                self.vm_manager.release_vm(vm.vm_name)
                continue

            worker = Worker(
                opts=self.opts,
                frontend_client=self.frontend_client,
                vm_manager=self.vm_manager,
                worker_id=next_worker_id,
                vm=vm, job=job
            )
            worker.start()
            workers.append(worker)
            self.log.info("Started new worker {} for job {}"
                          .format(worker.worker_id, worker.job.task_id))

            # Worker ids wrap around below 2**15.
            next_worker_id = (next_worker_id + 1) % 2**15
class CoprJobGrab(object):
    """
    Fetch jobs from the Frontend

        - submit build task to the jobs queue for workers
        - run Action handler for action tasks

    :param Munch opts: backend config

    TODO: Not yet fully ready for config reload.
    """

    def __init__(self, opts):
        """ base class initialization """
        self.opts = opts

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group_id_map = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()
        # task_id -> task dict
        self.added_jobs_dict = dict()

        self.rc = None
        self.channel = None
        self.ps_thread = None

        self.log = get_redis_logger(self.opts, "backend.job_grab", "job_grab")
        self.jg_control = jobgrabcontrol.Channel(self.opts, self.log)
        self.frontend_client = FrontendClient(self.opts, self.log)

    def group(self, arch):
        """
        Return the group id configured for ``arch``.

        :raises CoprJobGrabError: when the architecture is not mapped
        """
        try:
            return self.arch_to_group_id_map[arch]
        except KeyError:
            raise CoprJobGrabError("Unknown architecture {0}".format(arch))

    def listen_to_pubsub(self):
        """
        Listens for job reschedule queries. Spawns self.ps_thread, don't forget to stop it.
        """
        self.rc = get_redis_connection(self.opts)
        self.channel = self.rc.pubsub(ignore_subscribe_messages=True)
        self.channel.subscribe(
            **{JOB_GRAB_TASK_END_PUBSUB: self.on_pubsub_event})
        self.ps_thread = self.channel.run_in_thread(sleep_time=0.05)
        self.log.info(
            "Subscribed to {} channel".format(JOB_GRAB_TASK_END_PUBSUB))

    def route_build_task(self, task):
        """
        Route build task to the appropriate queue.

        :param task: dict-like object which represent build task

        Utilized **task** keys:

            - ``task_id``
            - ``chroot`` (the architecture is its third ``-`` separated field)
            - ``project_owner``

        :return int: Count of the successfully routed tasks
        :raises CoprJobGrabError: when the task architecture is not mapped
        """
        if "task_id" not in task:
            self.log.info(
                "Task missing field `task_id`, raw task: {}".format(task))
            return 0

        task_id = task["task_id"]
        if task_id in self.added_jobs_dict:
            # Already enqueued earlier, nothing to do.
            return 0

        arch = task["chroot"].split("-")[2]
        group = self.group(arch)
        username = task["project_owner"]

        # Count over a snapshot: on_pubsub_event() pops entries from the
        # thread started by listen_to_pubsub().
        active_jobs_count = sum(
            1 for queued in list(self.added_jobs_dict.values())
            if queued["project_owner"] == username)

        # NOTE(review): the strict `>` admits one task beyond the configured
        # maximum - confirm whether `>=` was intended.
        if active_jobs_count > self.group_to_usermax[group]:
            self.log.debug(
                "User can not acquire more VM (active builds #{0}), "
                "don't schedule more tasks".format(active_jobs_count))
            return 0

        msg = "enqueue task for user {0}: id={1}, arch={2}, group={3}, active={4}"
        self.log.debug(
            msg.format(username, task_id, arch, group, active_jobs_count))

        # Add both to local list and control channel queue.
        self.added_jobs_dict[task_id] = task
        self.jg_control.add_build(group, task)
        return 1

    def process_action(self, action):
        """
        Run action task handler, see :py:class:`~backend.action.Action`

        :param action: dict-like object with action task
        """
        ao = Action(self.opts, action, frontend_client=self.frontend_client)
        ao.run()

    def load_tasks(self):
        """
        Retrieve tasks from frontend and runs appropriate handlers
        """
        try:
            r = get("{0}/backend/waiting/".format(self.opts.frontend_base_url),
                    auth=("user", self.opts.frontend_auth))
        except RequestException as e:
            self.log.exception("Error retrieving jobs from {}: {}".format(
                self.opts.frontend_base_url, e))
            return

        try:
            r_json = r.json()
        except ValueError as e:
            self.log.exception(
                "Error getting JSON build list from FE {0}".format(e))
            return

        if r_json.get("builds"):
            self.log.debug("{0} jobs returned".format(len(r_json["builds"])))
            count = 0
            for task in r_json["builds"]:
                try:
                    count += self.route_build_task(task)
                except CoprJobGrabError as err:
                    self.log.exception(
                        "Failed to enqueue new job: {} with error: {}".format(
                            task, err))

            if count:
                self.log.info("New build jobs: %s" % count)

        if r_json.get("actions"):
            self.log.info("{0} actions returned".format(len(
                r_json["actions"])))

            for action in r_json["actions"]:
                start = time.time()
                try:
                    self.process_action(action)
                except Exception as error:
                    # One failing action must not block the remaining ones.
                    self.log.exception(
                        "Error during processing action `{}`: {}".format(
                            action, error))

                if time.time() - start > 2 * self.opts.sleeptime:
                    # we are processing actions for too long, stop and fetch everything again (including new builds)
                    break

    def on_pubsub_event(self, raw):
        """
        Handle one pubsub message that asks to remove or reschedule a task.

        Expected payload in ``raw["data"]`` (JSON)::

            {"action": ("remove"|"reschedule"), "task_id": ..., "build_id"..., "chroot": ...}

        "remove" simply removes ``task_id`` from ``self.added_jobs_dict``;
        "reschedule" additionally calls the frontend before the removal.
        Malformed messages are logged and ignored.
        """
        if raw is None:
            return
        if "type" not in raw or raw["type"] != "message":
            self.log.warning(
                "Missing type or wrong type in pubsub msg: {}, ignored".format(
                    raw))
            return

        try:
            msg = json.loads(raw["data"])

            if "action" not in msg:
                self.log.warning(
                    "Missing required field `action`, msg ignored: {}".format(
                        msg))
                return
            action = msg["action"]
            if action not in ["remove", "reschedule"]:
                self.log.warning(
                    "Action `{}` not allowed, msg ignored: {} ".format(
                        action, msg))
                return

            if "task_id" not in msg:
                self.log.warning(
                    "Missing required field `task_id`, msg ignored: {}".format(
                        msg))
                return
            task_id = msg["task_id"]

            if action == "reschedule" and "build_id" in msg and "chroot" in msg:
                # TODO: dirty dependency to frontend, Job management should be re-done
                self.log.info("Rescheduling task `{}`".format(task_id))
                self.frontend_client.reschedule_build(msg["build_id"],
                                                      msg["chroot"])

            if task_id not in self.added_jobs_dict:
                self.log.debug(
                    "Task `{}` not present in added jobs, msg ignored: {}".
                    format(task_id, msg))
                return

            # Only "remove" and "reschedule" get this far; both drop the task.
            self.added_jobs_dict.pop(task_id)
            self.log.info(
                "Removed task `{}` from added_jobs".format(task_id))

        except Exception as err:
            # The handler must never raise into its caller, so log instead.
            self.log.exception(
                "Error receiving message from remove pubsub: raw msg: {}, error: {}"
                .format(raw, err))

    def log_queue_info(self):
        """Log the tracked tasks and their number when there are any."""
        if self.added_jobs_dict:
            self.log.debug("Added jobs after remove and load: {}".format(
                self.added_jobs_dict))
            self.log.debug("# of executed jobs: {}".format(
                len(self.added_jobs_dict)))

    def init_internal_structures(self):
        """Rebuild the arch/group tables from ``opts.build_groups`` and forget tracked tasks."""
        self.arch_to_group_id_map = dict()
        self.group_to_usermax = dict()
        for group in self.opts.build_groups:
            group_id = group["id"]
            for arch in group["archs"]:
                self.arch_to_group_id_map[arch] = group_id
                self.log.debug("mapping {0} to {1} group".format(
                    arch, group_id))

            self.log.debug("user might use only {0}VMs for {1} group".format(
                group["max_vm_per_user"], group_id))
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

        self.added_jobs_dict = dict()

    def handle_control_channel(self):
        """Re-initialise and clear queued builds once the backend signals that it started."""
        if not self.jg_control.backend_started():
            return
        self.log.info("backend gave us signal to start")
        self.init_internal_structures()
        self.jg_control.remove_all_builds()
        self.jg_control.job_graber_initialized()

    def run(self):
        """
        Starts job grabber process
        """
        setproctitle("CoprJobGrab")
        self.listen_to_pubsub()

        self.log.info("JobGrub started.")

        self.init_internal_structures()
        try:
            while True:
                try:
                    # This effectively delays job_grabbing until backend
                    # gives as signal to start.
                    self.handle_control_channel()
                    self.load_tasks()
                    self.log_queue_info()
                    time.sleep(self.opts.sleeptime)
                except Exception as err:
                    self.log.exception(
                        "Job Grab unhandled exception: {}".format(err))

        except KeyboardInterrupt:
            return

    def terminate(self):
        """
        Stop the pubsub listener thread, then defer to a parent ``terminate``.

        The class derives from ``object``, which has no ``terminate``, so the
        parent hook is only called when one exists further up the MRO.
        """
        if self.ps_thread:
            self.ps_thread.stop()
            self.ps_thread.join()

        parent_terminate = getattr(super(CoprJobGrab, self), "terminate", None)
        if parent_terminate is not None:
            parent_terminate()
def main():
    """Load the backend options and run a job grabber that uses a fresh frontend client."""
    backend_opts = get_backend_opts()
    client = FrontendClient(backend_opts)
    CoprJobGrab(backend_opts, frontend_client=client).run()
class CoprBackend(object):
    """
    Core process - starts/stops/initializes workers

    :param config_file: path to the backend configuration file
    :param ext_opts: additional options for backend
    """

    def __init__(self, config_file=None, ext_opts=None):
        """
        Read the config file into ``self.opts`` and set up empty worker state.

        :raises CoprBackendError: when ``config_file`` is empty or missing
        """
        if not config_file:
            raise CoprBackendError("Must specify config_file")

        # ext_opts is kept to stow our cli options for read_conf()
        self.config_file, self.ext_opts = config_file, ext_opts

        # Per-group bookkeeping, keyed by group id.
        self.workers_by_group_id = defaultdict(list)
        self.max_worker_num_by_group_id = defaultdict(int)

        # All config items end up in a single self.opts object.
        self.config_reader = BackendConfigReader(self.config_file, self.ext_opts)
        self.opts = None
        self.update_conf()

        self.task_queues = {}
        self.frontend_client = FrontendClient(self.opts)
        self.is_running = False
        self.log = get_redis_logger(self.opts, "backend.main", "backend")

    def clean_task_queues(self):
        """
        Make sure there is nothing in our task queues

        :raises CoprBackendError: when a queue raises ``ConnectionError``
        """
        try:
            for queue in self.task_queues.values():
                # Drain the queue one item at a time until it reports empty.
                while queue.length:
                    queue.dequeue()
        except ConnectionError:
            raise CoprBackendError(
                "Could not connect to a task queue. Is Redis running?")

    def init_task_queues(self):
        """
        Connect to the retask.Queue for each group_id. Remove old tasks from queues.

        :raises CoprBackendError: when a queue raises ``ConnectionError``
        """
        try:
            for group in self.opts.build_groups:
                gid = group["id"]
                group_queue = Queue("copr-be-{0}".format(gid))
                group_queue.connect()
                self.task_queues[gid] = group_queue
        except ConnectionError:
            raise CoprBackendError(
                "Could not connect to a task queue. Is Redis running?")

        self.clean_task_queues()

    def update_conf(self):
        """
        Update backend config from config file
        """
        fresh_opts = self.config_reader.read()
        self.opts = fresh_opts

    def spin_up_workers_by_group(self, group):
        """
        Handles starting/growing the number of workers

        :param dict group: Builders group

        Utilized keys:
            - **id**
            - **max_workers**
        """
        gid = group["id"]
        pool = self.workers_by_group_id[gid]
        if not len(pool) < group["max_workers"]:
            # The group already has all the workers it is allowed.
            return

        self.log.info("Spinning up more workers")
        missing = group["max_workers"] - len(pool)
        for _ in range(missing):
            self.max_worker_num_by_group_id[gid] += 1
            try:
                new_worker = Worker(
                    opts=self.opts,
                    frontend_client=self.frontend_client,
                    worker_num=self.max_worker_num_by_group_id[gid],
                    group_id=gid)

                pool.append(new_worker)
                new_worker.start()
                time.sleep(0.3)
                self.log.info("Started worker: {} for group: {}".format(
                    new_worker.worker_num, gid))
            except Exception as error:
                # A failed start is logged; the remaining slots are still tried.
                self.log.exception(
                    "Failed to start new Worker: {}".format(error))

        self.log.info("Finished starting worker processes")

    def prune_dead_workers_by_group_id(self, group_id):
        """
        Removes dead workers from the pool

        :return list: alive workers

        :raises:
            :py:class:`~backend.exceptions.CoprBackendError` when got dead worker and
            option "exit_on_worker" is enabled
        """
        survivors = []
        for candidate in self.workers_by_group_id[group_id]:
            if candidate.is_alive():
                survivors.append(candidate)
                continue

            self.log.warn("Worker {} died unexpectedly".format(
                candidate.worker_num))
            candidate.terminate()  # kill it with a fire
            if self.opts.exit_on_worker:
                raise CoprBackendError("Worker died unexpectedly, exiting")

        return survivors

    def terminate(self):
        """
        Cleanup backend processes (just workers for now)
        And also clean all task queues as they would survive copr restart
        """
        self.is_running = False

        for group in self.opts.build_groups:
            pool = self.workers_by_group_id[group["id"]]
            # Walk a copy because every worker is removed from the pool.
            for member in list(pool):
                pool.remove(member)
                member.terminate_instance()

        self.clean_task_queues()

        try:
            self.log.info("Rescheduling unfinished builds before stop")
            self.frontend_client.reschedule_all_running()
        except RequestException as err:
            self.log.exception(err)
            return

    def run(self):
        """
        Starts backend process. Control sub process start/stop.
        """
        self.update_conf()
        self.init_task_queues()
        time.sleep(1)

        self.log.info("Initial config: {}".format(self.opts))

        try:
            self.log.info("Rescheduling old unfinished builds")
            self.frontend_client.reschedule_all_running()
        except RequestException as err:
            self.log.exception(err)
            return

        self.is_running = True
        while self.is_running:
            # re-read config into opts
            self.update_conf()

            for group in self.opts.build_groups:
                gid = group["id"]
                self.spin_up_workers_by_group(group)

                # FIXME - prune out workers
                # FIXME - if a worker bombs out - we need to check them
                # and startup a new one if it happens

                # check for dead workers and abort
                self.workers_by_group_id[gid] = \
                    self.prune_dead_workers_by_group_id(gid)

            time.sleep(self.opts.sleeptime)
class ActionDispatcher(multiprocessing.Process):
    """
    Background process feeding frontend actions to an ActionWorkerManager.

    Every cycle of the main loop:
    1) fetches the pending actions from frontend,
    2) queues them in the worker manager and lets the manager run,
    3) sleeps for whatever is left of the ``opts.sleeptime`` period.
    """

    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="action-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(
            self.opts, "backend.action_dispatcher", "action_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)

    def update_process_title(self, msg=None):
        """
        Set the process title to "Action dispatcher", optionally followed by
        " - <msg>".
        """
        title_parts = ["Action dispatcher"]
        if msg:
            title_parts.append(msg)
        setproctitle(" - ".join(title_parts))

    def get_frontend_actions(self):
        """
        Ask frontend for its 'pending-actions' list.

        :return list: one ActionQueueTask per returned action; an empty list
            when the request or the JSON decoding fails (the failure is
            logged, not raised)
        """
        try:
            response = self.frontend_client.get('pending-actions')
            pending = response.json()
        except (FrontendClientException, ValueError) as error:
            self.log.exception(
                "Retrieving an action tasks failed with error: %s", error)
            return []

        tasks = []
        for action in pending:
            tasks.append(ActionQueueTask(action['id']))
        return tasks

    def run(self):
        """
        Executes action dispatching process.
        """
        self.log.info("Action dispatching started.")
        self.update_process_title()

        manager = ActionWorkerManager(
            redis_connection=get_redis_connection(self.opts),
            log=self.log,
            max_workers=self.opts.actions_max_workers)
        manager.frontend_client = FrontendClient(self.opts, self.log)

        period = self.opts.sleeptime

        while True:
            self.log.info("getting actions from frontend")
            cycle_started = time.time()

            for task in self.get_frontend_actions():
                manager.add_task(task)

            # Execute the actions.
            manager.run(timeout=period)

            # Sleep only for the part of the period not already consumed.
            remaining = period - (time.time() - cycle_started)
            if remaining > 0:
                time.sleep(remaining)
class BuildDispatcher(multiprocessing.Process):
    """
    1) Fetch build task from frontend
    2) Get a free VM for it
    3) Create a worker for the job
    4) Start it asynchronously and go to 1)
    """

    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="build-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.build_dispatcher", "build_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
        self.vm_manager = VmManager(self.opts)

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()

        self.init_internal_structures()

    def get_vm_group_id(self, arch):
        """
        Translate an architecture name to the id of its builder group.

        :raises DispatchBuildError: when no configured group lists ``arch``
        """
        try:
            return self.arch_to_group[arch]
        except KeyError:
            raise DispatchBuildError("Unknown architecture {0}".format(arch))

    def update_process_title(self, msg=None):
        """
        Set the process title to "Build dispatcher", optionally followed by
        " - <msg>".
        """
        proc_title = "Build dispatcher"
        if msg:
            proc_title += " - " + msg
        setproctitle(proc_title)

    def init_internal_structures(self):
        """
        (Re)build ``arch_to_group`` and ``group_to_usermax`` from
        ``opts.build_groups``.
        """
        self.arch_to_group = dict()
        self.group_to_usermax = dict()
        for group in self.opts.build_groups:
            group_id = group["id"]
            for arch in group["archs"]:
                self.arch_to_group[arch] = group_id
                self.log.debug("mapping {0} to {1} group".format(
                    arch, group_id))

            self.log.debug("user might use only {0}VMs for {1} group".format(
                group["max_vm_per_user"], group_id))
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

    def load_job(self):
        """
        Retrieve a single build job from frontend.

        Polls ``<frontend_base_url>/backend/waiting/`` until the response
        carries a "build" task, sleeping ``opts.sleeptime`` seconds after
        every unsuccessful attempt.

        :return BuildJob: job built from the received task
        """
        self.log.info("Waiting for a job from frontend...")
        get_task_init_time = time.time()
        task = None

        while not task:
            self.update_process_title(
                "Waiting for a job from frontend for {} s".format(
                    int(time.time() - get_task_init_time)))
            try:
                r = get("{0}/backend/waiting/".format(
                    self.opts.frontend_base_url),
                    auth=("user", self.opts.frontend_auth))
                task = r.json().get("build")
            except (RequestException, ValueError) as error:
                self.log.exception(
                    "Retrieving build job from {} failed with error: {}".
                    format(self.opts.frontend_base_url, error))
            finally:
                # Back off both after an error and after an empty answer.
                if not task:
                    time.sleep(self.opts.sleeptime)

        self.log.info("Got new build job {}".format(task['task_id']))
        return BuildJob(task, self.opts)

    def can_build_start(self, job):
        """
        Announce to the frontend that the build is going to start so that
        it can confirm that and draw out another job for building.

        Returns
        -------
        True if the build can start
        False if the build can not start (build is cancelled)
        """
        try:
            can_build_start = self.frontend_client.starting_build(
                job.build_id, job.chroot)
        except (RequestException, ValueError) as error:
            self.log.exception(
                "Communication with Frontend to confirm build start failed with error: {}"
                .format(error))
            return False

        if not can_build_start:
            # No exception is being handled here, so log.exception() would
            # append a meaningless "NoneType: None" traceback.
            self.log.error("Frontend forbade to start the job {}".format(
                job.task_id))

        return can_build_start

    def clean_finished_workers(self, workers):
        """
        Join and drop every worker that is no longer alive.

        :param list workers: worker list, modified in place
        """
        # Iterate over a snapshot: removing from the list being iterated
        # would make the loop skip the worker following each removed one.
        for worker in list(workers):
            if not worker.is_alive():
                worker.join(5)
                workers.remove(worker)
                self.log.info("Removed finished worker {} for job {}".format(
                    worker.worker_id, worker.job.task_id))

    def run(self):
        """
        Executes build dispatching process.
        """
        self.log.info("Build dispatching started.")
        self.update_process_title()

        workers = []
        next_worker_id = 1
        while True:
            self.clean_finished_workers(workers)

            job = self.load_job()

            try:
                self.log.info("Acquiring VM for job {}...".format(str(job)))
                vm_group_id = self.get_vm_group_id(job.arch)
                vm = self.vm_manager.acquire_vm(vm_group_id, job.project_owner,
                                                os.getpid(), job.task_id,
                                                job.build_id, job.chroot)
            except NoVmAvailable as error:
                self.log.info(
                    "No available resources for task {} (Reason: {}). Deferring job."
                    .format(job.task_id, error))
                self.frontend_client.defer_build(job.build_id, job.chroot)
                continue
            else:
                self.log.info("VM {} for job {} successfully acquired".format(
                    vm.vm_name, job.task_id))

            if not self.can_build_start(job):
                # Frontend refused the build; give the VM back.
                self.vm_manager.release_vm(vm.vm_name)
                continue

            worker = Worker(opts=self.opts,
                            frontend_client=self.frontend_client,
                            vm_manager=self.vm_manager,
                            worker_id=next_worker_id,
                            vm=vm, job=job)
            workers.append(worker)
            worker.start()
            self.log.info("Started new worker {} for job {}".format(
                worker.worker_id, worker.job.task_id))
            # Worker ids wrap around at 2**15.
            next_worker_id = (next_worker_id + 1) % 2**15
class Pruner(object):
    """
    Walks ``<destdir>/<user>/<projectdir>/<chroot>`` and runs prunerepo,
    createrepo and clean_copr on every chroot directory that qualifies.
    """

    def __init__(self, opts, cmdline_opts=None):
        self.opts = opts
        self.prune_days = getattr(self.opts, "prune_days", DEF_DAYS)
        self.chroots = {}
        self.frontend_client = FrontendClient(self.opts)
        # The optimization stays on unless the command line disables it.
        self.mtime_optimization = True
        if cmdline_opts:
            self.mtime_optimization = not cmdline_opts.no_mtime_optimization

    def run(self):
        """
        Prune every project directory below ``opts.destdir`` and report the
        inactive chroots back to frontend through "final-prunerepo-done".
        """
        separator = "--------------------------------------------"

        status_response = self.frontend_client._post_to_frontend_repeatedly(
            "", "chroots-prunerepo-status")
        self.chroots = json.loads(status_response.content)

        results_dir = self.opts.destdir
        loginfo("Pruning results dir: {} ".format(results_dir))

        user_dir_names, user_dirs = list_subdir(results_dir)
        loginfo("Going to process total number: {} of user's directories".format(len(user_dir_names)))
        loginfo("Going to process user's directories: {}".format(user_dir_names))
        loginfo(separator)

        for username, subpath in zip(user_dir_names, user_dirs):
            loginfo("For user `{}` exploring path: {}".format(username, subpath))
            project_names, project_paths = list_subdir(subpath)
            for projectdir, project_path in zip(project_names, project_paths):
                loginfo("Exploring projectdir `{}` with path: {}".format(projectdir, project_path))
                self.prune_project(project_path, username, projectdir)
                loginfo(separator)

        loginfo("Setting final_prunerepo_done for deactivated chroots")
        chroots_to_prune = [
            chroot for chroot, active in self.chroots.items() if not active
        ]
        self.frontend_client._post_to_frontend_repeatedly(chroots_to_prune, "final-prunerepo-done")

        loginfo(separator)
        loginfo("Pruning finished")

    def prune_project(self, project_path, username, projectdir):
        """
        Prune all chroot sub-directories of one project directory.

        The project is left untouched when auto createrepo is disabled, when
        it is persistent, when auto-pruning is disabled, or when those
        settings cannot be fetched.
        """
        loginfo("Going to prune {}/{}".format(username, projectdir))

        # Everything before the first ':' of the directory name.
        projectname = projectdir.split(':', 1)[0]
        loginfo("projectname = {}".format(projectname))

        frontend_url = self.opts.frontend_base_url
        try:
            if not get_auto_createrepo_status(frontend_url, username, projectname):
                loginfo("Skipped {}/{} since auto createrepo option is disabled"
                        .format(username, projectdir))
                return
            if get_persistent_status(frontend_url, username, projectname):
                loginfo("Skipped {}/{} since the project is persistent"
                        .format(username, projectdir))
                return
            if not get_auto_prune_status(frontend_url, username, projectname):
                loginfo("Skipped {}/{} since auto-prunning is disabled for the project"
                        .format(username, projectdir))
                return
        except (CoprException, CoprRequestException) as exception:
            logerror("Failed to get project details for {}/{} with error: {}".format(
                username, projectdir, exception))
            return

        for sub_dir_name in os.listdir(project_path):
            chroot_path = os.path.join(project_path, sub_dir_name)

            if sub_dir_name == 'modules' or not os.path.isdir(chroot_path):
                continue

            if sub_dir_name not in self.chroots:
                loginfo("Final pruning already done for chroot {}/{}:{}".format(
                    username, projectdir, sub_dir_name))
                continue

            if self.mtime_optimization:
                # We only ever remove builds that were done at least
                # 'self.prune_days' ago.  And because we run prunerepo _daily_
                # we know that the candidates for removal (if there are such)
                # are removed about a day after "build_time + self.prune_days".
                seconds_untouched = time.time() - os.stat(chroot_path).st_mtime
                touched_before = seconds_untouched / 3600 / 24  # seconds -> days

                # Because it might happen that prunerepo has some problems to
                # successfully go through the directory for some time (bug,
                # user error, I/O problems...) we rather wait 10 more days
                # till we really start to ignore the directory.
                if touched_before > int(self.prune_days) + 10:
                    loginfo("Skipping {} - not changed for {} days".format(
                        sub_dir_name, touched_before))
                    continue

            try:
                prunerepo_cmd = [
                    'prunerepo',
                    '--verbose',
                    '--days', str(self.prune_days),
                    '--nocreaterepo',
                    chroot_path,
                ]
                loginfo(runcmd(prunerepo_cmd))
                createrepo(path=chroot_path, front_url=frontend_url,
                           username=username, projectname=projectname,
                           override_acr_flag=True)
                clean_copr(chroot_path, self.prune_days, verbose=True)
            except Exception as err:
                logexception(err)
                logerror("Error pruning chroot {}/{}:{}".format(username, projectdir, sub_dir_name))

            loginfo("Pruning done for chroot {}/{}:{}".format(username, projectdir, sub_dir_name))

        loginfo("Pruning finished for projectdir {}/{}".format(username, projectdir))
class TestFrontendClient(object):
    """Tests of FrontendClient with the HTTP layer replaced by mocks."""

    def setup_method(self, method):
        self.opts = Munch(
            frontend_base_url="http://example.com/",
            frontend_auth="12345678",
        )
        self.fc = FrontendClient(self.opts)

        self.data = {
            "foo": "bar",
            "bar": [1, 3, 5],
        }
        self.url_path = "sub_path"

        self.build_id = 12345
        self.task_id = "12345-fedora-20-x86_64"
        self.chroot_name = "fedora-20-x86_64"

    @pytest.fixture
    def mask_post_to_fe(self):
        """Swap the client's ``_post_to_frontend`` for a mock (``self.ptf``)."""
        self.ptf = MagicMock()
        self.fc._post_to_frontend = self.ptf

    def test_post_to_frontend(self, post_req):
        post_req.return_value.status_code = 200
        self.fc._post_to_frontend(self.data, self.url_path)

        assert post_req.called

    def test_post_to_frontend_not_200(self, post_req):
        post_req.return_value.status_code = 501
        with pytest.raises(RequestException):
            self.fc._post_to_frontend(self.data, self.url_path)

        assert post_req.called

    def test_post_to_frontend_post_error(self, post_req):
        post_req.side_effect = RequestException()
        with pytest.raises(RequestException):
            self.fc._post_to_frontend(self.data, self.url_path)

        assert post_req.called

    def test_post_to_frontend_repeated_first_try_ok(self, mask_post_to_fe, mc_time):
        expected = "ok\n"
        self.ptf.return_value = expected

        result = self.fc._post_to_frontend_repeatedly(self.data, self.url_path)
        assert result == expected
        assert not mc_time.sleep.called

    def test_post_to_frontend_repeated_second_try_ok(self, mask_post_to_fe, mc_time):
        expected = "ok\n"
        # First attempt fails, second one succeeds.
        self.ptf.side_effect = [RequestException(), expected]

        result = self.fc._post_to_frontend_repeatedly(self.data, self.url_path)
        assert result == expected
        assert mc_time.sleep.called

    def test_post_to_frontend_repeated_all_attempts_failed(self, mask_post_to_fe, mc_time):
        self.ptf.side_effect = RequestException()
        with pytest.raises(RequestException):
            self.fc._post_to_frontend_repeatedly(self.data, self.url_path)

        assert mc_time.sleep.called

    def test_update(self):
        repeated_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeated_post

        self.fc.update(self.data)
        assert repeated_post.call_args == mock.call(self.data, "update")

    def test_starting_build(self):
        repeated_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeated_post

        for answer in (True, False):
            repeated_post.return_value.json.return_value = {"can_start": answer}
            assert self.fc.starting_build(self.data) == answer

    def test_starting_build_err(self):
        repeated_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeated_post

        with pytest.raises(RequestException):
            self.fc.starting_build(self.data)

    def test_starting_build_err_2(self):
        repeated_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeated_post
        # Response JSON lacks the "can_start" key.
        repeated_post.return_value.json.return_value = {}

        with pytest.raises(RequestException):
            self.fc.starting_build(self.data)

    def test_reschedule_build(self):
        repeated_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeated_post

        self.fc.reschedule_build(self.build_id, self.task_id, self.chroot_name)

        payload = {
            'build_id': self.build_id,
            'task_id': self.task_id,
            'chroot': self.chroot_name,
        }
        assert repeated_post.call_args == mock.call(payload, 'reschedule_build_chroot')
class BuildDispatcher(multiprocessing.Process):
    """
    1) Fetch build tasks from frontend
    2) Loop through them and try to allocate VM for each
       - If VM can be allocated, spawn a worker and run it asynchronously
       - otherwise, check the next build task
    3) Go to 1
    """

    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="build-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.build_dispatcher", "build_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
        self.vm_manager = VmManager(self.opts)
        self.workers = []
        self.next_worker_id = 1

        # arch => list of ids of the groups able to build it
        self.arch_to_groups = defaultdict(list)
        # PC => max N builders per user
        self.group_to_usermax = dict()

        # task ids seen in the previous frontend answer; used only to log
        # newly appeared jobs once
        self.job_ids_previous_request = set()

        self.init_internal_structures()

    def get_vm_group_ids(self, arch):
        """
        Return ids of the builder groups usable for ``arch``.

        A false-y ``arch`` selects every configured group.
        """
        if not arch:
            return [group["id"] for group in self.opts.build_groups]

        # NOTE(review): arch_to_groups is a defaultdict(list), so this lookup
        # returns [] for an unknown arch and the KeyError branch below is
        # never taken.  Left as is: run() only catches NoVmAvailable, so
        # raising DispatchBuildError here would need handling there first.
        try:
            return self.arch_to_groups[arch]
        except KeyError:
            raise DispatchBuildError("Unknown architecture {0}".format(arch))

    def update_process_title(self, msg=None):
        """
        Set the process title to "Build dispatcher", optionally followed by
        " - <msg>".
        """
        proc_title = "Build dispatcher"
        if msg:
            proc_title += " - " + msg
        setproctitle(proc_title)

    def init_internal_structures(self):
        """
        Fill ``arch_to_groups`` and ``group_to_usermax`` from
        ``opts.build_groups``.
        """
        for group in self.opts.build_groups:
            group_id = group["id"]

            for arch in group["archs"]:
                self.arch_to_groups[arch].append(group_id)
                self.log.debug("mapping %s to %s group", arch, group_id)

            self.log.debug("user might use only %sVMs for %s group",
                           group["max_vm_per_user"], group_id)
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

    def load_jobs(self):
        """
        Retrieve the pending build jobs from frontend.

        Polls 'pending-jobs' until a non-empty answer arrives, sleeping
        ``opts.sleeptime`` seconds after every unsuccessful attempt.

        :return list: one BuildJob per non-empty task in the answer
        """
        self.log.info("Waiting for a job from frontend...")
        get_task_init_time = time.time()
        tasks = None

        while not tasks:
            self.update_process_title(
                "Waiting for jobs from frontend for {} s".format(
                    int(time.time() - get_task_init_time)))
            try:
                tasks = self.frontend_client.get('pending-jobs').json()
            except (FrontendClientException, ValueError) as error:
                self.log.exception(
                    "Retrieving build jobs from %s failed with error: %s",
                    self.opts.frontend_base_url, error)
            finally:
                # Back off both after an error and after an empty answer.
                if not tasks:
                    time.sleep(self.opts.sleeptime)

        job_ids = {task.get("task_id") for task in tasks if task}
        new_ids = job_ids - self.job_ids_previous_request
        if new_ids:
            self.log.info("Got new build jobs: %s", new_ids)
        self.job_ids_previous_request = job_ids

        return [BuildJob(task, self.opts) for task in tasks if task]

    def can_build_start(self, job):
        """
        Announce to the frontend that the build is starting. Frontend
        may reject build to start.

        Returns
        -------
        True if the build can start
        False if the build can not start (build is cancelled)
        """
        try:
            job.started_on = time.time()
            job.status = BuildStatus.STARTING
            can_build_start = self.frontend_client.starting_build(
                job.to_dict())
        except (FrontendClientException, ValueError) as error:
            self.log.exception(
                "Communication with Frontend to confirm build start failed with error: %s",
                error)
            return False

        if not can_build_start:
            # No exception is being handled here, so log.exception() would
            # append a meaningless "NoneType: None" traceback.
            self.log.error("Frontend forbade to start the job %s", job.task_id)

        return can_build_start

    def clean_finished_workers(self):
        """
        Join and drop every worker in ``self.workers`` that is not alive.
        """
        # Iterate over a snapshot: removing from the list being iterated
        # would make the loop skip the worker following each removed one.
        for worker in list(self.workers):
            if not worker.is_alive():
                worker.join(5)
                self.workers.remove(worker)
                self.log.info("Removed finished worker %s for job %s",
                              worker.worker_id, worker.job.task_id)

    def start_worker(self, vm, job, reattach=False):
        """
        Create a Worker for ``job`` on ``vm``, register it and start it.

        :return: the started worker
        """
        worker = Worker(
            opts=self.opts,
            vm_manager=self.vm_manager,
            worker_id=self.next_worker_id,
            vm=vm, job=job, reattach=reattach)
        self.workers.append(worker)
        # Worker ids wrap around at 2**15.
        self.next_worker_id = (self.next_worker_id + 1) % 2**15

        worker.start()
        return worker

    def run(self):
        """
        Executes build dispatching process.
        """
        self.log.info("Build dispatching started.")
        self.update_process_title()

        first_backend_loop = True

        while True:
            self.clean_finished_workers()

            # owner/arch/sandbox combinations for which no VM was available
            # in this round; further jobs with the same key are skipped
            skip_jobs_cache = {}

            for job in self.load_jobs():
                # first check if we do not have
                # worker already running for the job
                if any(job.task_id == w.job.task_id for w in self.workers):
                    self.log.debug("Skipping already running task '%s'",
                                   job.task_id)
                    continue

                if first_backend_loop:
                    # Server was restarted.  Some builds might be running on
                    # background on builders; so search db builder records for
                    # the job and if we found it, spawn a worker to reattach.
                    vm = self.vm_manager.get_vm_by_task_id(job.task_id)
                    if vm and vm.state == 'in_use':
                        self.log.info("Reattaching to VM: %s", vm)
                        worker = self.start_worker(vm, job, reattach=True)
                        vm.store_field(self.vm_manager.rc, "used_by_worker",
                                       worker.worker_id)
                        self.log.info("Reattached new worker %s for job %s",
                                      worker.worker_id, worker.job.task_id)
                        continue

                cache_entry = '{owner}-{arch}-{sandbox}'.format(
                    owner=job.project_owner,
                    arch=job.arch or "noarch",
                    sandbox=job.sandbox,
                )

                if cache_entry in skip_jobs_cache:
                    self.log.debug("Skipped job %s, cached", job)
                    continue

                # ... and if the task is new to us,
                # allocate new vm and run full build
                try:
                    vm_group_ids = self.get_vm_group_ids(job.arch)
                    self.log.debug("Picking VM from groups %s for job %s",
                                   vm_group_ids, job)
                    vm = self.vm_manager.acquire_vm(
                        vm_group_ids, job.project_owner, job.sandbox,
                        self.next_worker_id, job.task_id, job.build_id,
                        job.chroot)
                except NoVmAvailable as error:
                    skip_jobs_cache[cache_entry] = True
                    self.log.debug(
                        "No available resources for task %s (Reason: %s). Deferring job.",
                        job.task_id, error)
                    continue
                else:
                    self.log.info("VM %s for job %s successfully acquired",
                                  vm.vm_name, job.task_id)

                if not self.can_build_start(job):
                    # Frontend refused the build; give the VM back.
                    self.vm_manager.release_vm(vm.vm_name)
                    continue

                worker = self.start_worker(vm, job)
                self.log.info("Started new worker %s for job %s",
                              worker.worker_id, worker.job.task_id)

            first_backend_loop = False
            time.sleep(self.opts.sleeptime)
class TestFrontendClient(object):
    """Tests of FrontendClient request helpers, retry logic and wrappers."""

    def setup_method(self, method):
        self.opts = Munch(
            frontend_base_url="http://example.com/",
            frontend_auth="12345678",
        )
        self.fc = FrontendClient(self.opts)

        self.data = {
            "foo": "bar",
            "bar": [1, 3, 5],
        }
        self.url_path = "sub_path"

        self.build_id = 12345
        self.task_id = "12345-fedora-20-x86_64"
        self.chroot_name = "fedora-20-x86_64"

    @pytest.fixture
    def mask_frontend_request(self):
        """Swap the client's ``_frontend_request`` for a mock (``self.f_r``)."""
        self.f_r = MagicMock()
        self.fc._frontend_request = self.f_r

    def test_post_to_frontend(self, f_request_method):
        verb, http_call = f_request_method
        http_call.return_value.status_code = 200

        self.fc._frontend_request(self.url_path, self.data, method=verb)
        assert http_call.called

    def test_post_to_frontend_wrappers(self, f_request_method):
        verb, http_call = f_request_method
        http_call.return_value.status_code = 200

        # The 'get' wrapper takes no payload; the other wrappers do.
        if verb == 'get':
            wrapper_args = (self.url_path,)
        else:
            wrapper_args = (self.url_path, self.data)
        getattr(self.fc, verb)(*wrapper_args)

        assert http_call.called

    def test_post_to_frontend_not_200(self, post_req):
        post_req.return_value.status_code = 501
        with pytest.raises(FrontendClientRetryError):
            self.fc._frontend_request(self.url_path, self.data)

        assert post_req.called

    def test_post_to_frontend_post_error(self, post_req):
        post_req.side_effect = RequestException()
        with pytest.raises(FrontendClientRetryError):
            self.fc._frontend_request(self.url_path, self.data)

        assert post_req.called

    def test_post_to_frontend_repeated_first_try_ok(self, mask_frontend_request, mc_time):
        expected = "ok\n"
        self.f_r.return_value = expected
        mc_time.time.return_value = 0

        result = self.fc._post_to_frontend_repeatedly(self.data, self.url_path)
        assert result == expected
        assert not mc_time.sleep.called

    def test_post_to_frontend_repeated_second_try_ok(self, f_request_method,
                                                     mask_frontend_request, mc_time):
        verb, _http_call = f_request_method
        expected = "ok\n"
        # First attempt asks for a retry, second one succeeds.
        self.f_r.side_effect = [FrontendClientRetryError(), expected]
        mc_time.time.return_value = 0

        result = self.fc._frontend_request_repeatedly(
            self.url_path,
            data=self.data,
            method=verb)
        assert result == expected
        assert mc_time.sleep.called

    def test_post_to_frontend_err_400(self, post_req, mc_time):
        not_found = Response()
        not_found.status_code = 404
        not_found.reason = 'NOT FOUND'

        post_req.side_effect = [FrontendClientRetryError(), not_found]
        mc_time.time.return_value = 0

        with pytest.raises(FrontendClientException):
            assert self.fc._post_to_frontend_repeatedly(
                self.data, self.url_path) == not_found
        assert mc_time.sleep.called

    @mock.patch('backend.frontend.BACKEND_TIMEOUT', 100)
    def test_post_to_frontend_repeated_all_attempts_failed(
            self, mask_frontend_request, caplog, mc_time):
        # Mocked clock readings; the last one (1000) is past the patched
        # BACKEND_TIMEOUT of 100.
        mc_time.time.side_effect = [0, 0, 5, 5 + 10, 5 + 10 + 15, 5 + 10 + 15 + 20, 1000]
        self.f_r.side_effect = FrontendClientRetryError()

        with pytest.raises(FrontendClientException):
            self.fc._post_to_frontend_repeatedly(self.data, self.url_path)

        expected_sleeps = [mock.call(delay) for delay in (5, 10, 15, 20, 25)]
        assert mc_time.sleep.call_args_list == expected_sleeps
        assert len(caplog.records) == 5

    def test_post_to_frontend_repeated_indefinitely(self, mask_frontend_request,
                                                    caplog, mc_time):
        mc_time.time.return_value = 1
        self.fc.try_indefinitely = True

        retries = [FrontendClientRetryError() for _ in range(100)]
        # e.g. 501 eventually
        self.f_r.side_effect = retries + [FrontendClientException()]

        with pytest.raises(FrontendClientException):
            self.fc._post_to_frontend_repeatedly(self.data, self.url_path)

        assert mc_time.sleep.called
        assert len(caplog.records) == 100

    def test_reschedule_300(self, mask_frontend_request, post_req):
        redirect = Response()
        redirect.status_code = 302
        redirect.reason = 'whatever'
        post_req.side_effect = redirect

        with pytest.raises(FrontendClientException) as ex:
            self.fc.reschedule_all_running()

        assert 'Failed to reschedule builds' in str(ex)

    def test_update(self):
        repeated_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeated_post

        self.fc.update(self.data)
        assert repeated_post.call_args == mock.call(self.data, "update")

    def test_starting_build(self):
        repeated_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeated_post

        for answer in (True, False):
            repeated_post.return_value.json.return_value = {"can_start": answer}
            assert self.fc.starting_build(self.data) == answer

    def test_starting_build_err(self):
        repeated_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeated_post

        with pytest.raises(FrontendClientException):
            self.fc.starting_build(self.data)

    def test_starting_build_err_2(self):
        repeated_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeated_post
        # Response JSON lacks the "can_start" key.
        repeated_post.return_value.json.return_value = {}

        with pytest.raises(FrontendClientException):
            self.fc.starting_build(self.data)

    def test_reschedule_build(self):
        repeated_post = MagicMock()
        self.fc._post_to_frontend_repeatedly = repeated_post

        self.fc.reschedule_build(self.build_id, self.task_id, self.chroot_name)

        payload = {
            'build_id': self.build_id,
            'task_id': self.task_id,
            'chroot': self.chroot_name,
        }
        assert repeated_post.call_args == mock.call(payload, 'reschedule_build_chroot')
class CoprBackend(object):
    """
    Core process - starts/stops/initializes workers

    :param config_file: path to the backend configuration file
    :param ext_opts: additional options for backend
    """

    def __init__(self, config_file=None, ext_opts=None):
        # read in config file
        # put all the config items into a single self.opts munch

        if not config_file:
            raise CoprBackendError("Must specify config_file")

        self.config_file = config_file
        self.ext_opts = ext_opts  # to stow our cli options for read_conf()
        # group_id => list of Worker objects started for that group
        self.workers_by_group_id = defaultdict(list)
        # group_id => highest worker number handed out so far
        self.max_worker_num_by_group_id = defaultdict(int)

        self.config_reader = BackendConfigReader(self.config_file, self.ext_opts)
        self.opts = None
        self.update_conf()

        # group_id => connected task queue, filled by init_task_queues()
        self.task_queues = {}

        self.frontend_client = FrontendClient(self.opts)
        self.is_running = False

        self.log = get_redis_logger(self.opts, "backend.main", "backend")

    def clean_task_queues(self):
        """
        Make sure there is nothing in our task queues

        :raises CoprBackendError: when a queue connection fails
        """
        try:
            for queue in self.task_queues.values():
                while queue.length:
                    queue.dequeue()
        except ConnectionError:
            raise CoprBackendError(
                "Could not connect to a task queue. Is Redis running?")

    def init_task_queues(self):
        """
        Connect to the retask.Queue for each group_id. Remove old tasks from queues.

        :raises CoprBackendError: when a queue connection fails
        """
        try:
            for group in self.opts.build_groups:
                group_id = group["id"]
                queue = Queue("copr-be-{0}".format(group_id))
                queue.connect()
                self.task_queues[group_id] = queue
        except ConnectionError:
            raise CoprBackendError(
                "Could not connect to a task queue. Is Redis running?")

        self.clean_task_queues()

    def update_conf(self):
        """
        Update backend config from config file
        """
        self.opts = self.config_reader.read()

    def spin_up_workers_by_group(self, group):
        """
        Handles starting/growing the number of workers

        :param dict group: Builders group

        Utilized keys:
            - **id**
            - **max_workers**

        """
        group_id = group["id"]

        if len(self.workers_by_group_id[group_id]) < group["max_workers"]:
            self.log.info("Spinning up more workers")
            for _ in range(group["max_workers"] - len(self.workers_by_group_id[group_id])):
                self.max_worker_num_by_group_id[group_id] += 1
                try:
                    w = Worker(
                        opts=self.opts,
                        frontend_client=self.frontend_client,
                        worker_num=self.max_worker_num_by_group_id[group_id],
                        group_id=group_id
                    )

                    self.workers_by_group_id[group_id].append(w)
                    w.start()
                    time.sleep(0.3)
                    self.log.info("Started worker: {} for group: {}".format(w.worker_num, group_id))
                except Exception as error:
                    # Best effort: a failed start is logged and the loop
                    # carries on with the next worker.
                    self.log.exception("Failed to start new Worker: {}".format(error))

            self.log.info("Finished starting worker processes")

    def prune_dead_workers_by_group_id(self, group_id):
        """
        Removes dead workers from the pool

        :return list: alive workers

        :raises:
            :py:class:`~backend.exceptions.CoprBackendError` when got dead worker and
                option "exit_on_worker" is enabled
        """
        preserved_workers = []
        for w in self.workers_by_group_id[group_id]:
            if not w.is_alive():
                # Logger.warn() is a deprecated alias; warning() is the
                # supported spelling.
                self.log.warning("Worker {} died unexpectedly".format(w.worker_num))
                w.terminate()  # kill it with a fire
                if self.opts.exit_on_worker:
                    raise CoprBackendError(
                        "Worker died unexpectedly, exiting")
            else:
                preserved_workers.append(w)
        return preserved_workers

    def terminate(self):
        """
        Cleanup backend processes (just workers for now)
        And also clean all task queues as they would survive copr restart
        """

        self.is_running = False
        for group in self.opts.build_groups:
            group_id = group["id"]
            # Iterate over a copy, the list is modified inside the loop.
            for w in self.workers_by_group_id[group_id][:]:
                self.workers_by_group_id[group_id].remove(w)
                w.terminate_instance()
        self.clean_task_queues()

        try:
            self.log.info("Rescheduling unfinished builds before stop")
            self.frontend_client.reschedule_all_running()
        except RequestException as err:
            self.log.exception(err)
            return

    def run(self):
        """
        Starts backend process. Control sub process start/stop.
        """
        self.update_conf()
        self.init_task_queues()
        time.sleep(1)

        self.log.info("Initial config: {}".format(self.opts))

        try:
            self.log.info("Rescheduling old unfinished builds")
            self.frontend_client.reschedule_all_running()
        except RequestException as err:
            # Without a successful reschedule the main loop is not entered.
            self.log.exception(err)
            return

        self.is_running = True
        while self.is_running:
            # re-read config into opts
            self.update_conf()

            for group in self.opts.build_groups:
                group_id = group["id"]

                self.spin_up_workers_by_group(group)
                # FIXME - prune out workers
                # if len(self.workers) > self.opts.num_workers:
                #    killnum = len(self.workers) - self.opts.num_workers
                #    for w in self.workers[:killnum]:
                #       insert a poison pill? Kill after something? I dunno.
                # FIXME - if a worker bombs out - we need to check them
                # and startup a new one if it happens
                # check for dead workers and abort
                preserved_workers = self.prune_dead_workers_by_group_id(group_id)
                self.workers_by_group_id[group_id] = preserved_workers

            time.sleep(self.opts.sleeptime)