Example #1
0
 def __init__(self, opts, cmdline_opts=None):
     self.opts = opts
     self.prune_days = getattr(self.opts, "prune_days", DEF_DAYS)
     self.chroots = {}
     self.frontend_client = FrontendClient(self.opts)
     self.mtime_optimization = True
     if cmdline_opts:
         self.mtime_optimization = not cmdline_opts.no_mtime_optimization
Example #2
0
    def __init__(self, config_file=None, ext_opts=None):
        if not config_file:
            raise CoprBackendError("Must specify config_file")

        self.config_file = config_file
        self.ext_opts = ext_opts  # to stow our cli options for read_conf()

        self.config_reader = BackendConfigReader(self.config_file,
                                                 self.ext_opts)
        self.opts = None
        self.update_conf()

        self.log = get_redis_logger(self.opts, "backend.main", "backend")
        self.frontend_client = FrontendClient(self.opts, self.log)
Example #3
0
    def setup_method(self, method):
        self.opts = Munch(
            frontend_base_url="http://example.com/",
            frontend_auth="12345678",
        )
        self.fc = FrontendClient(self.opts)

        self.data = {
            "foo": "bar",
            "bar": [1, 3, 5],
        }
        self.url_path = "sub_path"

        self.build_id = 12345
        self.chroot_name = "fedora-20-x86_64"
Example #4
0
    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="build-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.build_dispatcher",
                                    "build_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
        self.vm_manager = VmManager(self.opts)

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()

        self.init_internal_structures()
Example #5
0
    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="build-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.build_dispatcher", "build_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
        self.vm_manager = VmManager(self.opts)
        self.workers = []
        self.next_worker_id = 1

        self.arch_to_groups = defaultdict(list)
        # PC => max N builders per user
        self.group_to_usermax = dict()

        self.init_internal_structures()
Example #6
0
    def run(self):
        """
        Executes action dispatching process.
        """
        self.log.info("Action dispatching started.")
        self.update_process_title()

        redis = get_redis_connection(self.opts)
        worker_manager = ActionWorkerManager(
            redis_connection=redis,
            log=self.log,
            max_workers=self.opts.actions_max_workers)
        worker_manager.frontend_client = FrontendClient(self.opts, self.log)

        timeout = self.opts.sleeptime

        while True:
            self.log.info("getting actions from frontend")
            start = time.time()
            for task in self.get_frontend_actions():
                worker_manager.add_task(task)

            # Execute the actions.
            worker_manager.run(timeout=timeout)

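            # Sleep only for whatever remains of the polling interval; if this
            # iteration already took longer, poll the frontend again right away.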
            sleep_more = timeout - (time.time() - start)
            if sleep_more > 0:
                time.sleep(sleep_more)
Example #7
0
    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="action-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.action_dispatcher",
                                    "action_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
Example #8
0
class CoprBackend(object):
    """
    Core process - starts/stops dispatchers for actions and builds

    :param config_file: path to the backend configuration file
    :param ext_opts: additional options for backend
    """
    def __init__(self, config_file=None, ext_opts=None):
        if not config_file:
            raise CoprBackendError("Must specify config_file")

        self.config_file = config_file
        self.ext_opts = ext_opts  # to stow our cli options for read_conf()

        self.config_reader = BackendConfigReader(self.config_file,
                                                 self.ext_opts)
        self.opts = None
        self.update_conf()

        self.log = get_redis_logger(self.opts, "backend.main", "backend")
        self.frontend_client = FrontendClient(self.opts, self.log)

    def update_conf(self):
        """
        Update backend config from config file
        """
        self.opts = self.config_reader.read()

    def run(self):
        """
        Starts the backend process. Controls sub-process start/stop.
        """
        self.update_conf()
        self.log.info("Initial config: {}".format(self.opts))

        try:
            self.log.info("Rescheduling old unfinished builds")
            self.frontend_client.reschedule_all_running(120)  # 10 minutes
        except RequestException as err:
            self.log.exception(err)
            raise CoprBackendError(err)

        build_dispatcher = BuildDispatcher(self.opts)
        action_dispatcher = ActionDispatcher(self.opts)

        build_dispatcher.start()
        action_dispatcher.start()
Example #9
0
class CoprBackend(object):
    """
    Core process - starts/stops dispatchers for actions and builds

    :param config_file: path to the backend configuration file
    :param ext_opts: additional options for backend
    """

    def __init__(self, config_file=None, ext_opts=None):
        if not config_file:
            raise CoprBackendError("Must specify config_file")

        self.config_file = config_file
        self.ext_opts = ext_opts  # to stow our cli options for read_conf()

        self.config_reader = BackendConfigReader(self.config_file, self.ext_opts)
        self.opts = None
        self.update_conf()

        self.log = get_redis_logger(self.opts, "backend.main", "backend")
        self.frontend_client = FrontendClient(self.opts, self.log)

    def update_conf(self):
        """
        Update backend config from config file
        """
        self.opts = self.config_reader.read()

    def run(self):
        """
        Starts the backend process. Controls sub-process start/stop.
        """
        self.update_conf()
        self.log.info("Initial config: {}".format(self.opts))

        try:
            self.log.info("Rescheduling old unfinished builds")
            self.frontend_client.reschedule_all_running(120) # 10 minutes
        except RequestException as err:
            self.log.exception(err)
            raise CoprBackendError(err)

        build_dispatcher = BuildDispatcher(self.opts)
        action_dispatcher = ActionDispatcher(self.opts)

        build_dispatcher.start()
        action_dispatcher.start()
Example #10
0
    def __init__(self, opts):
        """ base class initialization """

        self.opts = opts

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group_id_map = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()
        # task_id -> task dict
        self.added_jobs_dict = dict()

        self.rc = None
        self.channel = None
        self.ps_thread = None

        self.log = get_redis_logger(self.opts, "backend.job_grab", "job_grab")
        self.jg_control = jobgrabcontrol.Channel(self.opts, self.log)
        self.frontend_client = FrontendClient(self.opts, self.log)
Example #11
0
class CoprBackend(object):
    """
    COPR backend head process.

    :param config_file: path to the backend configuration file
    :param ext_opts: additional options for backend
    """
    def __init__(self, config_file=None, ext_opts=None):
        if not config_file:
            raise CoprBackendError("Must specify config_file")

        self.config_file = config_file
        self.ext_opts = ext_opts  # to stow our cli options for read_conf()

        self.config_reader = BackendConfigReader(self.config_file,
                                                 self.ext_opts)
        self.opts = None
        self.update_conf()

        self.log = get_redis_logger(self.opts, "backend.main", "backend")
        self.frontend_client = FrontendClient(self.opts, self.log)

    def update_conf(self):
        """
        Update backend config from config file
        """
        self.opts = self.config_reader.read()

    def run(self):
        """
        Starts the backend process. Controls sub-process start/stop.
        """
        self.update_conf()
        self.log.info("Initial config: %s", self.opts)

        try:
            self.log.info("Rescheduling old unfinished builds")
            self.frontend_client.reschedule_all_running()
        except FrontendClientException as err:
            self.log.exception(err)
            raise CoprBackendError(err)
Example #12
0
    def __init__(self, config_file=None, ext_opts=None):
        if not config_file:
            raise CoprBackendError("Must specify config_file")

        self.config_file = config_file
        self.ext_opts = ext_opts  # to stow our cli options for read_conf()

        self.config_reader = BackendConfigReader(self.config_file, self.ext_opts)
        self.opts = None
        self.update_conf()

        self.log = get_redis_logger(self.opts, "backend.main", "backend")
        self.frontend_client = FrontendClient(self.opts, self.log)
Example #13
0
    def __init__(self, config_file=None, ext_opts=None):
        # read in config file
        # put all the config items into a single self.opts munch

        if not config_file:
            raise CoprBackendError("Must specify config_file")

        self.config_file = config_file
        self.ext_opts = ext_opts  # to stow our cli options for read_conf()
        self.workers_by_group_id = defaultdict(list)
        self.max_worker_num_by_group_id = defaultdict(int)

        self.config_reader = BackendConfigReader(self.config_file,
                                                 self.ext_opts)
        self.opts = None
        self.update_conf()

        self.task_queues = {}

        self.frontend_client = FrontendClient(self.opts)
        self.is_running = False

        self.log = get_redis_logger(self.opts, "backend.main", "backend")
Example #14
0
    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="build-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.build_dispatcher", "build_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
        self.vm_manager = VmManager(self.opts)

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()

        self.init_internal_structures()
Example #15
0
    def setup_method(self, method):
        self.opts = Munch(
            frontend_base_url="http://example.com/",
            frontend_auth="12345678",
        )
        self.fc = FrontendClient(self.opts)

        self.data = {
            "foo": "bar",
            "bar": [1, 3, 5],
        }
        self.url_path = "sub_path"

        self.build_id = 12345
        self.chroot_name = "fedora-20-x86_64"
Example #16
0
    def __init__(self, opts):
        """ base class initialization """

        self.opts = opts

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group_id_map = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()
        # task_id -> task dict
        self.added_jobs_dict = dict()

        self.rc = None
        self.channel = None
        self.ps_thread = None

        self.log = get_redis_logger(self.opts, "backend.job_grab", "job_grab")
        self.jg_control = jobgrabcontrol.Channel(self.opts, self.log)
        self.frontend_client = FrontendClient(self.opts, self.log)
Example #17
0
    def __init__(self, config_file=None, ext_opts=None):
        # read in config file
        # put all the config items into a single self.opts munch

        if not config_file:
            raise CoprBackendError("Must specify config_file")

        self.config_file = config_file
        self.ext_opts = ext_opts  # to stow our cli options for read_conf()
        self.workers_by_group_id = defaultdict(list)
        self.max_worker_num_by_group_id = defaultdict(int)

        self.config_reader = BackendConfigReader(self.config_file, self.ext_opts)
        self.opts = None
        self.update_conf()

        self.task_queues = {}

        self.frontend_client = FrontendClient(self.opts)
        self.is_running = False

        self.log = get_redis_logger(self.opts, "backend.main", "backend")
Example #18
0
class CoprJobGrab(object):

    """
    Fetch jobs from the Frontend

        - submit build task to the jobs queue for workers
        - run Action handler for action tasks


    :param Munch opts: backend config
    :param lock: :py:class:`multiprocessing.Lock` global backend lock

    TODO: Not yet fully ready for config reload.
    """

    def __init__(self, opts):
        """ base class initialization """

        self.opts = opts

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group_id_map = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()
        # task_id -> task dict
        self.added_jobs_dict = dict()

        self.rc = None
        self.channel = None
        self.ps_thread = None

        self.log = get_redis_logger(self.opts, "backend.job_grab", "job_grab")
        self.jg_control = jobgrabcontrol.Channel(self.opts, self.log)
        self.frontend_client = FrontendClient(self.opts, self.log)


    def group(self, arch):
        try:
            return self.arch_to_group_id_map[arch]
        except KeyError:
            raise CoprJobGrabError("Unknown architecture {0}".format(arch))


    def listen_to_pubsub(self):
        """
        Listens for job reschedule queries. Spawns self.ps_thread, don't forget to stop it.
        """
        self.rc = get_redis_connection(self.opts)
        self.channel = self.rc.pubsub(ignore_subscribe_messages=True)

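        # Register on_pubsub_event() as the handler for the JOB_GRAB_TASK_END_PUBSUB channel.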
        self.channel.subscribe(**{JOB_GRAB_TASK_END_PUBSUB: self.on_pubsub_event})
        self.ps_thread = self.channel.run_in_thread(sleep_time=0.05)

        self.log.info("Subscribed to {} channel".format(JOB_GRAB_TASK_END_PUBSUB))


    def route_build_task(self, task):
        """
        Route build task to the appropriate queue.
        :param task: dict-like object which represent build task

        Utilized **task** keys:

            - ``task_id``
            - ``chroot``
            - ``arch``

        :return int: Count of the successfully routed tasks
        """
        count = 0
        if "task_id" in task:
            if task["task_id"] not in self.added_jobs_dict:
                arch = task["chroot"].split("-")[2]
                group = self.group(arch)

                username = task["project_owner"]
                active_jobs_count = len([t for t_id, t in self.added_jobs_dict.items()
                                         if t["project_owner"] == username])

                if active_jobs_count > self.group_to_usermax[group]:
                    self.log.debug("User can not acquire more VM (active builds #{0}), "
                                   "don't schedule more tasks".format(active_jobs_count))
                    return 0

                msg = "enqueue task for user {0}: id={1}, arch={2}, group={3}, active={4}"
                self.log.debug(msg.format(username, task["task_id"], arch, group, active_jobs_count))

                # Add both to local list and control channel queue.
                self.added_jobs_dict[task["task_id"]] = task
                self.jg_control.add_build(group, task)
                count += 1

        else:
            self.log.info("Task missing field `task_id`, raw task: {}".format(task))
        return count

    def process_action(self, action):
        """
        Run action task handler, see :py:class:`~backend.action.Action`

        :param action: dict-like object with action task
        """
        ao = Action(self.opts, action, frontend_client=self.frontend_client)
        ao.run()

    def load_tasks(self):
        """
        Retrieve tasks from frontend and runs appropriate handlers
        """
        try:
            r = get("{0}/backend/waiting/".format(self.opts.frontend_base_url),
                    auth=("user", self.opts.frontend_auth))
        except RequestException as e:
            self.log.exception("Error retrieving jobs from {}: {}"
                               .format(self.opts.frontend_base_url, e))
            return

        try:
            r_json = r.json()
        except ValueError as e:
            self.log.exception("Error getting JSON build list from FE {0}".format(e))
            return

        if r_json.get("builds"):
            self.log.debug("{0} jobs returned".format(len(r_json["builds"])))
            count = 0
            for task in r_json["builds"]:
                try:
                    count += self.route_build_task(task)
                except CoprJobGrabError as err:
                    self.log.exception("Failed to enqueue new job: {} with error: {}".format(task, err))

            if count:
                self.log.info("New build jobs: %s" % count)

        if r_json.get("actions"):
            count = 0
            self.log.info("{0} actions returned".format(len(r_json["actions"])))

            for action in r_json["actions"]:
                start = time.time()
                try:
                    self.process_action(action)
                except Exception as error:
                    self.log.exception("Error during processing action `{}`: {}".format(action, error))
                if time.time() - start > 2*self.opts.sleeptime:
                    # we are processing actions for too long, stop and fetch everything again (including new builds)
                    break


    def on_pubsub_event(self, raw):
        # from celery.contrib import rdb; rdb.set_trace()
        if raw is None:
            return
        if "type" not in raw or raw["type"] != "message":
            self.log.warn("Missing type or wrong type in pubsub msg: {}, ignored".format(raw))
            return
        try:
            msg = json.loads(raw["data"])
            # msg: {"action": ("remove"|"reschedule"), "task_id": ..., "build_id"..., "chroot": ...}
            # Actions: "remove" simply remove `task_id` from self.added_job
            #          "reschedule" additionally call frontend and set pending state before removal
            if "action" not in msg:
                self.log.warn("Missing required field `action`, msg ignored: {}".format(msg))
                return
            action = msg["action"]
            if action not in ["remove", "reschedule"]:
                self.log.warn("Action `{}` not allowed, msg ignored: {} ".format(action, msg))
                return

            if "task_id" not in msg:
                self.log.warn("Missing required field `task_id`, msg ignored: {}".format(msg))
                return

            task_id = msg["task_id"]
            if action == "reschedule" and "build_id" in msg and "chroot" in msg:
                # TODO: dirty dependency to frontend, Job management should be re-done
                self.log.info("Rescheduling task `{}`".format(task_id))
                self.frontend_client.reschedule_build(msg["build_id"], msg["chroot"])

            if task_id not in self.added_jobs_dict:
                self.log.debug("Task `{}` not present in added jobs,  msg ignored: {}".format(task_id, msg))
                return

            if action in ["remove", "reschedule"]:
                self.added_jobs_dict.pop(task_id)
                self.log.info("Removed task `{}` from added_jobs".format(task_id))

        except Exception as err:
            self.log.exception("Error receiving message from remove pubsub: raw msg: {}, error: {}"
                               .format(raw, err))

    def log_queue_info(self):
        if self.added_jobs_dict:
            self.log.debug("Added jobs after remove and load: {}".format(self.added_jobs_dict))
            self.log.debug("# of executed jobs: {}".format(len(self.added_jobs_dict)))


    def init_internal_structures(self):
        self.arch_to_group_id_map = dict()
        self.group_to_usermax = dict()
        for group in self.opts.build_groups:
            group_id = group["id"]
            for arch in group["archs"]:
                self.arch_to_group_id_map[arch] = group_id
                self.log.debug("mapping {0} to {1} group".format(arch, group_id))

            self.log.debug("user might use only {0}VMs for {1} group".format(group["max_vm_per_user"], group_id))
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

        self.added_jobs_dict = dict()


    def handle_control_channel(self):
        if not self.jg_control.backend_started():
            return
        self.log.info("backend gave us signal to start")
        self.init_internal_structures()
        self.jg_control.remove_all_builds()
        self.jg_control.job_graber_initialized()

    def run(self):
        """
        Starts job grabber process
        """
        setproctitle("CoprJobGrab")
        self.listen_to_pubsub()

        self.log.info("JobGrub started.")

        self.init_internal_structures()
        try:
            while True:
                try:
                    # This effectively delays job_grabbing until backend
                    # gives us a signal to start.
                    self.handle_control_channel()
                    self.load_tasks()
                    self.log_queue_info()
                    time.sleep(self.opts.sleeptime)
                except Exception as err:
                    self.log.exception("Job Grab unhandled exception: {}".format(err))

        except KeyboardInterrupt:
            return

    def terminate(self):
        if self.ps_thread:
            self.ps_thread.stop()
            self.ps_thread.join()
        super(CoprJobGrab, self).terminate()
Example #19
0
class TestFrontendClient(object):

    def setup_method(self, method):
        self.opts = Munch(
            frontend_base_url="http://example.com/",
            frontend_auth="12345678",
        )
        self.fc = FrontendClient(self.opts)

        self.data = {
            "foo": "bar",
            "bar": [1, 3, 5],
        }
        self.url_path = "sub_path"

        self.build_id = 12345
        self.chroot_name = "fedora-20-x86_64"

    @pytest.fixture
    def mask_post_to_fe(self):
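        # Swap the low-level _post_to_frontend() helper for a mock so tests can
        # control its return value and side effects.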
        self.ptf = MagicMock()
        self.fc._post_to_frontend = self.ptf

    def test_post_to_frontend(self, post_req):
        post_req.return_value.status_code = 200
        self.fc._post_to_frontend(self.data, self.url_path)

        assert post_req.called

    def test_post_to_frontend_not_200(self, post_req):
        post_req.return_value.status_code = 501
        with pytest.raises(RequestException):
            self.fc._post_to_frontend(self.data, self.url_path)

        assert post_req.called

    def test_post_to_frontend_post_error(self, post_req):
        post_req.side_effect = RequestException()
        with pytest.raises(RequestException):
            self.fc._post_to_frontend(self.data, self.url_path)

        assert post_req.called

    def test_post_to_frontend_repeated_first_try_ok(self, mask_post_to_fe, mc_time):
        response = "ok\n"
        self.ptf.return_value = response

        assert self.fc._post_to_frontend_repeatedly(self.data, self.url_path) == response
        assert not mc_time.sleep.called

    def test_post_to_frontend_repeated_second_try_ok(self, mask_post_to_fe, mc_time):
        response = "ok\n"
        self.ptf.side_effect = [
            RequestException(),
            response,
        ]

        assert self.fc._post_to_frontend_repeatedly(self.data, self.url_path) == response
        assert mc_time.sleep.called

    def test_post_to_frontend_repeated_all_attempts_failed(self, mask_post_to_fe, mc_time):
        self.ptf.side_effect = RequestException()

        with pytest.raises(RequestException):
            self.fc._post_to_frontend_repeatedly(self.data, self.url_path)

        assert mc_time.sleep.called

    def test_update(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        self.fc.update(self.data)
        assert ptfr.call_args == mock.call(self.data, "update")

    def test_starting_build(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        for val in [True, False]:
            ptfr.return_value.json.return_value = {"can_start": val}

            assert self.fc.starting_build(self.build_id, self.chroot_name) == val

    def test_starting_build_err(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr

        with pytest.raises(RequestException):
            self.fc.starting_build(self.build_id, self.chroot_name)

    def test_starting_build_err_2(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        ptfr.return_value.json.return_value = {}

        with pytest.raises(RequestException):
            self.fc.starting_build(self.build_id, self.chroot_name)

    def test_reschedule_build(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        self.fc.reschedule_build(self.build_id, self.chroot_name)
        expected = mock.call({'build_id': self.build_id, 'chroot': self.chroot_name},
                             'reschedule_build_chroot')
        assert ptfr.call_args == expected
Example #20
0
class BuildDispatcher(multiprocessing.Process):
    """
    1) Fetch build task from frontend
    2) Get a free VM for it
    3) Create a worker for the job
    4) Start it asynchronously and go to 1)
    """

    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="build-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.build_dispatcher", "build_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
        self.vm_manager = VmManager(self.opts)

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()

        self.init_internal_structures()

    def get_vm_group_id(self, arch):
        try:
            return self.arch_to_group[arch]
        except KeyError:
            raise DispatchBuildError("Unknown architecture {0}".format(arch))

    def update_process_title(self, msg=None):
        proc_title = "Build dispatcher"
        if msg:
            proc_title += " - " + msg
        setproctitle(proc_title)

    def init_internal_structures(self):
        self.arch_to_group = dict()
        self.group_to_usermax = dict()
        for group in self.opts.build_groups:
            group_id = group["id"]
            for arch in group["archs"]:
                self.arch_to_group[arch] = group_id
                self.log.debug("mapping {0} to {1} group".format(arch, group_id))

            self.log.debug("user might use only {0} VMs for {1} group".format(group["max_vm_per_user"], group_id))
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

    def load_job(self):
        """
        Retrieve a single build job from frontend.
        """
        self.log.info("Waiting for a job from frontend...")
        get_task_init_time = time.time()

        task = None
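        # Poll the frontend until it hands out a build task, sleeping between attempts.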
        while not task:
            self.update_process_title("Waiting for a job from frontend for {} s"
                                      .format(int(time.time() - get_task_init_time)))
            try:
                r = get("{0}/backend/waiting/".format(self.opts.frontend_base_url),
                        auth=("user", self.opts.frontend_auth))
                task = r.json().get("build")
            except (RequestException, ValueError) as error:
                self.log.exception("Retrieving build job from {} failed with error: {}"
                                   .format(self.opts.frontend_base_url, error))
            finally:
                if not task:
                    time.sleep(self.opts.sleeptime)

        self.log.info("Got new build job {}".format(task['task_id']))
        return BuildJob(task, self.opts)

    def acquire_vm_for_job(self, job, vm_group_id):
        # Same acquire_vm() call as used directly in run() below.
        return self.vm_manager.acquire_vm(vm_group_id, job.project_owner, os.getpid(),
                                          job.task_id, job.build_id, job.chroot)

    def can_build_start(self, job):
        """
        Announce to the frontend that the build is going to start so that
        it can confirm that and draw out another job for building.

        Returns
        -------
        True if the build can start
        False if the build can not start (build is cancelled)
        """
        try:
            can_build_start = self.frontend_client.starting_build(job.build_id, job.chroot)
        except (RequestException, ValueError) as error:
            self.log.exception("Communication with Frontend to confirm build start failed with error: {}".format(error))
            return False

        if not can_build_start:
            self.log.exception("Frontend forbade to start the job {}".format(self.job.task_id))

        return can_build_start

    def join_finished_workers(self, workers):
        for worker in list(workers):  # iterate over a snapshot, items are removed below
            if not worker.is_alive():
                worker.join(5)
                workers.remove(worker)
                self.log.info("Removed finished worker {} for job {}"
                              .format(worker.worker_id, worker.job.task_id))

    def run(self):
        """
        Executes build dispatching process.
        """
        self.log.info("Build dispatching started.")
        self.update_process_title()

        workers = []
        next_worker_id = 1
        while True:
            self.join_finished_workers(workers)

            job = self.load_job()

            try:
                self.log.info("Acquiring VM for job {}...".format(str(job)))
                vm_group_id = self.get_vm_group_id(job.arch)
                vm = self.vm_manager.acquire_vm(vm_group_id, job.project_owner, os.getpid(),
                                                job.task_id, job.build_id, job.chroot)
            except NoVmAvailable as error:
                self.log.info("No available resources for task {} (Reason: {}). Deferring job."
                              .format(job.task_id, error))
                self.frontend_client.defer_build(job.build_id, job.chroot)
                continue
            else:
                self.log.info("VM {} for job {} successfully acquired".format(vm.vm_name, job.task_id))

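            # Ask the frontend to confirm the build start; if it refuses (e.g. the
            # build was cancelled meanwhile), release the VM and fetch another job.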
            if not self.can_build_start(job):
                self.vm_manager.release_vm(vm.vm_name)
                continue

            worker = Worker(
                opts=self.opts,
                frontend_client=self.frontend_client,
                vm_manager=self.vm_manager,
                worker_id=next_worker_id,
                vm=vm, job=job
            )
            worker.start()
            workers.append(worker)
            self.log.info("Started new worker {} for job {}"
                          .format(worker.worker_id, worker.job.task_id))
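            # Wrap the worker id at 2**15 so it stays a small, bounded number.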
            next_worker_id = (next_worker_id + 1) % 2**15
Example #21
0
class CoprJobGrab(object):
    """
    Fetch jobs from the Frontend

        - submit build task to the jobs queue for workers
        - run Action handler for action tasks


    :param Munch opts: backend config
    :param lock: :py:class:`multiprocessing.Lock` global backend lock

    TODO: Not yet fully ready for config reload.
    """
    def __init__(self, opts):
        """ base class initialization """

        self.opts = opts

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group_id_map = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()
        # task_id -> task dict
        self.added_jobs_dict = dict()

        self.rc = None
        self.channel = None
        self.ps_thread = None

        self.log = get_redis_logger(self.opts, "backend.job_grab", "job_grab")
        self.jg_control = jobgrabcontrol.Channel(self.opts, self.log)
        self.frontend_client = FrontendClient(self.opts, self.log)

    def group(self, arch):
        try:
            return self.arch_to_group_id_map[arch]
        except KeyError:
            raise CoprJobGrabError("Unknown architecture {0}".format(arch))

    def listen_to_pubsub(self):
        """
        Listens for job reschedule queries. Spawns self.ps_thread, don't forget to stop it.
        """
        self.rc = get_redis_connection(self.opts)
        self.channel = self.rc.pubsub(ignore_subscribe_messages=True)

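        # Register on_pubsub_event() as the handler for the JOB_GRAB_TASK_END_PUBSUB channel.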
        self.channel.subscribe(
            **{JOB_GRAB_TASK_END_PUBSUB: self.on_pubsub_event})
        self.ps_thread = self.channel.run_in_thread(sleep_time=0.05)

        self.log.info(
            "Subscribed to {} channel".format(JOB_GRAB_TASK_END_PUBSUB))

    def route_build_task(self, task):
        """
        Route build task to the appropriate queue.
        :param task: dict-like object which represent build task

        Utilized **task** keys:

            - ``task_id``
            - ``chroot``
            - ``arch``

        :return int: Count of the successfully routed tasks
        """
        count = 0
        if "task_id" in task:
            if task["task_id"] not in self.added_jobs_dict:
                arch = task["chroot"].split("-")[2]
                group = self.group(arch)

                username = task["project_owner"]
                active_jobs_count = len([
                    t for t_id, t in self.added_jobs_dict.items()
                    if t["project_owner"] == username
                ])

                if active_jobs_count > self.group_to_usermax[group]:
                    self.log.debug(
                        "User can not acquire more VM (active builds #{0}), "
                        "don't schedule more tasks".format(active_jobs_count))
                    return 0

                msg = "enqueue task for user {0}: id={1}, arch={2}, group={3}, active={4}"
                self.log.debug(
                    msg.format(username, task["task_id"], arch, group,
                               active_jobs_count))

                # Add both to local list and control channel queue.
                self.added_jobs_dict[task["task_id"]] = task
                self.jg_control.add_build(group, task)
                count += 1

        else:
            self.log.info(
                "Task missing field `task_id`, raw task: {}".format(task))
        return count

    def process_action(self, action):
        """
        Run action task handler, see :py:class:`~backend.action.Action`

        :param action: dict-like object with action task
        """
        ao = Action(self.opts, action, frontend_client=self.frontend_client)
        ao.run()

    def load_tasks(self):
        """
        Retrieve tasks from frontend and runs appropriate handlers
        """
        try:
            r = get("{0}/backend/waiting/".format(self.opts.frontend_base_url),
                    auth=("user", self.opts.frontend_auth))
        except RequestException as e:
            self.log.exception("Error retrieving jobs from {}: {}".format(
                self.opts.frontend_base_url, e))
            return

        try:
            r_json = r.json()
        except ValueError as e:
            self.log.exception(
                "Error getting JSON build list from FE {0}".format(e))
            return

        if r_json.get("builds"):
            self.log.debug("{0} jobs returned".format(len(r_json["builds"])))
            count = 0
            for task in r_json["builds"]:
                try:
                    count += self.route_build_task(task)
                except CoprJobGrabError as err:
                    self.log.exception(
                        "Failed to enqueue new job: {} with error: {}".format(
                            task, err))

            if count:
                self.log.info("New build jobs: %s" % count)

        if r_json.get("actions"):
            count = 0
            self.log.info("{0} actions returned".format(len(
                r_json["actions"])))

            for action in r_json["actions"]:
                start = time.time()
                try:
                    self.process_action(action)
                except Exception as error:
                    self.log.exception(
                        "Error during processing action `{}`: {}".format(
                            action, error))
                if time.time() - start > 2 * self.opts.sleeptime:
                    # we are processing actions for too long, stop and fetch everything again (including new builds)
                    break

    def on_pubsub_event(self, raw):
        # from celery.contrib import rdb; rdb.set_trace()
        if raw is None:
            return
        if "type" not in raw or raw["type"] != "message":
            self.log.warn(
                "Missing type or wrong type in pubsub msg: {}, ignored".format(
                    raw))
            return
        try:
            msg = json.loads(raw["data"])
            # msg: {"action": ("remove"|"reschedule"), "task_id": ..., "build_id"..., "chroot": ...}
            # Actions: "remove" simply remove `task_id` from self.added_job
            #          "reschedule" additionally call frontend and set pending state before removal
            if "action" not in msg:
                self.log.warn(
                    "Missing required field `action`, msg ignored: {}".format(
                        msg))
                return
            action = msg["action"]
            if action not in ["remove", "reschedule"]:
                self.log.warn(
                    "Action `{}` not allowed, msg ignored: {} ".format(
                        action, msg))
                return

            if "task_id" not in msg:
                self.log.warn(
                    "Missing required field `task_id`, msg ignored: {}".format(
                        msg))
                return

            task_id = msg["task_id"]
            if action == "reschedule" and "build_id" in msg and "chroot" in msg:
                # TODO: dirty dependency to frontend, Job management should be re-done
                self.log.info("Rescheduling task `{}`".format(task_id))
                self.frontend_client.reschedule_build(msg["build_id"],
                                                      msg["chroot"])

            if task_id not in self.added_jobs_dict:
                self.log.debug(
                    "Task `{}` not present in added jobs,  msg ignored: {}".
                    format(task_id, msg))
                return

            if action in ["remove", "reschedule"]:
                self.added_jobs_dict.pop(task_id)
                self.log.info(
                    "Removed task `{}` from added_jobs".format(task_id))

        except Exception as err:
            self.log.exception(
                "Error receiving message from remove pubsub: raw msg: {}, error: {}"
                .format(raw, err))

    def log_queue_info(self):
        if self.added_jobs_dict:
            self.log.debug("Added jobs after remove and load: {}".format(
                self.added_jobs_dict))
            self.log.debug("# of executed jobs: {}".format(
                len(self.added_jobs_dict)))

    def init_internal_structures(self):
        self.arch_to_group_id_map = dict()
        self.group_to_usermax = dict()
        for group in self.opts.build_groups:
            group_id = group["id"]
            for arch in group["archs"]:
                self.arch_to_group_id_map[arch] = group_id
                self.log.debug("mapping {0} to {1} group".format(
                    arch, group_id))

            self.log.debug("user might use only {0}VMs for {1} group".format(
                group["max_vm_per_user"], group_id))
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

        self.added_jobs_dict = dict()

    def handle_control_channel(self):
        if not self.jg_control.backend_started():
            return
        self.log.info("backend gave us signal to start")
        self.init_internal_structures()
        self.jg_control.remove_all_builds()
        self.jg_control.job_graber_initialized()

    def run(self):
        """
        Starts job grabber process
        """
        setproctitle("CoprJobGrab")
        self.listen_to_pubsub()

        self.log.info("JobGrub started.")

        self.init_internal_structures()
        try:
            while True:
                try:
                    # This effectively delays job_grabbing until backend
                    # gives us a signal to start.
                    self.handle_control_channel()
                    self.load_tasks()
                    self.log_queue_info()
                    time.sleep(self.opts.sleeptime)
                except Exception as err:
                    self.log.exception(
                        "Job Grab unhandled exception: {}".format(err))

        except KeyboardInterrupt:
            return

    def terminate(self):
        if self.ps_thread:
            self.ps_thread.stop()
            self.ps_thread.join()
        super(CoprJobGrab, self).terminate()
Example #22
0
def main():
    opts = get_backend_opts()
    fc = FrontendClient(opts)
    grabber = CoprJobGrab(opts, frontend_client=fc)
    grabber.run()
Example #23
0
class CoprBackend(object):
    """
    Core process - starts/stops/initializes workers


    :param config_file: path to the backend configuration file
    :param ext_opts: additional options for backend
    """
    def __init__(self, config_file=None, ext_opts=None):
        # read in config file
        # put all the config items into a single self.opts munch

        if not config_file:
            raise CoprBackendError("Must specify config_file")

        self.config_file = config_file
        self.ext_opts = ext_opts  # to stow our cli options for read_conf()
        self.workers_by_group_id = defaultdict(list)
        self.max_worker_num_by_group_id = defaultdict(int)

        self.config_reader = BackendConfigReader(self.config_file,
                                                 self.ext_opts)
        self.opts = None
        self.update_conf()

        self.task_queues = {}

        self.frontend_client = FrontendClient(self.opts)
        self.is_running = False

        self.log = get_redis_logger(self.opts, "backend.main", "backend")

    def clean_task_queues(self):
        """
        Make sure there is nothing in our task queues
        """
        try:
            for queue in self.task_queues.values():
                while queue.length:
                    queue.dequeue()
        except ConnectionError:
            raise CoprBackendError(
                "Could not connect to a task queue. Is Redis running?")

    def init_task_queues(self):
        """
        Connect to the retask.Queue for each group_id. Remove old tasks from queues.
        """
        try:
            for group in self.opts.build_groups:
                group_id = group["id"]
                queue = Queue("copr-be-{0}".format(group_id))
                queue.connect()
                self.task_queues[group_id] = queue
        except ConnectionError:
            raise CoprBackendError(
                "Could not connect to a task queue. Is Redis running?")

        self.clean_task_queues()

    def update_conf(self):
        """
        Update backend config from config file
        """
        self.opts = self.config_reader.read()

    def spin_up_workers_by_group(self, group):
        """
        Handles starting/growing the number of workers

        :param dict group: Builders group

        Utilized keys:
            - **id**
            - **max_workers**

        """
        group_id = group["id"]

        if len(self.workers_by_group_id[group_id]) < group["max_workers"]:
            self.log.info("Spinning up more workers")
            for _ in range(group["max_workers"] -
                           len(self.workers_by_group_id[group_id])):
                self.max_worker_num_by_group_id[group_id] += 1
                try:
                    w = Worker(
                        opts=self.opts,
                        frontend_client=self.frontend_client,
                        worker_num=self.max_worker_num_by_group_id[group_id],
                        group_id=group_id)

                    self.workers_by_group_id[group_id].append(w)
                    w.start()
                    time.sleep(0.3)
                    self.log.info("Started worker: {} for group: {}".format(
                        w.worker_num, group_id))
                except Exception as error:
                    self.log.exception(
                        "Failed to start new Worker: {}".format(error))

            self.log.info("Finished starting worker processes")

    def prune_dead_workers_by_group_id(self, group_id):
        """ Removes dead workers from the pool

        :return list: alive workers

        :raises:
            :py:class:`~backend.exceptions.CoprBackendError` when got dead worker and
                option "exit_on_worker" is enabled
        """
        preserved_workers = []
        for w in self.workers_by_group_id[group_id]:
            if not w.is_alive():
                self.log.warn("Worker {} died unexpectedly".format(
                    w.worker_num))
                w.terminate()  # kill it with a fire
                if self.opts.exit_on_worker:
                    raise CoprBackendError("Worker died unexpectedly, exiting")
            else:
                preserved_workers.append(w)
        return preserved_workers

    def terminate(self):
        """
        Cleanup backend processes (just workers for now)
        And also clean all task queues as they would survive copr restart
        """

        self.is_running = False
        for group in self.opts.build_groups:
            group_id = group["id"]
            for w in self.workers_by_group_id[group_id][:]:
                self.workers_by_group_id[group_id].remove(w)
                w.terminate_instance()
        self.clean_task_queues()

        try:
            self.log.info("Rescheduling unfinished builds before stop")
            self.frontend_client.reschedule_all_running()
        except RequestException as err:
            self.log.exception(err)
            return

    def run(self):
        """
        Starts the backend process. Controls sub-process start/stop.
        """
        self.update_conf()
        self.init_task_queues()
        time.sleep(1)
        self.log.info("Initial config: {}".format(self.opts))

        try:
            self.log.info("Rescheduling old unfinished builds")
            self.frontend_client.reschedule_all_running()
        except RequestException as err:
            self.log.exception(err)
            return

        self.is_running = True
        while self.is_running:
            # re-read config into opts
            self.update_conf()

            for group in self.opts.build_groups:
                group_id = group["id"]

                self.spin_up_workers_by_group(group)
                # FIXME - prune out workers
                # if len(self.workers) > self.opts.num_workers:
                #    killnum = len(self.workers) - self.opts.num_workers
                #    for w in self.workers[:killnum]:
                # insert a poison pill? Kill after something? I dunno.
                # FIXME - if a worker bombs out - we need to check them
                # and startup a new one if it happens
                # check for dead workers and abort
                preserved_workers = self.prune_dead_workers_by_group_id(
                    group_id)
                self.workers_by_group_id[group_id] = preserved_workers

            time.sleep(self.opts.sleeptime)
Example #24
0
class ActionDispatcher(multiprocessing.Process):
    """
    1) Fetch action task from frontend
    2) Run it synchronously
    3) Go to 1)
    """

    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="action-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.action_dispatcher", "action_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)

    def update_process_title(self, msg=None):
        proc_title = "Action dispatcher"
        if msg:
            proc_title += " - " + msg
        setproctitle(proc_title)

    def get_frontend_actions(self):
        """
        Get unfiltered list of actions from frontend, both running and pending.
        """

        try:
            raw_actions = self.frontend_client.get('pending-actions').json()
        except (FrontendClientException, ValueError) as error:
            self.log.exception(
                "Retrieving an action tasks failed with error: %s",
                error)
            return []

        return [ActionQueueTask(action['id']) for action in raw_actions]


    def run(self):
        """
        Executes action dispatching process.
        """
        self.log.info("Action dispatching started.")
        self.update_process_title()

        redis = get_redis_connection(self.opts)
        worker_manager = ActionWorkerManager(
            redis_connection=redis,
            log=self.log,
            max_workers=self.opts.actions_max_workers)
        worker_manager.frontend_client = FrontendClient(self.opts, self.log)

        timeout = self.opts.sleeptime

        while True:
            self.log.info("getting actions from frontend")
            start = time.time()
            for task in self.get_frontend_actions():
                worker_manager.add_task(task)

            # Execute the actions.
            worker_manager.run(timeout=timeout)

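            # Sleep only for whatever remains of the polling interval; if this
            # iteration already took longer, poll the frontend again right away.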
            sleep_more = timeout - (time.time() - start)
            if sleep_more > 0:
                time.sleep(sleep_more)
Example #25
0
class BuildDispatcher(multiprocessing.Process):
    """
    1) Fetch build task from frontend
    2) Get a free VM for it
    3) Create a worker for the job
    4) Start it asynchronously and go to 1)
    """
    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="build-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.build_dispatcher",
                                    "build_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
        self.vm_manager = VmManager(self.opts)

        # Maps e.g. x86_64 && i386 => PC
        self.arch_to_group = dict()
        # PC => max N builders per user
        self.group_to_usermax = dict()

        self.init_internal_structures()

    def get_vm_group_id(self, arch):
        try:
            return self.arch_to_group[arch]
        except KeyError:
            raise DispatchBuildError("Unknown architecture {0}".format(arch))

    def update_process_title(self, msg=None):
        proc_title = "Build dispatcher"
        if msg:
            proc_title += " - " + msg
        setproctitle(proc_title)

    def init_internal_structures(self):
        self.arch_to_group = dict()
        self.group_to_usermax = dict()
        for group in self.opts.build_groups:
            group_id = group["id"]
            for arch in group["archs"]:
                self.arch_to_group[arch] = group_id
                self.log.debug("mapping {0} to {1} group".format(
                    arch, group_id))

            self.log.debug("user might use only {0}VMs for {1} group".format(
                group["max_vm_per_user"], group_id))
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

    def load_job(self):
        """
        Retrieve a single build job from frontend.
        """
        self.log.info("Waiting for a job from frontend...")
        get_task_init_time = time.time()

        task = None
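        # Poll the frontend until it hands out a build task, sleeping between attempts.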
        while not task:
            self.update_process_title(
                "Waiting for a job from frontend for {} s".format(
                    int(time.time() - get_task_init_time)))
            try:
                r = get("{0}/backend/waiting/".format(
                    self.opts.frontend_base_url),
                        auth=("user", self.opts.frontend_auth))
                task = r.json().get("build")
            except (RequestException, ValueError) as error:
                self.log.exception(
                    "Retrieving build job from {} failed with error: {}".
                    format(self.opts.frontend_base_url, error))
            finally:
                if not task:
                    time.sleep(self.opts.sleeptime)

        self.log.info("Got new build job {}".format(task['task_id']))
        return BuildJob(task, self.opts)

    def can_build_start(self, job):
        """
        Announce to the frontend that the build is going to start so that
        it can confirm that and draw out another job for building.

        Returns
        -------
        True if the build can start
        False if the build can not start (build is cancelled)
        """
        try:
            can_build_start = self.frontend_client.starting_build(
                job.build_id, job.chroot)
        except (RequestException, ValueError) as error:
            self.log.exception(
                "Communication with Frontend to confirm build start failed with error: {}"
                .format(error))
            return False

        if not can_build_start:
            self.log.exception("Frontend forbade to start the job {}".format(
                job.task_id))

        return can_build_start

    def clean_finished_workers(self, workers):
        for worker in list(workers):  # iterate over a snapshot, items are removed below
            if not worker.is_alive():
                worker.join(5)
                workers.remove(worker)
                self.log.info("Removed finished worker {} for job {}".format(
                    worker.worker_id, worker.job.task_id))

    def run(self):
        """
        Executes build dispatching process.
        """
        self.log.info("Build dispatching started.")
        self.update_process_title()

        workers = []
        next_worker_id = 1
        while True:
            self.clean_finished_workers(workers)

            job = self.load_job()

            try:
                self.log.info("Acquiring VM for job {}...".format(str(job)))
                vm_group_id = self.get_vm_group_id(job.arch)
                vm = self.vm_manager.acquire_vm(vm_group_id, job.project_owner,
                                                os.getpid(), job.task_id,
                                                job.build_id, job.chroot)
            except NoVmAvailable as error:
                self.log.info(
                    "No available resources for task {} (Reason: {}). Deferring job."
                    .format(job.task_id, error))
                self.frontend_client.defer_build(job.build_id, job.chroot)
                continue
            else:
                self.log.info("VM {} for job {} successfully acquired".format(
                    vm.vm_name, job.task_id))

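            # Ask the frontend to confirm the build start; if it refuses (e.g. the
            # build was cancelled meanwhile), release the VM and fetch another job.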
            if not self.can_build_start(job):
                self.vm_manager.release_vm(vm.vm_name)
                continue

            worker = Worker(opts=self.opts,
                            frontend_client=self.frontend_client,
                            vm_manager=self.vm_manager,
                            worker_id=next_worker_id,
                            vm=vm,
                            job=job)
            workers.append(worker)
            worker.start()
            self.log.info("Started new worker {} for job {}".format(
                worker.worker_id, worker.job.task_id))
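            # Wrap the worker id at 2**15 so it stays a small, bounded number.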
            next_worker_id = (next_worker_id + 1) % 2**15
Example #26
0
class Pruner(object):
    def __init__(self, opts, cmdline_opts=None):
        self.opts = opts
        self.prune_days = getattr(self.opts, "prune_days", DEF_DAYS)
        self.chroots = {}
        self.frontend_client = FrontendClient(self.opts)
        self.mtime_optimization = True
        if cmdline_opts:
            self.mtime_optimization = not cmdline_opts.no_mtime_optimization

    def run(self):
        response = self.frontend_client._post_to_frontend_repeatedly("", "chroots-prunerepo-status")
        self.chroots = json.loads(response.content)

        results_dir = self.opts.destdir
        loginfo("Pruning results dir: {} ".format(results_dir))
        user_dir_names, user_dirs = list_subdir(results_dir)

        loginfo("Going to process total number: {} of user's directories".format(len(user_dir_names)))
        loginfo("Going to process user's directories: {}".format(user_dir_names))

        loginfo("--------------------------------------------")
        for username, subpath in zip(user_dir_names, user_dirs):
            loginfo("For user `{}` exploring path: {}".format(username, subpath))
            for projectdir, project_path in zip(*list_subdir(subpath)):
                loginfo("Exploring projectdir `{}` with path: {}".format(projectdir, project_path))
                self.prune_project(project_path, username, projectdir)
                loginfo("--------------------------------------------")

        loginfo("Setting final_prunerepo_done for deactivated chroots")
        chroots_to_prune = []
        for chroot, active in self.chroots.items():
            if not active:
                chroots_to_prune.append(chroot)
        self.frontend_client._post_to_frontend_repeatedly(chroots_to_prune, "final-prunerepo-done")

        loginfo("--------------------------------------------")
        loginfo("Pruning finished")

    def prune_project(self, project_path, username, projectdir):
        loginfo("Going to prune {}/{}".format(username, projectdir))

        projectname = projectdir.split(':', 1)[0]
        loginfo("projectname = {}".format(projectname))

        try:
            if not get_auto_createrepo_status(self.opts.frontend_base_url, username, projectname):
                loginfo("Skipped {}/{} since auto createrepo option is disabled"
                          .format(username, projectdir))
                return
            if get_persistent_status(self.opts.frontend_base_url, username, projectname):
                loginfo("Skipped {}/{} since the project is persistent"
                          .format(username, projectdir))
                return
            if not get_auto_prune_status(self.opts.frontend_base_url, username, projectname):
                loginfo("Skipped {}/{} since auto-prunning is disabled for the project"
                          .format(username, projectdir))
                return
        except (CoprException, CoprRequestException) as exception:
            logerror("Failed to get project details for {}/{} with error: {}".format(
                username, projectdir, exception))
            return

        for sub_dir_name in os.listdir(project_path):
            chroot_path = os.path.join(project_path, sub_dir_name)

            if sub_dir_name == 'modules':
                continue

            if not os.path.isdir(chroot_path):
                continue

            if sub_dir_name not in self.chroots:
                loginfo("Final pruning already done for chroot {}/{}:{}".format(username, projectdir, sub_dir_name))
                continue

            if self.mtime_optimization:
                # We only ever remove builds that finished at least
                # 'self.prune_days' ago.  And because we run prunerepo _daily_,
                # we know that any candidates for removal are gone about a day
                # after "build_time + self.prune_days".
                touched_before = time.time()-os.stat(chroot_path).st_mtime
                touched_before = touched_before/3600/24 # seconds -> days

                # Because prunerepo may fail to process the directory
                # successfully for some time (a bug, user error, I/O
                # problems, ...), we wait 10 more days before we really
                # start to ignore the directory.
                if touched_before > int(self.prune_days) + 10:
                    loginfo("Skipping {} - not changed for {} days".format(
                        sub_dir_name, touched_before))
                    continue

            try:
                cmd = ['prunerepo', '--verbose', '--days', str(self.prune_days), '--nocreaterepo', chroot_path]
                stdout = runcmd(cmd)
                loginfo(stdout)
                createrepo(path=chroot_path, front_url=self.opts.frontend_base_url,
                           username=username, projectname=projectname,
                           override_acr_flag=True)
                clean_copr(chroot_path, self.prune_days, verbose=True)
            except Exception as err:
                logexception(err)
                logerror("Error pruning chroot {}/{}:{}".format(username, projectdir, sub_dir_name))

            loginfo("Pruning done for chroot {}/{}:{}".format(username, projectdir, sub_dir_name))

        loginfo("Pruning finished for projectdir {}/{}".format(username, projectdir))
Example #27
0
class TestFrontendClient(object):

    def setup_method(self, method):
        self.opts = Munch(
            frontend_base_url="http://example.com/",
            frontend_auth="12345678",
        )
        self.fc = FrontendClient(self.opts)

        self.data = {
            "foo": "bar",
            "bar": [1, 3, 5],
        }
        self.url_path = "sub_path"

        self.build_id = 12345
        self.task_id = "12345-fedora-20-x86_64"
        self.chroot_name = "fedora-20-x86_64"

    @pytest.fixture
    def mask_post_to_fe(self):
        self.ptf = MagicMock()
        self.fc._post_to_frontend = self.ptf

    def test_post_to_frontend(self, post_req):
        post_req.return_value.status_code = 200
        self.fc._post_to_frontend(self.data, self.url_path)

        assert post_req.called

    def test_post_to_frontend_not_200(self, post_req):
        post_req.return_value.status_code = 501
        with pytest.raises(RequestException):
            self.fc._post_to_frontend(self.data, self.url_path)

        assert post_req.called

    def test_post_to_frontend_post_error(self, post_req):
        post_req.side_effect = RequestException()
        with pytest.raises(RequestException):
            self.fc._post_to_frontend(self.data, self.url_path)

        assert post_req.called

    def test_post_to_frontend_repeated_first_try_ok(self, mask_post_to_fe, mc_time):
        response = "ok\n"
        self.ptf.return_value = response

        assert self.fc._post_to_frontend_repeatedly(self.data, self.url_path) == response
        assert not mc_time.sleep.called

    def test_post_to_frontend_repeated_second_try_ok(self, mask_post_to_fe, mc_time):
        response = "ok\n"
        self.ptf.side_effect = [
            RequestException(),
            response,
        ]

        assert self.fc._post_to_frontend_repeatedly(self.data, self.url_path) == response
        assert mc_time.sleep.called

    def test_post_to_frontend_repeated_all_attempts_failed(self, mask_post_to_fe, mc_time):
        self.ptf.side_effect = RequestException()

        with pytest.raises(RequestException):
            self.fc._post_to_frontend_repeatedly(self.data, self.url_path)

        assert mc_time.sleep.called

    def test_update(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        self.fc.update(self.data)
        assert ptfr.call_args == mock.call(self.data, "update")

    def test_starting_build(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        for val in [True, False]:
            ptfr.return_value.json.return_value = {"can_start": val}

            assert self.fc.starting_build(self.data) == val

    def test_starting_build_err(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr

        with pytest.raises(RequestException):
            self.fc.starting_build(self.data)

    def test_starting_build_err_2(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        ptfr.return_value.json.return_value = {}

        with pytest.raises(RequestException):
            self.fc.starting_build(self.data)

    def test_reschedule_build(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        self.fc.reschedule_build(self.build_id, self.task_id, self.chroot_name)
        expected = mock.call({'build_id': self.build_id, 'task_id': self.task_id, 'chroot': self.chroot_name},
                             'reschedule_build_chroot')
        assert ptfr.call_args == expected
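The tests above rely on post_req, mc_time and similar fixtures that are not part of this snippet. A hedged sketch of what such fixtures typically look like; the patch targets ("backend.frontend.post" and "backend.frontend.time") are assumptions about the module layout, not taken from the original test file:

import pytest
from unittest import mock


@pytest.fixture
def post_req():
    # Assumed patch target: the name the frontend client module uses to POST.
    with mock.patch("backend.frontend.post") as patched:
        yield patched


@pytest.fixture
def mc_time():
    # Assumed patch target: the time module as imported by the client code.
    with mock.patch("backend.frontend.time") as patched:
        yield patched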
Example #28
0
class BuildDispatcher(multiprocessing.Process):
    """
    1) Fetch build tasks from frontend
    2) Loop through them and try to allocate VM for each
       - If VM can be allocated, spawn a worker and run it asynchronously
       - otherwise, check the next build task
    3) Go to 1
    """
    def __init__(self, opts):
        multiprocessing.Process.__init__(self, name="build-dispatcher")

        self.opts = opts
        self.log = get_redis_logger(self.opts, "backend.build_dispatcher",
                                    "build_dispatcher")
        self.frontend_client = FrontendClient(self.opts, self.log)
        self.vm_manager = VmManager(self.opts)
        self.workers = []
        self.next_worker_id = 1

        self.arch_to_groups = defaultdict(list)
        # PC => max N builders per user
        self.group_to_usermax = dict()
        self.job_ids_previous_request = set()
        self.init_internal_structures()

    def get_vm_group_ids(self, arch):
        if not arch:
            return [group["id"] for group in self.opts.build_groups]
        groups = self.arch_to_groups.get(arch)
        if not groups:
            raise DispatchBuildError("Unknown architecture {0}".format(arch))
        return groups

    def update_process_title(self, msg=None):
        proc_title = "Build dispatcher"
        if msg:
            proc_title += " - " + msg
        setproctitle(proc_title)

    def init_internal_structures(self):
        for group in self.opts.build_groups:
            group_id = group["id"]

            for arch in group["archs"]:
                self.arch_to_groups[arch].append(group_id)
                self.log.debug("mapping %s to %s group", arch, group_id)

            self.log.debug("user might use only %sVMs for %s group",
                           group["max_vm_per_user"], group_id)
            self.group_to_usermax[group_id] = group["max_vm_per_user"]

    def load_jobs(self):
        """
        Retrieve pending build jobs from the frontend.
        """
        self.log.info("Waiting for a job from frontend...")
        get_task_init_time = time.time()
        tasks = None

        while not tasks:
            self.update_process_title(
                "Waiting for jobs from frontend for {} s".format(
                    int(time.time() - get_task_init_time)))
            try:
                tasks = self.frontend_client.get('pending-jobs').json()
            except (FrontendClientException, ValueError) as error:
                self.log.exception(
                    "Retrieving build jobs from %s failed with error: %s",
                    self.opts.frontend_base_url, error)
            finally:
                if not tasks:
                    time.sleep(self.opts.sleeptime)

        job_ids = {task.get("task_id") for task in tasks if task}
        new_ids = job_ids - self.job_ids_previous_request
        if new_ids:
            self.log.info("Got new build jobs: %s", new_ids)
        self.job_ids_previous_request = job_ids

        return [BuildJob(task, self.opts) for task in tasks if task]

    def can_build_start(self, job):
        """
        Announce to the frontend that the build is starting.  The frontend
        may reject the build.

        Returns
        -------
        True if the build can start
        False if the build can not start (build is cancelled)
        """
        try:
            job.started_on = time.time()
            job.status = BuildStatus.STARTING
            can_build_start = self.frontend_client.starting_build(
                job.to_dict())
        except (FrontendClientException, ValueError) as error:
            self.log.exception(
                "Communication with Frontend to confirm build start failed with error: %s",
                error)
            return False

        if not can_build_start:
            self.log.error("Frontend refused to start the job %s",
                           job.task_id)

        return can_build_start

    def clean_finished_workers(self):
        for worker in self.workers:
            if not worker.is_alive():
                worker.join(5)
                self.workers.remove(worker)
                self.log.info("Removed finished worker %s for job %s",
                              worker.worker_id, worker.job.task_id)

    def start_worker(self, vm, job, reattach=False):
        worker = Worker(opts=self.opts,
                        vm_manager=self.vm_manager,
                        worker_id=self.next_worker_id,
                        vm=vm,
                        job=job,
                        reattach=reattach)
        self.workers.append(worker)
        self.next_worker_id = (self.next_worker_id + 1) % 2**15

        worker.start()
        return worker

    def run(self):
        """
        Executes build dispatching process.
        """
        self.log.info("Build dispatching started.")
        self.update_process_title()

        first_backend_loop = True

        while True:
            self.clean_finished_workers()

            skip_jobs_cache = {}
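            # Keys are "owner-arch-sandbox" strings; once acquire_vm fails for
            # such a combination in this pass, later jobs with the same key are
            # skipped until the next round of frontend polling.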

            for job in self.load_jobs():
                # first check whether a worker is already
                # running for this job
                if any(job.task_id == w.job.task_id for w in self.workers):
                    self.log.debug("Skipping already running task '%s'",
                                   job.task_id)
                    continue

                if first_backend_loop:
                    # Server was restarted.  Some builds might still be running
                    # in the background on builders;  so search the builder
                    # records in the DB for the job and, if we find one, spawn
                    # a worker to reattach.
                    vm = self.vm_manager.get_vm_by_task_id(job.task_id)
                    if vm and vm.state == 'in_use':
                        self.log.info("Reattaching to VM: " + str(vm))
                        worker = self.start_worker(vm, job, reattach=True)
                        vm.store_field(self.vm_manager.rc, "used_by_worker",
                                       worker.worker_id)
                        self.log.info("Reattached new worker %s for job %s",
                                      worker.worker_id, worker.job.task_id)
                        continue

                cache_entry = '{owner}-{arch}-{sandbox}'.format(
                    owner=job.project_owner,
                    arch=job.arch or "noarch",
                    sandbox=job.sandbox,
                )

                if cache_entry in skip_jobs_cache:
                    self.log.debug("Skipped job %s, cached", job)
                    continue

                # ... and if the task is new to us,
                # allocate new vm and run full build
                try:
                    vm_group_ids = self.get_vm_group_ids(job.arch)
                    self.log.debug("Picking VM from groups %s for job %s",
                                   vm_group_ids, job)
                    vm = self.vm_manager.acquire_vm(vm_group_ids,
                                                    job.project_owner,
                                                    job.sandbox,
                                                    self.next_worker_id,
                                                    job.task_id, job.build_id,
                                                    job.chroot)
                except NoVmAvailable as error:
                    skip_jobs_cache[cache_entry] = True
                    self.log.debug(
                        "No available resources for task %s (Reason: %s). Deferring job.",
                        job.task_id, error)
                    continue
                else:
                    self.log.info("VM %s for job %s successfully acquired",
                                  vm.vm_name, job.task_id)

                if not self.can_build_start(job):
                    self.vm_manager.release_vm(vm.vm_name)
                    continue

                worker = self.start_worker(vm, job)
                self.log.info("Started new worker %s for job %s",
                              worker.worker_id, worker.job.task_id)

            first_backend_loop = False
            time.sleep(self.opts.sleeptime)
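init_internal_structures() and get_vm_group_ids() above map build architectures to VM group ids taken from opts.build_groups. A minimal sketch of that lookup in isolation; the build_groups content below is made up, only its shape follows the dispatcher code, and ValueError stands in for DispatchBuildError:

from collections import defaultdict

build_groups = [
    {"id": 0, "archs": ["x86_64", "i386"], "max_vm_per_user": 4},
    {"id": 1, "archs": ["ppc64le"], "max_vm_per_user": 2},
]

arch_to_groups = defaultdict(list)
for group in build_groups:
    for arch in group["archs"]:
        arch_to_groups[arch].append(group["id"])


def get_vm_group_ids(arch):
    """No arch means 'any group'; an unknown arch is reported explicitly."""
    if not arch:
        return [group["id"] for group in build_groups]
    groups = arch_to_groups.get(arch)
    if not groups:
        raise ValueError("Unknown architecture {0}".format(arch))
    return groups


print(get_vm_group_ids("x86_64"))   # [0]
print(get_vm_group_ids(None))       # [0, 1]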
Example #29
0
class TestFrontendClient(object):
    def setup_method(self, method):
        self.opts = Munch(
            frontend_base_url="http://example.com/",
            frontend_auth="12345678",
        )
        self.fc = FrontendClient(self.opts)

        self.data = {
            "foo": "bar",
            "bar": [1, 3, 5],
        }
        self.url_path = "sub_path"

        self.build_id = 12345
        self.task_id = "12345-fedora-20-x86_64"
        self.chroot_name = "fedora-20-x86_64"

    @pytest.fixture
    def mask_frontend_request(self):
        self.f_r = MagicMock()
        self.fc._frontend_request = self.f_r

    def test_post_to_frontend(self, f_request_method):
        name, method = f_request_method
        method.return_value.status_code = 200
        self.fc._frontend_request(self.url_path, self.data, method=name)
        assert method.called

    def test_post_to_frontend_wrappers(self, f_request_method):
        name, method = f_request_method
        method.return_value.status_code = 200

        call = getattr(self.fc, name)
        if name == 'get':
            call(self.url_path)
        else:
            call(self.url_path, self.data)

        assert method.called

    def test_post_to_frontend_not_200(self, post_req):
        post_req.return_value.status_code = 501
        with pytest.raises(FrontendClientRetryError):
            self.fc._frontend_request(self.url_path, self.data)

        assert post_req.called

    def test_post_to_frontend_post_error(self, post_req):
        post_req.side_effect = RequestException()
        with pytest.raises(FrontendClientRetryError):
            self.fc._frontend_request(self.url_path, self.data)

        assert post_req.called

    def test_post_to_frontend_repeated_first_try_ok(self,
                                                    mask_frontend_request,
                                                    mc_time):
        response = "ok\n"
        self.f_r.return_value = response
        mc_time.time.return_value = 0

        assert self.fc._post_to_frontend_repeatedly(self.data,
                                                    self.url_path) == response
        assert not mc_time.sleep.called

    def test_post_to_frontend_repeated_second_try_ok(self, f_request_method,
                                                     mask_frontend_request,
                                                     mc_time):
        method_name, method = f_request_method

        response = "ok\n"
        self.f_r.side_effect = [
            FrontendClientRetryError(),
            response,
        ]
        mc_time.time.return_value = 0
        assert self.fc._frontend_request_repeatedly(
            self.url_path, data=self.data, method=method_name) == response
        assert mc_time.sleep.called

    def test_post_to_frontend_err_400(self, post_req, mc_time):
        response = Response()
        response.status_code = 404
        response.reason = 'NOT FOUND'

        post_req.side_effect = [
            FrontendClientRetryError(),
            response,
        ]

        mc_time.time.return_value = 0
        with pytest.raises(FrontendClientException):
            assert self.fc._post_to_frontend_repeatedly(
                self.data, self.url_path) == response
        assert mc_time.sleep.called

    @mock.patch('backend.frontend.BACKEND_TIMEOUT', 100)
    def test_post_to_frontend_repeated_all_attempts_failed(
            self, mask_frontend_request, caplog, mc_time):
        mc_time.time.side_effect = [
            0, 0, 5, 5 + 10, 5 + 10 + 15, 5 + 10 + 15 + 20, 1000
        ]
        self.f_r.side_effect = FrontendClientRetryError()
        with pytest.raises(FrontendClientException):
            self.fc._post_to_frontend_repeatedly(self.data, self.url_path)
        assert mc_time.sleep.call_args_list == [
            mock.call(x) for x in [5, 10, 15, 20, 25]
        ]
        assert len(caplog.records) == 5

    def test_post_to_frontend_repeated_indefinitely(self,
                                                    mask_frontend_request,
                                                    caplog, mc_time):
        mc_time.time.return_value = 1
        self.fc.try_indefinitely = True
        self.f_r.side_effect = [FrontendClientRetryError() for _ in range(100)] \
                             + [FrontendClientException()] # e.g. 501 eventually
        with pytest.raises(FrontendClientException):
            self.fc._post_to_frontend_repeatedly(self.data, self.url_path)
        assert mc_time.sleep.called
        assert len(caplog.records) == 100

    def test_reschedule_300(self, mask_frontend_request, post_req):
        response = Response()
        response.status_code = 302
        response.reason = 'whatever'
        post_req.side_effect = response
        with pytest.raises(FrontendClientException) as ex:
            self.fc.reschedule_all_running()
        assert 'Failed to reschedule builds' in str(ex)

    def test_update(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        self.fc.update(self.data)
        assert ptfr.call_args == mock.call(self.data, "update")

    def test_starting_build(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        for val in [True, False]:
            ptfr.return_value.json.return_value = {"can_start": val}

            assert self.fc.starting_build(self.data) == val

    def test_starting_build_err(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr

        with pytest.raises(FrontendClientException):
            self.fc.starting_build(self.data)

    def test_starting_build_err_2(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        ptfr.return_value.json.return_value = {}

        with pytest.raises(FrontendClientException):
            self.fc.starting_build(self.data)

    def test_reschedule_build(self):
        ptfr = MagicMock()
        self.fc._post_to_frontend_repeatedly = ptfr
        self.fc.reschedule_build(self.build_id, self.task_id, self.chroot_name)
        expected = mock.call(
            {
                'build_id': self.build_id,
                'task_id': self.task_id,
                'chroot': self.chroot_name
            }, 'reschedule_build_chroot')
        assert ptfr.call_args == expected
Example #30
0
class CoprBackend(object):

    """
    Core process - starts/stops/initializes workers

    :param config_file: path to the backend configuration file
    :param ext_opts: additional options for backend
    """

    def __init__(self, config_file=None, ext_opts=None):
        # read in config file
        # put all the config items into a single self.opts munch

        if not config_file:
            raise CoprBackendError("Must specify config_file")

        self.config_file = config_file
        self.ext_opts = ext_opts  # to stow our cli options for read_conf()
        self.workers_by_group_id = defaultdict(list)
        self.max_worker_num_by_group_id = defaultdict(int)

        self.config_reader = BackendConfigReader(self.config_file, self.ext_opts)
        self.opts = None
        self.update_conf()

        self.task_queues = {}

        self.frontend_client = FrontendClient(self.opts)
        self.is_running = False

        self.log = get_redis_logger(self.opts, "backend.main", "backend")

    def clean_task_queues(self):
        """
        Make sure there is nothing in our task queues
        """
        try:
            for queue in self.task_queues.values():
                while queue.length:
                    queue.dequeue()
        except ConnectionError:
            raise CoprBackendError(
                "Could not connect to a task queue. Is Redis running?")

    def init_task_queues(self):
        """
        Connect to the retask.Queue for each group_id. Remove old tasks from queues.
        """
        try:
            for group in self.opts.build_groups:
                group_id = group["id"]
                queue = Queue("copr-be-{0}".format(group_id))
                queue.connect()
                self.task_queues[group_id] = queue
        except ConnectionError:
            raise CoprBackendError(
                "Could not connect to a task queue. Is Redis running?")

        self.clean_task_queues()

    def update_conf(self):
        """
        Update backend config from config file
        """
        self.opts = self.config_reader.read()

    def spin_up_workers_by_group(self, group):
        """
        Handles starting/growing the number of workers

        :param dict group: Builders group

        Utilized keys:
            - **id**
            - **max_workers**

        """
        group_id = group["id"]

        if len(self.workers_by_group_id[group_id]) < group["max_workers"]:
            self.log.info("Spinning up more workers")
            for _ in range(group["max_workers"] - len(self.workers_by_group_id[group_id])):
                self.max_worker_num_by_group_id[group_id] += 1
                try:
                    w = Worker(
                        opts=self.opts,
                        frontend_client=self.frontend_client,
                        worker_num=self.max_worker_num_by_group_id[group_id],
                        group_id=group_id
                    )

                    self.workers_by_group_id[group_id].append(w)
                    w.start()
                    time.sleep(0.3)
                    self.log.info("Started worker: {} for group: {}".format(w.worker_num, group_id))
                except Exception as error:
                    self.log.exception("Failed to start new Worker: {}".format(error))

            self.log.info("Finished starting worker processes")

    def prune_dead_workers_by_group_id(self, group_id):
        """ Removes dead workers from the pool

        :return list: alive workers

        :raises:
            :py:class:`~backend.exceptions.CoprBackendError` when a dead worker is found
                and the "exit_on_worker" option is enabled
        """
        preserved_workers = []
        for w in self.workers_by_group_id[group_id]:
            if not w.is_alive():
                self.log.warning("Worker {} died unexpectedly".format(w.worker_num))
                w.terminate()  # kill it with fire
                if self.opts.exit_on_worker:
                    raise CoprBackendError(
                        "Worker died unexpectedly, exiting")
            else:
                preserved_workers.append(w)
        return preserved_workers

    def terminate(self):
        """
        Clean up backend processes (just workers for now),
        and also clean all task queues as they would survive a copr restart.
        """

        self.is_running = False
        for group in self.opts.build_groups:
            group_id = group["id"]
            for w in self.workers_by_group_id[group_id][:]:
                self.workers_by_group_id[group_id].remove(w)
                w.terminate_instance()
        self.clean_task_queues()

        try:
            self.log.info("Rescheduling unfinished builds before stop")
            self.frontend_client.reschedule_all_running()
        except RequestException as err:
            self.log.exception(err)
            return

    def run(self):
        """
        Starts the backend process and controls sub-process start/stop.
        """
        self.update_conf()
        self.init_task_queues()
        time.sleep(1)
        self.log.info("Initial config: {}".format(self.opts))

        try:
            self.log.info("Rescheduling old unfinished builds")
            self.frontend_client.reschedule_all_running()
        except RequestException as err:
            self.log.exception(err)
            return

        self.is_running = True
        while self.is_running:
            # re-read config into opts
            self.update_conf()

            for group in self.opts.build_groups:
                group_id = group["id"]

                self.spin_up_workers_by_group(group)
                # FIXME - prune out workers
                # if len(self.workers) > self.opts.num_workers:
                #    killnum = len(self.workers) - self.opts.num_workers
                #    for w in self.workers[:killnum]:
                # insert a poison pill? Kill after something? I dunno.
                # FIXME - if a worker bombs out - we need to check them
                # and startup a new one if it happens
                # check for dead workers and abort
                preserved_workers = self.prune_dead_workers_by_group_id(group_id)
                self.workers_by_group_id[group_id] = preserved_workers

            time.sleep(self.opts.sleeptime)
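A hedged usage sketch for the class above: the config path is only an example, and a real deployment would install proper signal handlers instead of relying on KeyboardInterrupt.

if __name__ == "__main__":
    backend = CoprBackend(config_file="/etc/copr/copr-be.conf")
    try:
        backend.run()
    except KeyboardInterrupt:
        # run() loops until is_running is cleared; terminate() also
        # reschedules unfinished builds and drains the task queues.
        backend.terminate()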