class Worker(object):
    """ Represents the worker process.

    Waits for tasks to come in from the webapp and then acts on them: each
    task's repository is cloned into a fresh directory under ``scratch_dir``
    and every ``.py`` file in the checkout is run through PythonTidy.
    """

    def __init__(self):
        self.queue = Queue('commits')
        self.queue.connect()
        # TODO -- set both of these with the config file.
        # Use pyramid tools to load config.
        self.sleep_interval = 1
        self.scratch_dir = "/home/threebean/scratch/pep8bot-scratch"
        try:
            os.makedirs(self.scratch_dir)
        except OSError:
            # Only tolerate "already exists"; anything else (permissions,
            # read-only filesystem, ...) would make every later clone fail.
            if not os.path.isdir(self.scratch_dir):
                raise

    @staticmethod
    def _python_files(top):
        """ Yield the path of every ``.py`` file below ``top``.

        ``.git`` directories are pruned by directory name, so a checkout whose
        own path merely contains the text ".git" is still processed.
        """
        for root, dirs, files in os.walk(top):
            # In-place edit of ``dirs`` stops os.walk from descending.
            dirs[:] = [d for d in dirs if d != '.git']
            for filename in files:
                if filename.endswith(".py"):
                    yield os.path.join(root, filename)

    def run(self):
        """ Poll the queue forever, tidying one repository per task. """
        script = os.path.expanduser("~/devel/PythonTidy/PythonTidy.py")
        while True:
            time.sleep(self.sleep_interval)
            print("Waking")
            if self.queue.length == 0:
                continue

            task = self.queue.dequeue()
            if not task:
                # Nothing was actually handed to us; poll again.
                continue

            data = task.data
            url = data['repository']['url']
            # TODO -- don't clone this url.  But fork and clone our url.
            name = data['repository']['name']
            owner = data['repository']['owner']['name']
            self.working_dir = tempfile.mkdtemp(
                prefix=owner + '-' + name,
                dir=self.scratch_dir,
            )

            print("** Cloning to %s" % self.working_dir)
            print(sh.git.clone(url, self.working_dir))

            print("** Processing files.")
            for infile in self._python_files(self.working_dir):
                print("** Tidying %s" % infile)
                tmpfile = infile + ".bak"
                sh.python(script, infile, tmpfile)
                # Replace the original with the tidied copy.
                shutil.move(tmpfile, infile)

            with directory(self.working_dir):
                print(sh.pwd())
                print(sh.git.status())
def monitor_buildqueue():
    """
    Monitor the build queue.

    Each dequeued task is looked up in koji by its ``jobid``:

    - state 0 (still building): the task is put back on the build queue;
    - state 1 (complete): the task is moved to the job queue;
    - no build returned, or any other state: the task is dropped and the
      reason is logged.

    The loop ends when ``check_shutdown()`` reports true. Returns ``None``.
    """
    key = get_key('darkbuildqueue')
    config = get_redis_config()
    jobqueue = Queue('jobqueue', config)
    jobqueue.connect()
    buildqueue = Queue('buildqueue', config)
    buildqueue.connect()

    rdb = redis_connection()
    if not rdb:
        log(key, 'redis is missing', 'error')
        return None
    rdb.set('darkbuildqueue-status', '1')

    while True:
        if check_shutdown():
            break
        try:
            time.sleep(60)
            if buildqueue.length == 0:
                log(key, "Sleeping, no buildqueue job", 'info')
                time.sleep(60)
                continue

            task = buildqueue.dequeue()
            if not task:
                # Nothing was actually dequeued; poll again.
                continue

            kojiurl = task.data['kojiurl']
            idx = task.data['jobid']
            kc = koji.ClientSession(kojiurl, {'debug': False, 'password': None,
                                              'debug_xmlrpc': False, 'user': None})

            res = kc.getBuild(idx)
            if not res:
                # koji has no record of this build: drop the task.
                log(key, "build deleted %s" % idx, 'error')
                continue

            state = res['state']
            if state == 1:
                # completed build now push to our redis queue
                jobqueue.enqueue(task)
                log(key, "in job queue %s" % idx, 'info')
            elif state == 0:
                # building state
                buildqueue.enqueue(task)
                log(key, "in build queue %s" % idx, 'info')
            else:
                # Neither building nor complete: the task is not requeued,
                # so leave a trace instead of dropping it silently.
                log(key, "dropping build %s in state %s" % (idx, state), 'info')
        except Exception as error:
            log(key, str(error), 'error')
# Entry point of the job worker: polls the 'jobqueue' queue and runs a
# build-id import for every task found there.
# NOTE(review): the original indentation of this script was lost; the nesting
# of the last two statements below is an assumption -- confirm.
if __name__ == '__main__':
    libimporter.loadconfig()
    create_rundir()
    key = 'darkjobworker'
    config = get_redis_config()
    jobqueue = Queue('jobqueue', config)
    jobqueue.connect()
    log_status('darkjobworker', 'Starting worker module')

    while True:
        # Idle for a minute whenever the queue is empty.
        if jobqueue.length == 0:
            log(key, "Sleeping, no jobqueue job", 'info')
            time.sleep(60)
            continue
        try:
            task = jobqueue.dequeue()
            if not task:
                continue
            instance = task.data['instance']
            idx = task.data['build_id']
            distro = task.data['release']
            # The instance name is also published through utils.msgtext.
            utils.msgtext = task.data['instance']
            log(key, "Import started %s" % idx, 'info')
            do_buildid_import(instance, idx, distro, key)
            log(key, "Import finished %s" % idx, 'info')
        except Exception, err:
            # Any failure is logged and the loop moves on to the next task.
            log(key, str(err), 'error')
        # presumably printed once per processed task -- placement assumed
        print "one more done or crashed"
    # presumably cleanup after the loop; no break is visible above, so this
    # looks unreachable as written -- TODO confirm
    remove_redis_keys('darkjobworker')
class Channel(object):
    """
    Abstraction above retask (the set of "channels" between backend(s),
    jobgrabber and workers).  We could use multiple backends and/or different
    "atomic" medium (other implementation than Queue) in future.  But
    make sure nobody needs to touch the "medium" directly.

    :param opts: backend options, stored as ``self.opts``
    :param log: optional logger; when ``None`` errors are not logged
    """

    def __init__(self, opts, log=None):
        self.log = log
        self.opts = opts

        # channel for Backend <--> JobGrabber communication
        self.jg_start = Queue("jg_control_start")

        # channel for JobGrabber <--> [[Builders]] communication;
        # maps build group -> connected Queue
        self.build_queues = dict()

        while not self.jg_start.connect():
            wait_log(self.log, "waiting for redis", 5)

    def _get_queue(self, bgroup):
        """
        Return the connected queue of build group ``bgroup``.

        The queue is created on first use and remembered in
        ``self.build_queues`` so that later calls reuse it and
        :meth:`remove_all_builds` can find it.

        :raises Exception: when the new queue can not connect
        """
        if bgroup not in self.build_queues:
            q_id = "copr-be-{0}".format(bgroup)
            q = Queue(q_id)
            if not q.connect():
                # As we already connected to jg_control_message, this should
                # be also OK.
                raise Exception("can't connect to redis, should never happen!")
            self.build_queues[bgroup] = q

        return self.build_queues[bgroup]

    def add_build(self, bgroup, build):
        """
        Enqueue ``build`` for build group ``bgroup``.

        This should be used by job_grab only for now.

        :return bool: True when the build was enqueued, False otherwise
        """
        q = self._get_queue(bgroup)
        try:
            q.enqueue(Task(build))
        except Exception as err:
            # I've seen issues Task() was not able to jsonify urllib exceptions
            if self.log:
                self.log.error("can't enqueue build {0}, reason:\n{1}".format(
                    build, err
                ))
            return False

        return True

    # Builder's API
    def get_build(self, bgroup):
        """
        Return the data of the next task of ``bgroup``, or None when the
        dequeue yields nothing.
        """
        q = self._get_queue(bgroup)
        t = q.dequeue()
        return t.data if t else None

    # JobGrab's API
    def backend_started(self):
        """ Return the length of the backend-start control queue. """
        return self.jg_start.length

    def job_graber_initialized(self):
        """ Drain the backend-start control queue. """
        while self.jg_start.dequeue():
            pass

    def remove_all_builds(self):
        """ Drain every known build queue and forget them all. """
        for q in self.build_queues.values():
            while q.dequeue():
                pass

        self.build_queues = dict()

    # Backend's API
    def backend_start(self):
        """
        Notify jobgrab about service start and wait until the start queue
        is empty again.

        :raises Exception: when the start task can not be enqueued
        """
        if not self.jg_start.enqueue(Task("start")):
            raise Exception("can't append to retask queue, should never happen!")

        while self.jg_start.length:
            wait_log(self.log, "waiting until jobgrabber initializes queue")
def runTest(self):
    """ Dequeue one task from 'testqueue' and check its ``name`` field. """
    queue = Queue('testqueue')
    queue.connect()
    task = queue.dequeue()
    # Fail with a clear message instead of an AttributeError on ``.data``.
    self.assertTrue(task, "no task could be dequeued from 'testqueue'")
    self.assertEqual(task.data['name'], u'kushal')
class Worker(multiprocessing.Process):
    """
    Worker process dispatches building tasks. Backend spin-up multiple workers, each
    worker associated to one group_id and process one task at the each moment.

    Worker listens for the new tasks from :py:class:`retask.Queue` associated with its group_id

    :param Bunch opts: backend config
    :param events: (:py:class:`multiprocessing.Queue`) queue to announce new events
    :param int worker_num: worker number
    :param int group_id: group_id from the set of groups defined in config
    :param callback: callback object to handle internal workers events. Should implement method ``log(msg)``.
    :param lock: (:py:class:`multiprocessing.Lock`) global backend lock
    """

    def __init__(self, opts, events, worker_num, group_id, callback=None, lock=None):
        # base class initialization
        multiprocessing.Process.__init__(self, name="worker-builder")

        self.opts = opts

        # job management stuff
        self.task_queue = Queue("copr-be-{0}".format(str(group_id)))
        self.task_queue.connect()
        # event queue for communicating back to dispatcher
        self.events = events
        self.worker_num = worker_num
        self.group_id = group_id

        self.kill_received = False
        self.lock = lock

        self.frontend_callback = FrontendClient(opts, events)
        self.callback = callback
        if not self.callback:
            # No callback supplied: log to a per-worker file instead.
            log_name = "worker-{0}-{1}.log".format(
                self.group_name, self.worker_num)

            self.logfile = os.path.join(self.opts.worker_logdir, log_name)
            self.callback = WorkerCallback(logfile=self.logfile)

        self.vm_name = None
        self.vm_ip = None

        self.callback.log("creating worker: dynamic ip")

    @property
    def group_name(self):
        """
        Name of this worker's build group as given in the config; falls back
        to ``group_id`` when the name can not be read.
        """
        try:
            return self.opts.build_groups[self.group_id]["name"]
        except Exception as error:
            # This property is read from __init__ while the callback may
            # still be unset, so only log when there is something to log to.
            callback = getattr(self, "callback", None)
            if callback:
                callback.log(
                    "Failed to get builder group name from config, using group_id as name. "
                    "Original error: {}".format(error))
            return self.group_id

    def event(self, topic, template, content=None):
        """ Multi-purpose logging method.

        Logs messages to three different destinations:
            - To log file
            - The internal "events" queue for communicating back to the
              dispatcher.
            - The fedmsg bus.  Messages are posted asynchronously to a
              zmq.PUB socket.

        :param topic: fedmsg topic
        :param template: format string filled from ``content``
        :param content: dict of values; ``who`` and ``what`` keys are added
            to it before it is published to fedmsg
        """
        content = content or {}
        what = template.format(**content)
        who = "worker-{0}".format(self.worker_num)

        self.callback.log("event: who: {0}, what: {1}".format(who, what))
        self.events.put({"when": time.time(), "who": who, "what": what})

        if self.opts.fedmsg_enabled and fedmsg:
            content["who"] = who
            content["what"] = what
            try:
                fedmsg.publish(modname="copr", topic=topic, msg=content)
            # pylint: disable=W0703
            except Exception as e:
                # XXX - Maybe log traceback as well with traceback.format_exc()
                self.callback.log("failed to publish message: {0}".format(e))

    def _announce_start(self, job):
        """
        Announce everywhere that a build process started now.
        """
        job.started_on = time.time()
        self.mark_started(job)

        template = "build start: user:{user} copr:{copr} " \
            "pkg: {pkg} build:{build} ip:{ip} pid:{pid}"

        content = dict(user=job.submitter, copr=job.project_name,
                       owner=job.project_owner, pkg=job.pkg_name,
                       build=job.build_id, ip=self.vm_ip, pid=self.pid)
        self.event("build.start", template, content)

        template = "chroot start: chroot:{chroot} user:{user} " \
            "copr:{copr} pkg: {pkg} build:{build} ip:{ip} pid:{pid}"

        content = dict(chroot=job.chroot, user=job.submitter,
                       owner=job.project_owner, pkg=job.pkg_name,
                       copr=job.project_name, build=job.build_id,
                       ip=self.vm_ip, pid=self.pid)

        self.event("chroot.start", template, content)

    def _announce_end(self, job):
        """
        Announce everywhere that a build process ended now.
        """
        job.ended_on = time.time()

        self.return_results(job)
        self.callback.log("worker finished build: {0}".format(self.vm_ip))
        template = "build end: user:{user} copr:{copr} build:{build}" \
            " pkg: {pkg} version: {version} ip:{ip} pid:{pid} status:{status}"

        content = dict(user=job.submitter, copr=job.project_name,
                       owner=job.project_owner,
                       pkg=job.pkg_name, version=job.pkg_version,
                       build=job.build_id, ip=self.vm_ip, pid=self.pid,
                       status=job.status, chroot=job.chroot)
        self.event("build.end", template, content)

    def run_ansible_playbook(self, args, name="running playbook", attempts=9):
        """
        Call ansible playbook:

            - well mostly we run out of space in OpenStack so we rather try
              multiple times (attempts param)
            - dump any attempt failure

        :param args: command line arguments appended to the ansible-playbook command
        :param name: label used as a prefix in log messages
        :param attempts: maximum number of tries
        :return: output of the first successful run, or None if every attempt failed
        """
        # Ansible playbook python API does not work here, dunno why.  See:
        # https://groups.google.com/forum/#!topic/ansible-project/DNBD2oHv5k8

        command = "{0} {1}".format(ansible_playbook, args)

        result = None
        for i in range(0, attempts):
            try:
                attempt_desc = ": retry: " if i > 0 else ": begin: "
                self.callback.log(name + attempt_desc + command)

                result = subprocess.check_output(command, shell=True)
                self.callback.log("Raw playbook output:\n{0}\n".format(result))
                break

            except CalledProcessError as e:
                self.callback.log("CalledProcessError: \n{0}\n".format(e.output))
                sys.stderr.write("{0}\n".format(e.output))
                # There is nothing left to wait for after the last attempt.
                if i < attempts - 1:
                    # FIXME: this is not purpose of opts.sleeptime
                    time.sleep(self.opts.sleeptime)

        self.callback.log(name + ": end")
        return result

    def validate_vm(self):
        """
        Test connectivity to the VM at ``self.vm_ip``.

        :raises: :py:class:`~backend.exceptions.CoprWorkerSpawnFailError`: validation fails
        """
        # we were getting some dead instances
        # that's why I'm testing the connectivity here
        runner_options = dict(
            remote_user="******",
            host_list="{},".format(self.vm_ip),
            pattern=self.vm_ip,
            forks=1,
            transport=self.opts.ssh.transport,
            timeout=500
        )
        connection = ansible.runner.Runner(**runner_options)
        connection.module_name = "shell"
        connection.module_args = "echo hello"

        try:
            res = connection.run()
        except Exception as exception:
            raise CoprWorkerSpawnFailError(
                "Failed to check created VM ({}) "
                "due to ansible error: {}".format(self.vm_ip, exception))

        if self.vm_ip not in res.get("contacted", {}):
            self.callback.log(
                "Worker is not responding to the testing playbook. Terminating it. "
                "Runner options: {} ".format(runner_options) +
                "Ansible raw response:\n{}".format(res))
            raise CoprWorkerSpawnFailError("Created VM ({}) was unresponsive "
                                           "and therefore terminated".format(self.vm_ip))

    def try_spawn(self, args):
        """
        Tries to spawn new vm using ansible

        :param args: arguments for the ansible command which spawns the VM
        :return str: valid ip address of new machine (nobody guarantee machine availability)
        :raises: :py:class:`~backend.exceptions.CoprWorkerSpawnFailError`:
            no output, no ip in the output, or the ip does not parse
        """
        result = self.run_ansible_playbook(args, "spawning instance")
        if not result:
            raise CoprWorkerSpawnFailError("No result, trying again")
        match = re.search(r'IP=([^\{\}"]+)', result, re.MULTILINE)

        if not match:
            raise CoprWorkerSpawnFailError("No ip in the result, trying again")
        ipaddr = match.group(1)
        match = re.search(r'vm_name=([^\{\}"]+)', result, re.MULTILINE)

        if match:
            self.vm_name = match.group(1)
        self.callback.log("got instance ip: {0}".format(ipaddr))

        try:
            IP(ipaddr)
        except ValueError:
            # if we get here we're in trouble
            msg = "Invalid IP back from spawn_instance - dumping cache output\n"
            msg += str(result)
            raise CoprWorkerSpawnFailError(msg)

        return ipaddr

    def spawn_instance(self):
        """
        Spawn new VM, executing the following steps:

            - call the spawn playbook to startup/provision a building instance
            - get an IP and test if the builder responds
            - repeat this until you get an IP of working builder

        The result is stored in ``self.vm_ip`` (and ``self.vm_name``); nothing
        is returned. When the group has no ``spawn_playbook`` configured the
        method returns immediately and ``self.vm_ip`` is left untouched.
        """
        start = time.time()

        # Ansible playbook python API does not work here, dunno why.  See:
        # https://groups.google.com/forum/#!topic/ansible-project/DNBD2oHv5k8

        try:
            spawn_playbook = self.opts.build_groups[self.group_id]["spawn_playbook"]
        except KeyError:
            return

        spawn_args = "-c ssh {}".format(spawn_playbook)

        # TODO: replace with for i in range(MAX_SPAWN_TRIES): ... else raise FatalError
        i = 0
        while self.vm_ip is None:
            i += 1
            try:
                self.callback.log("Spawning a builder. Try No. {0}".format(i))

                self.vm_ip = self.try_spawn(spawn_args)
                self.update_process_title()
                try:
                    self.validate_vm()
                except CoprWorkerSpawnFailError:
                    # terminate_instance() resets vm_ip, so the loop retries
                    self.terminate_instance()
                    raise

                self.callback.log("Instance spawn/provision took {0} sec"
                                  .format(time.time() - start))

            except CoprWorkerSpawnFailError as exception:
                self.callback.log("VM Spawn attempt failed with message: {}"
                                  .format(exception.msg))

    def terminate_instance(self):
        """
        Call the terminate playbook to destroy the building instance
        """
        self.update_process_title(suffix="Terminating VM")
        term_args = {}
        if "ip" in self.opts.terminate_vars:
            term_args["ip"] = self.vm_ip
        if "vm_name" in self.opts.terminate_vars:
            term_args["vm_name"] = self.vm_name

        try:
            playbook = self.opts.build_groups[self.group_id]["terminate_playbook"]
        except KeyError:
            self.callback.log(
                "Fatal error: no terminate playbook for group_id: {}; exiting"
                .format(self.group_id))
            sys.exit(255)

        args = "-c ssh {} {}".format(
            playbook,
            ans_extra_vars_encode(term_args, "copr_task"))

        try:
            self.run_ansible_playbook(args, "terminate instance")
        except Exception as error:
            self.callback.log("Failed to terminate an instance: vm_name={}, vm_ip={}. Original error: {}"
                              .format(self.vm_name, self.vm_ip, error))

        # TODO: should we check that machine was destroyed?
        self.vm_ip = None
        self.vm_name = None
        self.update_process_title()

    def mark_started(self, job):
        """
        Send data about started build to the frontend

        :raises: :py:class:`~backend.exceptions.CoprWorkerError`: frontend update failed
        """
        job.status = 3  # running
        build = job.to_dict()
        self.callback.log("build: {}".format(build))

        data = {"builds": [build]}
        try:
            self.frontend_callback.update(data)
        except Exception as err:
            raise CoprWorkerError(
                "Could not communicate to front end to submit status info: {}"
                .format(err))

    def return_results(self, job):
        """
        Send the build results to the frontend

        :raises: :py:class:`~backend.exceptions.CoprWorkerError`: frontend update failed
        """
        self.callback.log(
            "{0} status {1}. Took {2} seconds".format(
                job.build_id, job.status, job.ended_on - job.started_on))

        self.callback.log("build: {}".format(job.to_dict()))
        data = {"builds": [job.to_dict()]}

        try:
            self.frontend_callback.update(data)
        except Exception as err:
            raise CoprWorkerError(
                "Could not communicate to front end to submit results: {}"
                .format(err)
            )

    def starting_build(self, job):
        """
        Announce to the frontend that a build is starting.

        :return True: if the build can start
        :return False: if the build can not start (build is cancelled)
        :raises: :py:class:`~backend.exceptions.CoprWorkerError`: frontend query failed
        """
        try:
            can_start = self.frontend_callback.starting_build(job.build_id, job.chroot)
        except Exception as err:
            raise CoprWorkerError(
                "Could not communicate to front end to confirm build start: {}"
                .format(err)
            )

        return can_start

    @classmethod
    def pkg_built_before(cls, pkg, chroot, destdir):
        """
        Check whether the package has already been built in this chroot.

        :return bool: True when ``<destdir>/<chroot>/<pkg name>/success`` exists
        """
        s_pkg = os.path.basename(pkg)
        pdn = s_pkg.replace(".src.rpm", "")
        resdir = os.path.normpath("{0}/{1}/{2}".format(destdir, chroot, pdn))
        return os.path.exists(resdir) and os.path.exists(os.path.join(resdir, "success"))

    def spawn_instance_with_check(self):
        """
        Wrapper around self.spawn_instance() with exception checking

        :raises:

            - :py:class:`~backend.exceptions.CoprWorkerError`: spawn function did not set an ip
            - :py:class:`AnsibleError`: failure during ansible command execution
        """
        self.update_process_title(suffix="Spawning a new VM")
        try:
            self.spawn_instance()
            if not self.vm_ip:
                # TODO: maybe add specific exception?
                raise CoprWorkerError(
                    "No IP found from creating instance")
        except AnsibleError as e:
            register_build_result(self.opts, failed=True)

            self.callback.log("failure to setup instance: {0}".format(e))
            raise

    def init_fedmsg(self):
        """
        Initialize Fedmsg
        (this assumes there are certs and a fedmsg config on disk)
        """
        if not (self.opts.fedmsg_enabled and fedmsg):
            return

        try:
            fedmsg.init(name="relay_inbound", cert_prefix="copr", active=True)
        except Exception as e:
            self.callback.log(
                "failed to initialize fedmsg: {0}".format(e))

    def on_pkg_skip(self, job):
        """
        Handle package skip
        """
        self._announce_start(job)
        self.callback.log(
            "Skipping: package {0} has been already built before.".format(job.pkg))
        job.status = BuildStatus.SKIPPED  # skipped
        self._announce_end(job)

    def obtain_job(self):
        """
        Retrieves new build task from queue.
        Checks if the new job can be started and not skipped.

        :return: the job to build, or None when there is nothing to do
        """
        self.update_process_title(suffix="No task")

        # this sometimes caused TypeError in random worker
        # when another one picked up a task to build
        # why?
        try:
            task = self.task_queue.dequeue()
        except TypeError:
            return
        if not task:
            return

        job = BuildJob(task.data, self.opts)

        self.update_process_title(suffix="Task: {} chroot: {}".format(job.build_id, job.chroot))

        # Checking whether the build is not cancelled
        if not self.starting_build(job):
            return

        # Checking whether to build or skip
        if self.pkg_built_before(job.pkg, job.chroot, job.destdir):
            self.on_pkg_skip(job)
            return

        # FIXME
        # this is our best place to sanity check the job before starting
        # up any longer process

        return job

    def do_job(self, job):
        """
        Executes new job.

        :param job: :py:class:`~backend.job.BuildJob`
        """
        self._announce_start(job)
        status = BuildStatus.SUCCEEDED
        chroot_destdir = os.path.normpath(job.destdir + '/' + job.chroot)

        # setup our target dir locally
        if not os.path.exists(chroot_destdir):
            try:
                os.makedirs(chroot_destdir)
            except (OSError, IOError) as e:
                msg = "Could not make results dir" \
                      " for job: {0} - {1}".format(chroot_destdir, str(e))

                self.callback.log(msg)
                status = BuildStatus.FAILURE

        if status == BuildStatus.SUCCEEDED:
            # FIXME
            # need a plugin hook or some mechanism to check random
            # info about the pkgs
            # this should use ansible to download the pkg on
            # the remote system
            # and run a series of checks on the package before we
            # start the build - most importantly license checks.

            self.callback.log(
                "Starting build: id={0} builder={1} timeout={2} destdir={3}"
                " chroot={4} repos={5}"
                .format(job.build_id, self.vm_ip, job.timeout, job.destdir,
                        job.chroot, str(job.repos)))

            self.callback.log("Building pkgs: {0}".format(job.pkg))

            chroot_repos = list(job.repos)
            chroot_repos.append(job.results + job.chroot + '/')
            chroot_repos.append(job.results + job.chroot + '/devel/')

            chroot_logfile = "{0}/build-{1}.log".format(
                chroot_destdir, job.build_id)

            macros = {
                "copr_username": job.project_owner,
                "copr_projectname": job.project_name,
                "vendor": "Fedora Project COPR ({0}/{1})".format(
                    job.project_owner, job.project_name)
            }

            try:
                mr = MockRemote(
                    builder_host=self.vm_ip,
                    job=job,
                    repos=chroot_repos,
                    macros=macros,
                    opts=self.opts,
                    lock=self.lock,
                    callback=CliLogCallBack(quiet=True, logfn=chroot_logfile),
                )
                mr.check()

                build_details = mr.build_pkg()
                job.update(build_details)

                if self.opts.do_sign:
                    mr.add_pubkey()

                register_build_result(self.opts)

            except MockRemoteError as e:
                # record and break
                self.callback.log("{0} - {1}".format(self.vm_ip, e))
                status = BuildStatus.FAILURE
                register_build_result(self.opts, failed=True)

            self.callback.log(
                "Finished build: id={0} builder={1} timeout={2} destdir={3}"
                " chroot={4} repos={5}"
                .format(job.build_id, self.vm_ip, job.timeout, job.destdir,
                        job.chroot, str(job.repos)))

        job.status = status
        self._announce_end(job)
        self.update_process_title(suffix="Task: {} chroot: {} done"
                                  .format(job.build_id, job.chroot))

    def check_vm_still_alive(self):
        """
        Ensure that if we have vm_ip it is alive.
        Terminates unresponsive instance.
        """
        if self.vm_ip:
            try:
                self.validate_vm()
            except CoprWorkerSpawnFailError:
                self.terminate_instance()

    def update_process_title(self, suffix=None):
        """ Set the process title to the worker's group, number, VM and ``suffix``. """
        title = "worker-{} {} ".format(self.group_name, self.worker_num)
        if self.vm_ip:
            title += "VM_IP={} ".format(self.vm_ip)
        if self.vm_name:
            title += "VM_NAME={} ".format(self.vm_name)
        if suffix:
            title += str(suffix)

        setproctitle(title)

    def run(self):
        """
        Worker should startup and check if it can function
        for each job it takes from the jobs queue
        run opts.setup_playbook to create the instance
        do the build (mockremote)
        terminate the instance.
        """
        self.init_fedmsg()

        while not self.kill_received:
            self.update_process_title()
            self.check_vm_still_alive()

            if self.opts.spawn_in_advance and not self.vm_ip:
                self.spawn_instance_with_check()

            job = self.obtain_job()
            if not job:
                time.sleep(self.opts.sleeptime)
                continue

            if not self.vm_ip:
                self.spawn_instance_with_check()

            try:
                self.do_job(job)
            except Exception as error:
                self.callback.log("Unhandled build error: {}".format(error))
            finally:
                # clean up the instance
                self.terminate_instance()
class Worker(multiprocessing.Process):
    """
    Worker process dispatches building tasks. Backend spin-up multiple workers, each
    worker associated to one group_id and process one task at the each moment.

    Worker listens for the new tasks from :py:class:`retask.Queue` associated with its group_id

    :param Munch opts: backend config
    :param frontend_client: client used to report build state to the frontend
    :param int worker_num: worker number
    :param int group_id: group_id from the set of groups defined in config
    """

    def __init__(self, opts, frontend_client, worker_num, group_id):
        # base class initialization
        multiprocessing.Process.__init__(self, name="worker-builder")

        self.opts = opts
        self.worker_num = worker_num
        self.group_id = group_id

        self.log = get_redis_logger(self.opts, self.logger_name, "worker")

        # job management stuff
        self.task_queue = Queue("copr-be-{0}".format(str(group_id)))
        self.task_queue.connect()
        # event queue for communicating back to dispatcher

        self.kill_received = False

        self.frontend_client = frontend_client
        self.vm_name = None
        self.vm_ip = None

        # redis connection, created in run()
        self.rc = None
        self.vmm = VmManager(self.opts)

    @property
    def logger_name(self):
        """ Name of this worker's logger. """
        return "backend.worker-{}-{}".format(self.group_name, self.worker_num)

    @property
    def group_name(self):
        """
        Name of this worker's build group as given in the config; falls back
        to ``str(group_id)`` when the name can not be read.
        """
        try:
            return self.opts.build_groups[self.group_id]["name"]
        except Exception as error:
            # This property is read (through logger_name) while __init__ is
            # still creating self.log, so the logger may not exist yet.
            log = getattr(self, "log", None)
            if log:
                log.exception(
                    "Failed to get builder group name from config, using group_id as name. "
                    "Original error: {}".format(error))
            return str(self.group_id)

    def fedmsg_notify(self, topic, template, content=None):
        """
        Publish message to fedmsg bus when it is available

        :param topic: fedmsg topic
        :param template: format string filled from ``content``
        :param content: dict of values; ``who`` and ``what`` keys are added to it
        """
        if self.opts.fedmsg_enabled and fedmsg:
            who = "worker-{0}".format(self.worker_num)

            content = content or {}
            content["who"] = who
            content["what"] = template.format(**content)

            try:
                fedmsg.publish(modname="copr", topic=topic, msg=content)
            # pylint: disable=W0703
            except Exception as e:
                self.log.exception("failed to publish message: {0}".format(e))

    def _announce_start(self, job):
        """
        Announce everywhere that a build process started now.
        """
        job.started_on = time.time()
        self.mark_started(job)

        template = "build start: user:{user} copr:{copr} " \
            "pkg: {pkg} build:{build} ip:{ip} pid:{pid}"

        content = dict(user=job.submitter, copr=job.project_name,
                       owner=job.project_owner, pkg=job.package_name,
                       build=job.build_id, ip=self.vm_ip, pid=self.pid)
        self.fedmsg_notify("build.start", template, content)

        template = "chroot start: chroot:{chroot} user:{user} " \
            "copr:{copr} pkg: {pkg} build:{build} ip:{ip} pid:{pid}"

        content = dict(chroot=job.chroot, user=job.submitter,
                       owner=job.project_owner, pkg=job.package_name,
                       copr=job.project_name, build=job.build_id,
                       ip=self.vm_ip, pid=self.pid)

        self.fedmsg_notify("chroot.start", template, content)

    def _announce_end(self, job):
        """
        Announce everywhere that a build process ended now.
        """
        job.ended_on = time.time()

        self.return_results(job)
        self.log.info("worker finished build: {0}".format(self.vm_ip))
        template = "build end: user:{user} copr:{copr} build:{build}" \
            " pkg: {pkg} version: {version} ip:{ip} pid:{pid} status:{status}"

        content = dict(user=job.submitter, copr=job.project_name,
                       owner=job.project_owner,
                       pkg=job.package_name, version=job.package_version,
                       build=job.build_id, ip=self.vm_ip, pid=self.pid,
                       status=job.status, chroot=job.chroot)
        self.fedmsg_notify("build.end", template, content)

    def mark_started(self, job):
        """
        Send data about started build to the frontend

        :raises CoprWorkerError: frontend update failed
        """
        job.status = BuildStatus.RUNNING
        build = job.to_dict()
        self.log.info("starting build: {}".format(build))

        data = {"builds": [build]}
        try:
            self.frontend_client.update(data)
        except Exception as err:
            raise CoprWorkerError(
                "Could not communicate to front end to submit status info: {}"
                .format(err))

    def return_results(self, job):
        """
        Send the build results to the frontend

        :raises CoprWorkerError: frontend update failed
        """
        self.log.info("Build {} finished with status {}. Took {} seconds"
                      .format(job.build_id, job.status, job.ended_on - job.started_on))

        data = {"builds": [job.to_dict()]}

        try:
            self.frontend_client.update(data)
        except Exception as err:
            raise CoprWorkerError(
                "Could not communicate to front end to submit results: {}"
                .format(err)
            )

    def starting_build(self, job):
        """
        Announce to the frontend that a build is starting.
        Checks if we can and/or should start job

        :return True: if the build can start
        :return False: if the build can not start (build is cancelled)
        :raises CoprWorkerError: frontend query failed
        """
        try:
            return self.frontend_client.starting_build(job.build_id, job.chroot)
        except Exception:
            msg = "Could not communicate to front end to confirm build start"
            self.log.exception(msg)
            raise CoprWorkerError(msg)

    @classmethod
    def pkg_built_before(cls, pkg, chroot, destdir):
        """
        Check whether the package has already been built in this chroot.

        :return bool: True when ``<destdir>/<chroot>/<pkg name>/success`` exists
        """
        s_pkg = os.path.basename(pkg)
        pdn = s_pkg.replace(".src.rpm", "")
        resdir = os.path.normpath("{0}/{1}/{2}".format(destdir, chroot, pdn))
        return os.path.exists(resdir) and os.path.exists(os.path.join(resdir, "success"))

    def init_fedmsg(self):
        """
        Initialize Fedmsg
        (this assumes there are certs and a fedmsg config on disk)
        """
        if not (self.opts.fedmsg_enabled and fedmsg):
            return

        try:
            fedmsg.init(name="relay_inbound", cert_prefix="copr", active=True)
        except Exception as e:
            self.log.exception("Failed to initialize fedmsg: {}".format(e))

    # TODO: doing skip logic on frontend during @start_build query

    def obtain_job(self):
        """
        Retrieves new build task from queue.

        :return: a BuildJob, or None when no task could be dequeued
        """
        # ToDo: remove retask, use redis lua fsm logic similar to VMM
        # this sometimes caused TypeError in random worker
        # when another one picked up a task to build
        # why?
        try:
            task = self.task_queue.dequeue()
        except TypeError:
            return
        if not task:
            return

        job = BuildJob(task.data, self.opts)
        self.update_process_title(suffix="Task: {} chroot: {}, obtained at {}"
                                  .format(job.build_id, job.chroot, str(datetime.now())))

        return job

    def do_job(self, job):
        """
        Executes new job.

        :param job: :py:class:`~backend.job.BuildJob`
        """
        self._announce_start(job)
        self.update_process_title(suffix="Task: {} chroot: {} build started"
                                  .format(job.build_id, job.chroot))
        status = BuildStatus.SUCCEEDED

        # setup our target dir locally
        if not os.path.exists(job.chroot_dir):
            try:
                os.makedirs(job.chroot_dir)
            except (OSError, IOError):
                self.log.exception("Could not make results dir for job: {}"
                                   .format(job.chroot_dir))
                status = BuildStatus.FAILURE

        self.clean_result_directory(job)

        if status == BuildStatus.SUCCEEDED:
            # FIXME
            # need a plugin hook or some mechanism to check random
            # info about the pkgs
            # this should use ansible to download the pkg on
            # the remote system
            # and run a series of checks on the package before we
            # start the build - most importantly license checks.

            self.log.info("Starting build: id={} builder={} job: {}"
                          .format(job.build_id, self.vm_ip, job))

            with local_file_logger(
                "{}.builder.mr".format(self.logger_name),
                job.chroot_log_path,
                fmt=build_log_format) as build_logger:
                try:
                    mr = MockRemote(
                        builder_host=self.vm_ip,
                        job=job,
                        logger=build_logger,
                        opts=self.opts
                    )
                    mr.check()

                    build_details = mr.build_pkg_and_process_results()
                    job.update(build_details)

                    if self.opts.do_sign:
                        mr.add_pubkey()

                    register_build_result(self.opts)

                except MockRemoteError as e:
                    # record and break
                    self.log.exception(
                        "Error during the build, host={}, build_id={}, chroot={}, error: {}"
                        .format(self.vm_ip, job.build_id, job.chroot, e)
                    )
                    status = BuildStatus.FAILURE
                    register_build_result(self.opts, failed=True)

            self.log.info(
                "Finished build: id={} builder={} timeout={} destdir={}"
                " chroot={} repos={}"
                .format(job.build_id, self.vm_ip, job.timeout, job.destdir,
                        job.chroot, str(job.repos)))

            self.copy_mock_logs(job)

        job.status = status
        self._announce_end(job)
        self.update_process_title(suffix="Task: {} chroot: {} done"
                                  .format(job.build_id, job.chroot))

    def copy_mock_logs(self, job):
        """ Gzip the chroot and rsync logs of ``job`` into its results directory. """
        if not os.path.isdir(job.results_dir):
            self.log.info("Job results dir doesn't exists, couldn't copy main log; path: {}"
                          .format(job.results_dir))
            return

        log_names = [(job.chroot_log_name, "mockchain.log.gz"),
                     (job.rsync_log_name, "rsync.log.gz")]

        for src_name, dst_name in log_names:
            src = os.path.join(job.chroot_dir, src_name)
            dst = os.path.join(job.results_dir, dst_name)
            try:
                with open(src, "rb") as f_src, gzip.open(dst, "wb") as f_dst:
                    f_dst.writelines(f_src)
            except IOError:
                self.log.info("File {} not found".format(src))

    def clean_result_directory(self, job):
        """
        Create backup directory and move there results from previous build.

        ``.info``/``.log``/``.log.gz`` files are moved to the backup directory,
        ``.rpm`` files are left in place, every other file is deleted and
        sub-directories are removed.
        """
        if not os.path.exists(job.results_dir) or not os.listdir(job.results_dir):
            return

        backup_dir_name = "prev_build_backup"
        backup_dir = os.path.join(job.results_dir, backup_dir_name)
        self.log.info("Cleaning target directory, results from previous build storing in {}"
                      .format(backup_dir))

        if not os.path.exists(backup_dir):
            os.makedirs(backup_dir)

        files = (x for x in os.listdir(job.results_dir) if x != backup_dir_name)
        for filename in files:
            file_path = os.path.join(job.results_dir, filename)
            if os.path.isfile(file_path):
                if file_path.endswith((".info", ".log", ".log.gz")):
                    os.rename(file_path, os.path.join(backup_dir, filename))
                elif not file_path.endswith(".rpm"):
                    os.remove(file_path)
            else:
                shutil.rmtree(file_path)

    def update_process_title(self, suffix=None):
        """ Set the process title to the worker's group, number, VM and ``suffix``. """
        title = "worker-{} {} ".format(self.group_name, self.worker_num)
        if self.vm_ip:
            title += "VM_IP={} ".format(self.vm_ip)
        if self.vm_name:
            title += "VM_NAME={} ".format(self.vm_name)
        if suffix:
            title += str(suffix)

        setproctitle(title)

    def notify_job_grab_about_task_end(self, job, do_reschedule=False):
        """
        Publish on the JOB_GRAB_TASK_END_PUBSUB channel that ``job`` should be
        removed, or rescheduled when ``do_reschedule`` is true.
        """
        # TODO: Current notification method is unreliable,
        # we should retask and use redis + lua for atomic acquire/release tasks
        request = {
            "action": "reschedule" if do_reschedule else "remove",
            "build_id": job.build_id,
            "task_id": job.task_id,
            "chroot": job.chroot,
        }

        self.rc.publish(JOB_GRAB_TASK_END_PUBSUB, json.dumps(request))

    def acquire_vm_for_job(self, job):
        """
        Keep asking the VM manager for a VM until one is handed out.

        :return: the acquired VM descriptor, or None when acquiring failed
            with an unexpected error
        """
        # TODO: replace acquire/release with context manager

        self.log.info("got job: {}, acquiring VM for build".format(str(job)))
        start_vm_wait_time = time.time()
        vmd = None
        while vmd is None:
            try:
                self.update_process_title(suffix="trying to acquire VM for job {} for {}s"
                                          .format(job.task_id, time.time() - start_vm_wait_time))
                vmd = self.vmm.acquire_vm(self.group_id, job.project_owner, os.getpid(),
                                          job.task_id, job.build_id, job.chroot)
            except NoVmAvailable as error:
                self.log.debug("No VM yet: {}".format(error))
                time.sleep(self.opts.sleeptime)
                continue
            except Exception as error:
                self.log.exception("Unhandled exception during VM acquire :{}".format(error))
                break
        return vmd

    def run_cycle(self):
        """ One iteration of the worker loop: obtain a job, acquire a VM, build, release. """
        self.update_process_title(suffix="trying to acquire job")

        time.sleep(self.opts.sleeptime)
        job = self.obtain_job()
        if not job:
            return

        try:
            if not self.starting_build(job):
                self.notify_job_grab_about_task_end(job)
                return
        except Exception:
            self.log.exception("Failed to check if job can be started")
            self.notify_job_grab_about_task_end(job)
            return

        vmd = self.acquire_vm_for_job(job)

        if vmd is None:
            self.notify_job_grab_about_task_end(job, do_reschedule=True)
        else:
            self.log.info("acquired VM: {} ip: {} for build {}".format(vmd.vm_name, vmd.vm_ip, job.task_id))
            # TODO: store self.vmd = vmd and use it
            self.vm_name = vmd.vm_name
            self.vm_ip = vmd.vm_ip

            try:
                self.do_job(job)
                self.notify_job_grab_about_task_end(job)
            except VmError as error:
                self.log.exception("Builder error, re-scheduling task: {}".format(error))
                self.notify_job_grab_about_task_end(job, do_reschedule=True)
            except Exception as error:
                self.log.exception("Unhandled build error: {}".format(error))
                self.notify_job_grab_about_task_end(job, do_reschedule=True)
            finally:
                # clean up the instance
                self.vmm.release_vm(vmd.vm_name)
                self.vm_ip = None
                self.vm_name = None

    def run(self):
        """ Process entry point: initialise connections, then loop over run_cycle(). """
        self.log.info("Starting worker")
        self.init_fedmsg()
        self.vmm.post_init()

        self.rc = get_redis_connection(self.opts)
        self.update_process_title(suffix="trying to acquire job")
        while not self.kill_received:
            self.run_cycle()
class Worker(multiprocessing.Process):
    """
    Worker process dispatches building tasks. Backend spin-up multiple workers,
    each worker associated to one group_id and process one task at the each
    moment.

    Worker listens for the new tasks from :py:class:`retask.Queue` associated
    with its group_id

    :param Munch opts: backend config
    :param frontend_client: object used to report build state to the frontend;
        this class calls its ``update(data)`` and
        ``starting_build(build_id, chroot)`` methods
    :param int worker_num: worker number
    :param int group_id: group_id from the set of groups defined in config
    """

    def __init__(self, opts, frontend_client, worker_num, group_id):
        # base class initialization
        multiprocessing.Process.__init__(self, name="worker-builder")

        self.opts = opts
        self.worker_num = worker_num
        self.group_id = group_id

        # NOTE: ``logger_name`` evaluates ``group_name`` while ``self.log``
        # does not exist yet; ``group_name`` is written to cope with that.
        self.log = get_redis_logger(self.opts, self.logger_name, "worker")

        # job management stuff
        self.task_queue = Queue("copr-be-{0}".format(str(group_id)))
        self.task_queue.connect()

        self.kill_received = False

        self.frontend_client = frontend_client
        self.vm_name = None
        self.vm_ip = None

        # redis connection, created in run()
        self.rc = None
        self.vmm = VmManager(self.opts)

    @property
    def logger_name(self):
        """Name of the logger used by this worker."""
        return "backend.worker-{}-{}".format(self.group_name, self.worker_num)

    @property
    def group_name(self):
        """
        Name of the builder group taken from the config.

        Falls back to ``str(group_id)`` when the name cannot be looked up.
        """
        try:
            return self.opts.build_groups[self.group_id]["name"]
        except Exception as error:
            # This property is first evaluated from __init__ (through
            # ``logger_name``) before ``self.log`` is assigned, so the logger
            # may legitimately be missing here.
            log = getattr(self, "log", None)
            if log is not None:
                log.exception(
                    "Failed to get builder group name from config, "
                    "using group_id as name. "
                    "Original error: {}".format(error))
            return str(self.group_id)

    def fedmsg_notify(self, topic, template, content=None):
        """
        Publish message to fedmsg bus when it is available

        :param topic: fedmsg topic
        :param template: format string rendered with ``content`` to produce
            the ``what`` field of the message
        :param content: dict with the message payload; ``who`` and ``what``
            keys are added to it in place
        """
        if not (self.opts.fedmsg_enabled and fedmsg):
            return

        who = "worker-{0}".format(self.worker_num)

        content = content or {}
        content["who"] = who
        content["what"] = template.format(**content)

        try:
            fedmsg.publish(modname="copr", topic=topic, msg=content)
        # pylint: disable=W0703
        except Exception as e:
            # publishing is best-effort: a broken bus must not fail the build
            self.log.exception("failed to publish message: {0}".format(e))

    def _announce_start(self, job):
        """
        Announce everywhere that a build process started now.
        """
        job.started_on = time.time()
        self.mark_started(job)

        # The pieces are joined by implicit literal concatenation, hence the
        # explicit leading space on every continuation line.
        template = "build start: user:{user} copr:{copr}" \
            " pkg: {pkg} build:{build} ip:{ip} pid:{pid}"

        content = dict(user=job.submitter, copr=job.project_name,
                       owner=job.project_owner, pkg=job.package_name,
                       build=job.build_id, ip=self.vm_ip, pid=self.pid)
        self.fedmsg_notify("build.start", template, content)

        template = "chroot start: chroot:{chroot} user:{user}" \
            " copr:{copr} pkg: {pkg} build:{build} ip:{ip} pid:{pid}"

        content = dict(chroot=job.chroot, user=job.submitter,
                       owner=job.project_owner, pkg=job.package_name,
                       copr=job.project_name, build=job.build_id,
                       ip=self.vm_ip, pid=self.pid)
        self.fedmsg_notify("chroot.start", template, content)

    def _announce_end(self, job):
        """
        Announce everywhere that a build process ended now.
        """
        job.ended_on = time.time()

        self.return_results(job)
        self.log.info("worker finished build: {0}".format(self.vm_ip))

        template = "build end: user:{user} copr:{copr} build:{build}" \
            " pkg: {pkg} version: {version} ip:{ip} pid:{pid} status:{status}"

        content = dict(user=job.submitter, copr=job.project_name,
                       owner=job.project_owner,
                       pkg=job.package_name, version=job.package_version,
                       build=job.build_id, ip=self.vm_ip, pid=self.pid,
                       status=job.status, chroot=job.chroot)
        self.fedmsg_notify("build.end", template, content)

    def mark_started(self, job):
        """
        Send data about started build to the frontend

        :raises CoprWorkerError: when the frontend update fails
        """
        job.status = BuildStatus.RUNNING
        build = job.to_dict()
        self.log.info("starting build: {}".format(build))

        data = {"builds": [build]}
        try:
            self.frontend_client.update(data)
        except Exception as err:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not turned into worker errors.
            raise CoprWorkerError(
                "Could not communicate to front end to submit status info: {}"
                .format(err))

    def return_results(self, job):
        """
        Send the build results to the frontend

        :raises CoprWorkerError: when the frontend update fails
        """
        self.log.info(
            "Build {} finished with status {}. Took {} seconds".format(
                job.build_id, job.status, job.ended_on - job.started_on))

        data = {"builds": [job.to_dict()]}

        try:
            self.frontend_client.update(data)
        except Exception as err:
            raise CoprWorkerError(
                "Could not communicate to front end to submit results: {}"
                .format(err))

    def starting_build(self, job):
        """
        Announce to the frontend that a build is starting.
        Checks if we can and/or should start job

        :return True: if the build can start
        :return False: if the build can not start (build is cancelled)
        :raises CoprWorkerError: when the frontend cannot be reached
        """
        try:
            return self.frontend_client.starting_build(job.build_id, job.chroot)
        except Exception:
            msg = "Could not communicate to front end to confirm build start"
            # log.exception records the traceback of the original error
            self.log.exception(msg)
            raise CoprWorkerError(msg)

    @classmethod
    def pkg_built_before(cls, pkg, chroot, destdir):
        """
        Check whether the package has already been built in this chroot.

        The package counts as built when ``<destdir>/<chroot>/<pkg name
        without ".src.rpm">/success`` exists.
        """
        s_pkg = os.path.basename(pkg)
        pdn = s_pkg.replace(".src.rpm", "")
        resdir = os.path.normpath("{0}/{1}/{2}".format(destdir, chroot, pdn))
        return os.path.exists(resdir) and \
            os.path.exists(os.path.join(resdir, "success"))

    def init_fedmsg(self):
        """
        Initialize Fedmsg (this assumes there are certs and a fedmsg config on disk)
        """
        if not (self.opts.fedmsg_enabled and fedmsg):
            return

        try:
            fedmsg.init(name="relay_inbound", cert_prefix="copr", active=True)
        except Exception as e:
            self.log.exception("Failed to initialize fedmsg: {}".format(e))

    # TODO: doing skip logic on fronted during @start_build query
    # def on_pkg_skip(self, job):
    #     """
    #     Handle package skip
    #     """
    #     self._announce_start(job)
    #     self.log.info("Skipping: package {} has been already built before.".format(job.pkg))
    #     job.status = BuildStatus.SKIPPED
    #     self.notify_job_grab_about_task_end(job)
    #     self._announce_end(job)

    def obtain_job(self):
        """
        Retrieves new build task from queue.

        :return: :py:class:`BuildJob` built from the task data, or ``None``
            when no task could be dequeued
        """
        # ToDo: remove retask, use redis lua fsm logic similar to VMM
        # this sometimes caused TypeError in random worker
        # when another one picked up a task to build
        # why?
        try:
            task = self.task_queue.dequeue()
        except TypeError:
            return
        if not task:
            return

        job = BuildJob(task.data, self.opts)
        self.update_process_title(
            suffix="Task: {} chroot: {}, obtained at {}".format(
                job.build_id, job.chroot, str(datetime.now())))

        return job

    def do_job(self, job):
        """
        Executes new job.

        :param job: :py:class:`~backend.job.BuildJob`
        """
        self._announce_start(job)
        self.update_process_title(
            suffix="Task: {} chroot: {} build started".format(
                job.build_id, job.chroot))
        status = BuildStatus.SUCCEEDED

        # setup our target dir locally
        if not os.path.exists(job.chroot_dir):
            try:
                os.makedirs(job.chroot_dir)
            except (OSError, IOError):
                self.log.exception(
                    "Could not make results dir for job: {}".format(
                        job.chroot_dir))
                status = BuildStatus.FAILURE

        self.clean_result_directory(job)

        if status == BuildStatus.SUCCEEDED:
            # FIXME
            # need a plugin hook or some mechanism to check random
            # info about the pkgs
            # this should use ansible to download the pkg on
            # the remote system
            # and run a series of checks on the package before we
            # start the build - most importantly license checks.

            self.log.info("Starting build: id={} builder={} job: {}".format(
                job.build_id, self.vm_ip, job))

            with local_file_logger("{}.builder.mr".format(self.logger_name),
                                   job.chroot_log_path,
                                   fmt=build_log_format) as build_logger:
                try:
                    mr = MockRemote(builder_host=self.vm_ip, job=job,
                                    logger=build_logger, opts=self.opts)
                    mr.check()

                    build_details = mr.build_pkg_and_process_results()
                    job.update(build_details)

                    if self.opts.do_sign:
                        mr.add_pubkey()

                    register_build_result(self.opts)

                except MockRemoteError as e:
                    # record and break
                    self.log.exception(
                        "Error during the build, host={}, build_id={}, chroot={}, error: {}"
                        .format(self.vm_ip, job.build_id, job.chroot, e))
                    status = BuildStatus.FAILURE
                    register_build_result(self.opts, failed=True)

            self.log.info(
                "Finished build: id={} builder={} timeout={} destdir={}"
                " chroot={} repos={}".format(job.build_id, self.vm_ip,
                                             job.timeout, job.destdir,
                                             job.chroot, str(job.repos)))

            self.copy_mock_logs(job)

        job.status = status
        self._announce_end(job)
        self.update_process_title(
            suffix="Task: {} chroot: {} done".format(job.build_id, job.chroot))

    def copy_mock_logs(self, job):
        """
        Gzip the chroot and rsync logs from ``job.chroot_dir`` into
        ``job.results_dir``. A log that cannot be read is reported and skipped.
        """
        if not os.path.isdir(job.results_dir):
            self.log.info(
                "Job results dir doesn't exists, couldn't copy main log; path: {}"
                .format(job.results_dir))
            return

        log_names = [(job.chroot_log_name, "mockchain.log.gz"),
                     (job.rsync_log_name, "rsync.log.gz")]

        for src_name, dst_name in log_names:
            src = os.path.join(job.chroot_dir, src_name)
            dst = os.path.join(job.results_dir, dst_name)
            try:
                with open(src, "rb") as f_src, gzip.open(dst, "wb") as f_dst:
                    f_dst.writelines(f_src)
            except IOError:
                self.log.info("File {} not found".format(src))

    def clean_result_directory(self, job):
        """
        Create backup directory and move there results from previous build.

        ``.info``/``.log``/``.log.gz`` files are moved to the backup
        directory, ``.rpm`` files are left in place, other regular files are
        deleted and every other entry is removed with ``shutil.rmtree``.
        """
        if not os.path.exists(job.results_dir) or \
                not os.listdir(job.results_dir):
            return

        backup_dir_name = "prev_build_backup"
        backup_dir = os.path.join(job.results_dir, backup_dir_name)
        self.log.info(
            "Cleaning target directory, results from previous build storing in {}"
            .format(backup_dir))

        if not os.path.exists(backup_dir):
            os.makedirs(backup_dir)

        # the backup directory itself lives inside results_dir: skip it
        files = (x for x in os.listdir(job.results_dir)
                 if x != backup_dir_name)
        for filename in files:
            file_path = os.path.join(job.results_dir, filename)
            if os.path.isfile(file_path):
                if file_path.endswith((".info", ".log", ".log.gz")):
                    os.rename(file_path, os.path.join(backup_dir, filename))
                elif not file_path.endswith(".rpm"):
                    os.remove(file_path)
            else:
                shutil.rmtree(file_path)

    def update_process_title(self, suffix=None):
        """Set the process title to the worker identity, VM info and ``suffix``."""
        title = "worker-{} {} ".format(self.group_name, self.worker_num)
        if self.vm_ip:
            title += "VM_IP={} ".format(self.vm_ip)
        if self.vm_name:
            title += "VM_NAME={} ".format(self.vm_name)
        if suffix:
            title += str(suffix)

        setproctitle(title)

    def notify_job_grab_about_task_end(self, job, do_reschedule=False):
        """
        Publish on the redis pubsub channel that the worker is done with the
        task, asking either to remove it or (``do_reschedule``) to reschedule.
        """
        # TODO: Current notification method is unreliable,
        # we should retask and use redis + lua for atomic acquire/release tasks
        request = {
            "action": "reschedule" if do_reschedule else "remove",
            "build_id": job.build_id,
            "task_id": job.task_id,
            "chroot": job.chroot,
        }

        self.rc.publish(JOB_GRAB_TASK_END_PUBSUB, json.dumps(request))

    def acquire_vm_for_job(self, job):
        """
        Ask the VM manager for a VM until one is acquired.

        :return: VM descriptor, or ``None`` when acquiring failed with an
            error other than :py:class:`NoVmAvailable`
        """
        # TODO: replace acquire/release with context manager

        self.log.info("got job: {}, acquiring VM for build".format(str(job)))
        start_vm_wait_time = time.time()
        vmd = None
        while vmd is None:
            try:
                self.update_process_title(
                    suffix="trying to acquire VM for job {} for {}s".format(
                        job.task_id, time.time() - start_vm_wait_time))
                vmd = self.vmm.acquire_vm(self.group_id, job.project_owner,
                                          os.getpid(), job.task_id,
                                          job.build_id, job.chroot)
            except NoVmAvailable as error:
                self.log.debug("No VM yet: {}".format(error))
                time.sleep(self.opts.sleeptime)
                continue
            except Exception as error:
                self.log.exception(
                    "Unhandled exception during VM acquire :{}".format(error))
                break
        return vmd

    def run_cycle(self):
        """
        One iteration of the worker loop: sleep, take a job, confirm with the
        frontend that it may start, acquire a VM, build and release the VM.
        """
        self.update_process_title(suffix="trying to acquire job")

        time.sleep(self.opts.sleeptime)
        job = self.obtain_job()
        if not job:
            return

        try:
            if not self.starting_build(job):
                self.notify_job_grab_about_task_end(job)
                return
        except Exception:
            self.log.exception("Failed to check if job can be started")
            self.notify_job_grab_about_task_end(job)
            return

        vmd = self.acquire_vm_for_job(job)

        if vmd is None:
            self.notify_job_grab_about_task_end(job, do_reschedule=True)
            return

        self.log.info("acquired VM: {} ip: {} for build {}".format(
            vmd.vm_name, vmd.vm_ip, job.task_id))
        # TODO: store self.vmd = vmd and use it
        self.vm_name = vmd.vm_name
        self.vm_ip = vmd.vm_ip

        try:
            self.do_job(job)
            self.notify_job_grab_about_task_end(job)
        except VmError as error:
            self.log.exception(
                "Builder error, re-scheduling task: {}".format(error))
            self.notify_job_grab_about_task_end(job, do_reschedule=True)
        except Exception as error:
            self.log.exception("Unhandled build error: {}".format(error))
            self.notify_job_grab_about_task_end(job, do_reschedule=True)
        finally:
            # clean up the instance
            self.vmm.release_vm(vmd.vm_name)
            self.vm_ip = None
            self.vm_name = None

    def run(self):
        """Process entry point: initialise connections, then loop over run_cycle()."""
        self.log.info("Starting worker")
        self.init_fedmsg()
        self.vmm.post_init()

        self.rc = get_redis_connection(self.opts)
        self.update_process_title(suffix="trying to acquire job")
        while not self.kill_received:
            self.run_cycle()
#!/usr/bin/python
# coding: utf-8

# Debugging helper: prints the data of every task waiting in the
# "copr-be-<N>" retask queues. retask offers no non-destructive read here,
# so every task is dequeued, printed and then put back into the same queue.

NUM_QUEUES = 2

import sys
# must happen before the ``backend`` import below
sys.path.append("/usr/share/copr/")

from retask.task import Task
from retask.queue import Queue
from backend.helpers import BackendConfigReader

opts = BackendConfigReader().read()
redis_config = {
    'host': opts['redis_host'],
    'port': opts['redis_port'],
    'db': opts['redis_db'],
}

for i in range(0, NUM_QUEUES):
    print("## Queue {}".format(i))
    q = Queue("copr-be-{}".format(i), config=redis_config)
    q.connect()
    save_q = []
    try:
        while q.length != 0:
            task = q.dequeue()
            if not task:
                # another consumer emptied the queue between the length
                # check and the dequeue
                break
            # parenthesised form prints the same under Python 2 and 3
            print(task.data)
            save_q.append(task)
    finally:
        # put the tasks back even when printing failed half-way, otherwise
        # the already dequeued builds would be lost
        for t in save_q:
            q.enqueue(t)
class Worker(multiprocessing.Process):
    """
    Worker process dispatches building tasks. Backend spin-up multiple workers,
    each worker associated to one group_id and process one task at the each
    moment.

    Worker listens for the new tasks from :py:class:`retask.Queue` associated
    with its group_id

    :param Bunch opts: backend config
    :param events: (:py:class:`multiprocessing.Queue`) queue to announce new events
    :param int worker_num: worker number
    :param int group_id: group_id from the set of groups defined in config
    :param callback: callback object to handle internal workers events.
        Should implement method ``log(msg)``.
    :param lock: (:py:class:`multiprocessing.Lock`) global backend lock
    """

    def __init__(self, opts, events, worker_num, group_id,
                 callback=None, lock=None):

        # base class initialization
        multiprocessing.Process.__init__(self, name="worker-builder")

        self.opts = opts

        # job management stuff
        self.task_queue = Queue("copr-be-{0}".format(str(group_id)))
        self.task_queue.connect()
        # event queue for communicating back to dispatcher
        self.events = events

        self.worker_num = worker_num
        self.group_id = group_id

        self.kill_received = False
        self.lock = lock

        self.frontend_callback = FrontendClient(opts, events)
        self.callback = callback
        if not self.callback:
            # NOTE: ``group_name`` is evaluated here while ``self.callback``
            # is still None; the property is written to cope with that.
            log_name = "worker-{0}-{1}.log".format(
                self.group_name, self.worker_num)

            self.logfile = os.path.join(self.opts.worker_logdir, log_name)
            self.callback = WorkerCallback(logfile=self.logfile)

        self.vm_name = None
        self.vm_ip = None
        self.callback.log("creating worker: dynamic ip")

    @property
    def group_name(self):
        """
        Name of the builder group taken from the config.

        Falls back to ``group_id`` when the name cannot be looked up.
        """
        try:
            return self.opts.build_groups[self.group_id]["name"]
        except Exception as error:
            # The default callback is created *after* this property is first
            # used in __init__, so there may be nothing to log to yet.
            callback = getattr(self, "callback", None)
            if callback is not None:
                callback.log(
                    "Failed to get builder group name from config, "
                    "using group_id as name. "
                    "Original error: {}".format(error))
            return self.group_id

    def event(self, topic, template, content=None):
        """ Multi-purpose logging method.

        Logs messages to three different destinations:
            - To log file
            - The internal "events" queue for communicating back to the
              dispatcher.
            - The fedmsg bus.  Messages are posted asynchronously to a
              zmq.PUB socket.

        :param topic: fedmsg topic
        :param template: format string rendered with ``content``
        :param content: dict with the message payload; ``who`` and ``what``
            keys are added to it in place when fedmsg is enabled
        """

        content = content or {}
        what = template.format(**content)
        who = "worker-{0}".format(self.worker_num)

        self.callback.log("event: who: {0}, what: {1}".format(who, what))
        self.events.put({"when": time.time(), "who": who, "what": what})

        if self.opts.fedmsg_enabled and fedmsg:
            content["who"] = who
            content["what"] = what
            try:
                fedmsg.publish(modname="copr", topic=topic, msg=content)
            # pylint: disable=W0703
            except Exception as e:
                # XXX - Maybe log traceback as well with traceback.format_exc()
                self.callback.log("failed to publish message: {0}".format(e))

    def _announce_start(self, job):
        """
        Announce everywhere that a build process started now.
        """
        job.started_on = time.time()
        self.mark_started(job)

        # The pieces are joined by implicit literal concatenation, hence the
        # explicit leading space on every continuation line.
        template = "build start: user:{user} copr:{copr}" \
            " pkg: {pkg} build:{build} ip:{ip} pid:{pid}"

        content = dict(user=job.submitter, copr=job.project_name,
                       owner=job.project_owner, pkg=job.pkg_name,
                       build=job.build_id, ip=self.vm_ip, pid=self.pid)
        self.event("build.start", template, content)

        template = "chroot start: chroot:{chroot} user:{user}" \
            " copr:{copr} pkg: {pkg} build:{build} ip:{ip} pid:{pid}"

        content = dict(chroot=job.chroot, user=job.submitter,
                       owner=job.project_owner, pkg=job.pkg_name,
                       copr=job.project_name, build=job.build_id,
                       ip=self.vm_ip, pid=self.pid)

        self.event("chroot.start", template, content)

    def _announce_end(self, job):
        """
        Announce everywhere that a build process ended now.
        """
        job.ended_on = time.time()

        self.return_results(job)
        self.callback.log("worker finished build: {0}".format(self.vm_ip))
        template = "build end: user:{user} copr:{copr} build:{build}" \
            " pkg: {pkg} version: {version} ip:{ip} pid:{pid} status:{status}"

        content = dict(user=job.submitter, copr=job.project_name,
                       owner=job.project_owner,
                       pkg=job.pkg_name, version=job.pkg_version,
                       build=job.build_id, ip=self.vm_ip, pid=self.pid,
                       status=job.status, chroot=job.chroot)
        self.event("build.end", template, content)

    def run_ansible_playbook(self, args, name="running playbook", attempts=9):
        """
        Call ansible playbook:

            - well mostly we run out of space in OpenStack so we rather try
              multiple times (attempts param)
            - dump any attempt failure

        :return: raw playbook output, or ``None`` when every attempt failed
        """

        # Ansible playbook python API does not work here, dunno why.  See:
        # https://groups.google.com/forum/#!topic/ansible-project/DNBD2oHv5k8

        # NOTE(review): the command is executed through the shell; ``args``
        # is built from the backend config and VM data, not from user input.
        command = "{0} {1}".format(ansible_playbook, args)

        result = None
        for i in range(0, attempts):
            try:
                attempt_desc = ": retry: " if i > 0 else ": begin: "
                self.callback.log(name + attempt_desc + command)

                result = subprocess.check_output(command, shell=True)
                self.callback.log("Raw playbook output:\n{0}\n".format(result))
                break

            except CalledProcessError as e:
                self.callback.log("CalledProcessError: \n{0}\n".format(
                    e.output))
                sys.stderr.write("{0}\n".format(e.output))
                # FIXME: this is not purpose of opts.sleeptime
                time.sleep(self.opts.sleeptime)

        self.callback.log(name + ": end")
        return result

    def validate_vm(self):
        """
        Test connectivity to the VM at ``self.vm_ip``

        :raises: :py:class:`~backend.exceptions.CoprWorkerSpawnFailError`: validation fails
        """
        # we were getting some dead instances
        # that's why I'm testing the connectivity here

        runner_options = dict(remote_user="******",
                              host_list="{},".format(self.vm_ip),
                              pattern=self.vm_ip,
                              forks=1,
                              transport=self.opts.ssh.transport,
                              timeout=500)
        connection = ansible.runner.Runner(**runner_options)
        connection.module_name = "shell"
        connection.module_args = "echo hello"

        try:
            res = connection.run()
        except Exception as exception:
            raise CoprWorkerSpawnFailError(
                "Failed to check created VM ({}) "
                "due to ansible error: {}".format(self.vm_ip, exception))

        if self.vm_ip not in res.get("contacted", {}):
            self.callback.log(
                "Worker is not responding to the testing playbook. "
                "Terminating it. Runner options: {}\n"
                "Ansible raw response:\n{}".format(runner_options, res))
            raise CoprWorkerSpawnFailError(
                "Created VM ({}) was unresponsive "
                "and therefore terminated".format(self.vm_ip))

    def try_spawn(self, args):
        """
        Tries to spawn new vm using ansible

        :param args: arguments for the ansible command which spawns VM
        :return str: valid ip address of new machine (nobody guarantee machine availability)
        :raises CoprWorkerSpawnFailError: when the playbook gives no output,
            no ``IP=...`` marker or an invalid address
        """
        result = self.run_ansible_playbook(args, "spawning instance")
        if not result:
            raise CoprWorkerSpawnFailError("No result, trying again")

        match = re.search(r'IP=([^\{\}"]+)', result, re.MULTILINE)
        if not match:
            raise CoprWorkerSpawnFailError("No ip in the result, trying again")
        ipaddr = match.group(1)

        # the VM name is optional in the playbook output
        match = re.search(r'vm_name=([^\{\}"]+)', result, re.MULTILINE)
        if match:
            self.vm_name = match.group(1)

        self.callback.log("got instance ip: {0}".format(ipaddr))

        try:
            IP(ipaddr)
        except ValueError:
            # if we get here we"re in trouble
            msg = "Invalid IP back from spawn_instance - dumping cache output\n"
            msg += str(result)
            raise CoprWorkerSpawnFailError(msg)

        return ipaddr

    def spawn_instance(self):
        """
        Spawn new VM, executing the following steps:

            - call the spawn playbook to startup/provision a building instance
            - get an IP and test if the builder responds
            - repeat this until you get an IP of working builder

        The address is stored in ``self.vm_ip``. Returns early, leaving
        ``self.vm_ip`` untouched, when no spawn playbook is configured for
        the group.
        """

        start = time.time()

        # Ansible playbook python API does not work here, dunno why.  See:
        # https://groups.google.com/forum/#!topic/ansible-project/DNBD2oHv5k8

        try:
            spawn_playbook = self.opts.build_groups[
                self.group_id]["spawn_playbook"]
        except KeyError:
            return

        spawn_args = "-c ssh {}".format(spawn_playbook)

        # TODO: replace with for i in range(MAX_SPAWN_TRIES): ... else raise FatalError
        i = 0
        while self.vm_ip is None:
            i += 1
            try:
                self.callback.log("Spawning a builder. Try No. {0}".format(i))

                self.vm_ip = self.try_spawn(spawn_args)
                self.update_process_title()
                try:
                    self.validate_vm()
                except CoprWorkerSpawnFailError:
                    # terminate_instance() resets vm_ip to None, which makes
                    # the loop try again
                    self.terminate_instance()
                    raise

                self.callback.log(
                    "Instance spawn/provision took {0} sec".format(
                        time.time() - start))

            except CoprWorkerSpawnFailError as exception:
                self.callback.log(
                    "VM Spawn attempt failed with message: {}".format(
                        exception.msg))

    def terminate_instance(self):
        """
        Call the terminate playbook to destroy the building instance

        Exits the process with status 255 when no terminate playbook is
        configured for the group.
        """
        self.update_process_title(suffix="Terminating VM")
        term_args = {}
        if "ip" in self.opts.terminate_vars:
            term_args["ip"] = self.vm_ip
        if "vm_name" in self.opts.terminate_vars:
            term_args["vm_name"] = self.vm_name

        try:
            playbook = self.opts.build_groups[
                self.group_id]["terminate_playbook"]
        except KeyError:
            self.callback.log(
                "Fatal error: no terminate playbook for group_id: {}; exiting"
                .format(self.group_id))
            sys.exit(255)

        # args = "-c ssh -i '{0},' {1} {2}".format(
        args = "-c ssh {} {}".format(
            # self.vm_ip,
            playbook,
            ans_extra_vars_encode(term_args, "copr_task"))

        try:
            self.run_ansible_playbook(args, "terminate instance")
        except Exception as error:
            self.callback.log(
                "Failed to terminate an instance: vm_name={}, vm_ip={}. Original error: {}"
                .format(self.vm_name, self.vm_ip, error))

        # TODO: should we check that machine was destroyed?
        self.vm_ip = None
        self.vm_name = None
        self.update_process_title()

    def mark_started(self, job):
        """
        Send data about started build to the frontend

        :raises CoprWorkerError: when the frontend update fails
        """

        job.status = 3  # running
        build = job.to_dict()
        self.callback.log("build: {}".format(build))

        data = {"builds": [build]}
        try:
            self.frontend_callback.update(data)
        except Exception as err:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not turned into worker errors.
            raise CoprWorkerError(
                "Could not communicate to front end to submit status info: {}"
                .format(err))

    def return_results(self, job):
        """
        Send the build results to the frontend

        :raises CoprWorkerError: when the frontend update fails
        """
        self.callback.log("{0} status {1}. Took {2} seconds".format(
            job.build_id, job.status, job.ended_on - job.started_on))

        self.callback.log("build: {}".format(job.to_dict()))
        data = {"builds": [job.to_dict()]}

        try:
            self.frontend_callback.update(data)
        except Exception as err:
            raise CoprWorkerError(
                "Could not communicate to front end to submit results: {}"
                .format(err))

    def starting_build(self, job):
        """
        Announce to the frontend that a build is starting.

        :return True: if the build can start
        :return False: if the build can not start (build is cancelled)
        :raises CoprWorkerError: when the frontend cannot be reached
        """

        try:
            can_start = self.frontend_callback.starting_build(
                job.build_id, job.chroot)
        except Exception as err:
            raise CoprWorkerError(
                "Could not communicate to front end to confirm build start: {}"
                .format(err))

        return can_start

    @classmethod
    def pkg_built_before(cls, pkg, chroot, destdir):
        """
        Check whether the package has already been built in this chroot.

        The package counts as built when ``<destdir>/<chroot>/<pkg name
        without ".src.rpm">/success`` exists.
        """
        s_pkg = os.path.basename(pkg)
        pdn = s_pkg.replace(".src.rpm", "")
        resdir = os.path.normpath("{0}/{1}/{2}".format(destdir, chroot, pdn))
        return os.path.exists(resdir) and \
            os.path.exists(os.path.join(resdir, "success"))

    def spawn_instance_with_check(self):
        """
        Wrapper around self.spawn_instance() with exception checking

        The address of the spawned VM is stored in ``self.vm_ip``.

        :raises:

            - :py:class:`~backend.exceptions.CoprWorkerError`: spawn function doesn't return ip
            - :py:class:`AnsibleError`: failure during ansible command execution
        """
        self.update_process_title(suffix="Spawning a new VM")
        try:
            self.spawn_instance()
            if not self.vm_ip:
                # TODO: maybe add specific exception?
                raise CoprWorkerError("No IP found from creating instance")
        except AnsibleError as e:
            register_build_result(self.opts, failed=True)

            self.callback.log("failure to setup instance: {0}".format(e))
            raise

    def init_fedmsg(self):
        """
        Initialize Fedmsg (this assumes there are certs and a fedmsg config on disk)
        """

        if not (self.opts.fedmsg_enabled and fedmsg):
            return

        try:
            fedmsg.init(name="relay_inbound", cert_prefix="copr", active=True)
        except Exception as e:
            self.callback.log("failed to initialize fedmsg: {0}".format(e))

    def on_pkg_skip(self, job):
        """
        Handle package skip
        """
        self._announce_start(job)
        self.callback.log(
            "Skipping: package {0} has been already built before.".format(
                job.pkg))
        job.status = BuildStatus.SKIPPED  # skipped
        self._announce_end(job)

    def obtain_job(self):
        """
        Retrieves new build task from queue.
        Checks if the new job can be started and not skipped.

        :return: :py:class:`BuildJob` to build, or ``None`` when there is no
            task, the build was cancelled or the package was skipped
        """
        self.update_process_title(suffix="No task")

        # this sometimes caused TypeError in random worker
        # when another one picked up a task to build
        # why?
        try:
            task = self.task_queue.dequeue()
        except TypeError:
            return
        if not task:
            return

        job = BuildJob(task.data, self.opts)

        self.update_process_title(
            suffix="Task: {} chroot: {}".format(job.build_id, job.chroot))

        # Checking whether the build is not cancelled
        if not self.starting_build(job):
            return

        # Checking whether to build or skip
        if self.pkg_built_before(job.pkg, job.chroot, job.destdir):
            self.on_pkg_skip(job)
            return

        # FIXME
        # this is our best place to sanity check the job before starting
        # up any longer process

        return job

    def do_job(self, job):
        """
        Executes new job.

        :param job: :py:class:`~backend.job.BuildJob`
        """
        self._announce_start(job)
        status = BuildStatus.SUCCEEDED
        chroot_destdir = os.path.normpath(job.destdir + '/' + job.chroot)

        # setup our target dir locally
        if not os.path.exists(chroot_destdir):
            try:
                os.makedirs(chroot_destdir)
            except (OSError, IOError) as e:
                msg = "Could not make results dir" \
                      " for job: {0} - {1}".format(chroot_destdir, str(e))

                self.callback.log(msg)
                status = BuildStatus.FAILURE

        if status == BuildStatus.SUCCEEDED:
            # FIXME
            # need a plugin hook or some mechanism to check random
            # info about the pkgs
            # this should use ansible to download the pkg on
            # the remote system
            # and run a series of checks on the package before we
            # start the build - most importantly license checks.

            self.callback.log(
                "Starting build: id={0} builder={1} timeout={2} destdir={3}"
                " chroot={4} repos={5}".format(job.build_id, self.vm_ip,
                                               job.timeout, job.destdir,
                                               job.chroot, str(job.repos)))

            self.callback.log("Building pkgs: {0}".format(job.pkg))

            # the project's own results are added as extra repositories
            chroot_repos = list(job.repos)
            chroot_repos.append(job.results + job.chroot + '/')
            chroot_repos.append(job.results + job.chroot + '/devel/')

            chroot_logfile = "{0}/build-{1}.log".format(
                chroot_destdir, job.build_id)

            macros = {
                "copr_username": job.project_owner,
                "copr_projectname": job.project_name,
                "vendor": "Fedora Project COPR ({0}/{1})".format(
                    job.project_owner, job.project_name)
            }

            try:
                mr = MockRemote(
                    builder_host=self.vm_ip,
                    job=job,
                    repos=chroot_repos,
                    macros=macros,
                    opts=self.opts,
                    lock=self.lock,
                    callback=CliLogCallBack(quiet=True, logfn=chroot_logfile),
                )
                mr.check()

                build_details = mr.build_pkg()
                job.update(build_details)

                if self.opts.do_sign:
                    mr.add_pubkey()

                register_build_result(self.opts)

            except MockRemoteError as e:
                # record and break
                self.callback.log("{0} - {1}".format(self.vm_ip, e))
                status = BuildStatus.FAILURE
                register_build_result(self.opts, failed=True)

            self.callback.log(
                "Finished build: id={0} builder={1} timeout={2} destdir={3}"
                " chroot={4} repos={5}".format(job.build_id, self.vm_ip,
                                               job.timeout, job.destdir,
                                               job.chroot, str(job.repos)))

        job.status = status
        self._announce_end(job)
        self.update_process_title(
            suffix="Task: {} chroot: {} done".format(job.build_id, job.chroot))

    def check_vm_still_alive(self):
        """
        Ensure that if we have vm_ip it is alive.
        Terminates unresponsive instance.
        """
        if self.vm_ip:
            try:
                self.validate_vm()
            except CoprWorkerSpawnFailError:
                self.terminate_instance()

    def update_process_title(self, suffix=None):
        """Set the process title to the worker identity, VM info and ``suffix``."""
        title = "worker-{} {} ".format(self.group_name, self.worker_num)
        if self.vm_ip:
            title += "VM_IP={} ".format(self.vm_ip)
        if self.vm_name:
            title += "VM_NAME={} ".format(self.vm_name)
        if suffix:
            title += str(suffix)

        setproctitle(title)

    def run(self):
        """
        Worker should startup and check if it can function
        for each job it takes from the jobs queue
        run opts.setup_playbook to create the instance
        do the build (mockremote)
        terminate the instance.
        """
        self.init_fedmsg()

        while not self.kill_received:
            self.update_process_title()
            self.check_vm_still_alive()

            if self.opts.spawn_in_advance and not self.vm_ip:
                self.spawn_instance_with_check()

            job = self.obtain_job()
            if not job:
                time.sleep(self.opts.sleeptime)
                continue

            if not self.vm_ip:
                self.spawn_instance_with_check()

            try:
                self.do_job(job)
            except Exception as error:
                self.callback.log("Unhandled build error: {}".format(error))
            finally:
                # clean up the instance
                self.terminate_instance()
# coding: utf-8

# Debugging helper: prints the data of every task waiting in the
# "copr-be-<N>" retask queues. retask offers no non-destructive read here,
# so every task is dequeued, printed and then put back into the same queue.

NUM_QUEUES = 2

import sys
# must happen before the ``backend`` import below
sys.path.append("/usr/share/copr/")

from retask.task import Task
from retask.queue import Queue
from backend.helpers import BackendConfigReader

opts = BackendConfigReader().read()
redis_config = {
    'host': opts['redis_host'],
    'port': opts['redis_port'],
    'db': opts['redis_db'],
}

for i in range(0, NUM_QUEUES):
    print("## Queue {}".format(i))
    q = Queue("copr-be-{}".format(i), config=redis_config)
    q.connect()
    save_q = []
    try:
        while q.length != 0:
            task = q.dequeue()
            if not task:
                # another consumer emptied the queue between the length
                # check and the dequeue
                break
            # parenthesised form prints the same under Python 2 and 3
            print(task.data)
            save_q.append(task)
    finally:
        # put the tasks back even when printing failed half-way, otherwise
        # the already dequeued builds would be lost
        for t in save_q:
            q.enqueue(t)
# Example consumer: drains the 'example' retask queue and prints the data
# of every task.
from retask.task import Task
from retask.queue import Queue

queue = Queue('example')
queue.connect()
while queue.length != 0:
    task = queue.dequeue()
    if not task:
        # the queue was emptied by someone else after the length check
        break
    # parenthesised form prints the same under Python 2 and 3
    print(task.data)