def _skip_as_dead(job_config, failure_reason):
    """Report the job as dead with *failure_reason* and raise SkipJob.

    Factored out because prep_job marks jobs dead in three distinct
    failure paths; the reporting/raising sequence is identical in each.
    """
    report.try_push_job_info(
        job_config, dict(status='dead', failure_reason=failure_reason))
    raise SkipJob()


def prep_job(job_config, log_file_path, archive_dir):
    """Resolve and fetch everything a job needs before it can be run.

    Fills in derived fields on *job_config* (worker_log, archive_path,
    teuthology_branch, suite_path), fetches the requested teuthology and
    qa-suite checkouts, and locates the bootstrapped virtualenv.

    :param job_config:    dict parsed from the queued job's YAML body;
                          mutated in place and also returned.
    :param log_file_path: path of this worker's log file, recorded in
                          the job config as ``worker_log``.
    :param archive_dir:   base directory under which the job's archive
                          directory is created.
    :returns: tuple of (job_config, teuth_bin_path) where teuth_bin_path
              is the ``virtualenv/bin`` directory of the checkout to use.
    :raises SkipJob: if the requested branch/sha1 cannot be resolved or
                     fetched; the job is reported as dead first.
    :raises RuntimeError: if the checkout exists but was never
                          bootstrapped (no virtualenv/bin directory).
    """
    job_id = job_config['job_id']
    # munge() sanitizes the run name so it is safe as a path component.
    safe_archive = safepath.munge(job_config['name'])
    job_config['worker_log'] = log_file_path
    archive_path_full = os.path.join(archive_dir, safe_archive, str(job_id))
    job_config['archive_path'] = archive_path_full

    # If the teuthology branch was not specified, default to master and
    # store that value.
    teuthology_branch = job_config.get('teuthology_branch', 'master')
    job_config['teuthology_branch'] = teuthology_branch
    teuthology_sha1 = job_config.get('teuthology_sha1')
    if not teuthology_sha1:
        # No explicit sha1 requested; resolve the branch tip remotely.
        repo_url = build_git_url('teuthology', 'ceph')
        teuthology_sha1 = ls_remote(repo_url, teuthology_branch)
        if not teuthology_sha1:
            reason = "Teuthology branch {} not found; marking job as dead".format(
                teuthology_branch)
            log.error(reason)
            _skip_as_dead(job_config, reason)
    log.info('Using teuthology sha1 %s', teuthology_sha1)

    try:
        # A locally-configured teuthology_path overrides any fetch.
        if teuth_config.teuthology_path is not None:
            teuth_path = teuth_config.teuthology_path
        else:
            teuth_path = fetch_teuthology(branch=teuthology_branch,
                                          commit=teuthology_sha1)
        # For the teuthology tasks, we look for suite_branch, and if we
        # don't get that, we look for branch, and fall back to 'master'.
        # last-in-suite jobs don't have suite_branch or branch set.
        ceph_branch = job_config.get('branch', 'master')
        suite_branch = job_config.get('suite_branch', ceph_branch)
        suite_sha1 = job_config.get('suite_sha1')
        suite_repo = job_config.get('suite_repo')
        if suite_repo:
            teuth_config.ceph_qa_suite_git_url = suite_repo
        job_config['suite_path'] = os.path.normpath(
            os.path.join(
                fetch_qa_suite(suite_branch, suite_sha1),
                job_config.get('suite_relpath', ''),
            ))
    except (BranchNotFoundError, CommitNotFoundError) as exc:
        log.exception("Requested version not found; marking job as dead")
        _skip_as_dead(job_config, str(exc))
    except MaxWhileTries as exc:
        log.exception("Failed to fetch or bootstrap; marking job as dead")
        _skip_as_dead(job_config, str(exc))

    teuth_bin_path = os.path.join(teuth_path, 'virtualenv', 'bin')
    if not os.path.isdir(teuth_bin_path):
        raise RuntimeError("teuthology branch %s at %s not bootstrapped!" %
                           (teuthology_branch, teuth_bin_path))
    return job_config, teuth_bin_path
def fetch_repos(branch, test_name):
    """
    Fetch the suite repo (and also the teuthology repo) so that we can use it
    to build jobs. Repos are stored in ~/src/.

    The reason the teuthology repo is also fetched is that currently we use
    subprocess to call teuthology-schedule to schedule jobs so we need to make
    sure it is up-to-date. For that reason we always fetch the master branch
    for test scheduling, regardless of what teuthology branch is requested for
    testing.

    :returns: The path to the suite repo on disk
    """
    try:
        # Only touch the teuthology checkout under automated scheduling;
        # when a user schedules from their own copy, leave it alone. A
        # configured teuthology_path likewise means "don't fetch".
        # (We use teuthology's master branch in all cases right now.)
        if config.automated_scheduling and config.teuthology_path is None:
            fetch_teuthology('master')
        qa_suite_path = fetch_qa_suite(branch)
    except BranchNotFoundError as exc:
        # schedule_fail is expected to abort scheduling for this run.
        schedule_fail(message=str(exc), name=test_name)
    return qa_suite_path
def main(ctx):
    """Worker entry point: pull jobs from a beanstalk tube and run them.

    Loops forever (until a job carries ``stop_worker`` or a sentinel file
    triggers stop/restart), reserving one job at a time from the tube
    named by ``ctx.tube``, preparing it via prep_job() and executing it
    via run_job().

    :param ctx: parsed command-line context; reads ``verbose``,
                ``log_dir``, ``tube`` and ``archive_dir`` attributes.
    """
    loglevel = logging.INFO
    if ctx.verbose:
        loglevel = logging.DEBUG
    log.setLevel(loglevel)
    log_file_path = os.path.join(
        ctx.log_dir,
        'worker.{tube}.{pid}'.format(
            pid=os.getpid(),
            tube=ctx.tube,
        ))
    setup_log_file(log_file_path)
    install_except_hook()
    load_config(ctx=ctx)
    set_config_attr(ctx)
    connection = beanstalk.connect()
    beanstalk.watch_tube(connection, ctx.tube)
    result_proc = None

    # Pre-fetch master checkouts once at startup; a configured
    # teuthology_path means "use the local checkout, don't fetch".
    if teuth_config.teuthology_path is None:
        fetch_teuthology('master')
    fetch_qa_suite('master')

    keep_running = True
    while keep_running:
        # Check to see if we have a teuthology-results process hanging around
        # and if so, read its return code so that it can exit.
        if result_proc is not None and result_proc.poll() is not None:
            log.debug("teuthology-results exited with code: %s",
                      result_proc.returncode)
            result_proc = None

        # Sentinel files let an operator restart or stop the worker
        # cleanly between jobs.
        if sentinel(restart_file_path):
            restart()
        elif sentinel(stop_file_path):
            stop()

        # Re-read config each iteration so changes take effect without a
        # restart.
        load_config()

        job = connection.reserve(timeout=60)
        if job is None:
            continue

        # bury the job so it won't be re-run if it fails
        job.bury()
        job_id = job.jid
        log.info('Reserved job %d', job_id)
        log.info('Config is: %s', job.body)
        job_config = yaml.safe_load(job.body)
        job_config['job_id'] = str(job_id)

        # A stop_worker job means: run this one last job, then exit the
        # loop.
        if job_config.get('stop_worker'):
            keep_running = False

        try:
            job_config, teuth_bin_path = prep_job(
                job_config,
                log_file_path,
                ctx.archive_dir,
            )
            run_job(
                job_config,
                teuth_bin_path,
                ctx.archive_dir,
                ctx.verbose,
            )
        except SkipJob:
            # prep_job already reported the job as dead; move on.
            continue

        # This try/except block is to keep the worker from dying when
        # beanstalkc throws a SocketError
        try:
            job.delete()
        except Exception:
            log.exception("Saw exception while trying to delete job")
def main(args):
    """Dispatcher entry point: pull jobs from a beanstalk tube and spawn a
    supervisor subprocess for each one.

    Unlike the worker, the dispatcher does not run jobs in-process; it
    locks machines, writes the job config to the archive dir, and starts
    ``teuthology-dispatcher --supervisor`` to actually execute the job.

    :param args: docopt-style dict; reads ``--supervisor``, ``--verbose``,
                 ``--tube``, ``--log-dir`` and ``--archive-dir``.
    """
    # run dispatcher in job supervisor mode if --supervisor passed
    if args["--supervisor"]:
        return supervisor.main(args)

    verbose = args["--verbose"]
    tube = args["--tube"]
    log_dir = args["--log-dir"]
    archive_dir = args["--archive-dir"]

    if archive_dir is None:
        archive_dir = teuth_config.archive_base

    # setup logging for dispatcher in {log_dir}
    loglevel = logging.INFO
    if verbose:
        loglevel = logging.DEBUG
    log.setLevel(loglevel)
    log_file_path = os.path.join(log_dir, f"dispatcher.{tube}.{os.getpid()}")
    setup_log_file(log_file_path)
    install_except_hook()

    load_config(archive_dir=archive_dir)

    connection = beanstalk.connect()
    beanstalk.watch_tube(connection, tube)
    result_proc = None

    # Pre-fetch master checkouts once at startup; a configured
    # teuthology_path means "use the local checkout, don't fetch".
    if teuth_config.teuthology_path is None:
        fetch_teuthology('master')
    fetch_qa_suite('master')

    keep_running = True
    while keep_running:
        # Check to see if we have a teuthology-results process hanging around
        # and if so, read its return code so that it can exit.
        if result_proc is not None and result_proc.poll() is not None:
            log.debug("teuthology-results exited with code: %s",
                      result_proc.returncode)
            result_proc = None

        # Sentinel files let an operator restart or stop the dispatcher
        # cleanly between jobs.
        if sentinel(restart_file_path):
            restart()
        elif sentinel(stop_file_path):
            stop()

        # Re-read config each iteration so changes take effect without a
        # restart.
        load_config()

        job = connection.reserve(timeout=60)
        if job is None:
            continue

        # bury the job so it won't be re-run if it fails
        job.bury()
        job_id = job.jid
        log.info('Reserved job %d', job_id)
        log.info('Config is: %s', job.body)
        job_config = yaml.safe_load(job.body)
        job_config['job_id'] = str(job_id)

        # A stop_worker job means: dispatch this one last job, then exit
        # the loop.
        if job_config.get('stop_worker'):
            keep_running = False

        try:
            job_config, teuth_bin_path = prep_job(
                job_config,
                log_file_path,
                archive_dir,
            )
        except SkipJob:
            # prep_job already reported the job as dead; move on.
            continue

        # lock machines but do not reimage them
        if 'roles' in job_config:
            job_config = lock_machines(job_config)

        run_args = [
            os.path.join(teuth_bin_path, 'teuthology-dispatcher'),
            '--supervisor',
            '-v',
            '--bin-path', teuth_bin_path,
            '--archive-dir', archive_dir,
        ]

        # Create run archive directory if not already created and
        # job's archive directory
        create_job_archive(job_config['name'],
                           job_config['archive_path'],
                           archive_dir)
        job_config_path = os.path.join(job_config['archive_path'],
                                       'orig.config.yaml')

        # Write initial job config in job archive dir
        with open(job_config_path, 'w') as f:
            yaml.safe_dump(job_config, f, default_flow_style=False)

        run_args.extend(["--job-config", job_config_path])

        try:
            job_proc = subprocess.Popen(run_args)
            log.info('Job supervisor PID: %s', job_proc.pid)
        except Exception:
            error_message = "Saw error while trying to spawn supervisor."
            log.exception(error_message)
            # Release any locked targets before reporting failure.
            if 'targets' in job_config:
                nuke(supervisor.create_fake_context(job_config), True)
            report.try_push_job_info(
                job_config,
                dict(status='fail', failure_reason=error_message))

        # This try/except block is to keep the worker from dying when
        # beanstalkc throws a SocketError
        try:
            job.delete()
        except Exception:
            log.exception("Saw exception while trying to delete job")