def run_with_watchdog(process, job_config):
    """
    Poll a running job's process, killing it if it exceeds the global
    maximum job run time, and keep paddles informed of its progress.

    While the process is alive, every ``teuth_config.watchdog_interval``
    seconds this: checks elapsed run time against
    ``teuth_config.max_job_time`` and, on timeout, kills the job (first
    preserving logs/coredumps, then transferring archives, then killing
    again to unlock machines); and pings paddles to refresh the job's
    updated time. After the process exits, pushes a final 'dead' status
    (ignored by paddles if a pass/fail was already reported), including a
    failure reason if the watchdog timeout was hit.

    :param process: the job's process; must expose ``poll()`` returning
                    None while still running (e.g. subprocess.Popen)
    :param job_config: dict with at least 'name', 'job_id', 'owner', and
                       optionally 'teuthology_branch'
    :raises AssertionError: if the job ran on a no-longer-supported
                            teuthology branch
    """
    job_start_time = datetime.utcnow()
    # Only push the information that's relevant to the watchdog, to save
    # db load.
    job_info = dict(
        name=job_config['name'],
        job_id=job_config['job_id'],
    )
    # Sleep once outside of the loop to avoid double-posting jobs.
    time.sleep(teuth_config.watchdog_interval)
    hit_max_timeout = False
    while process.poll() is None:
        # Kill jobs that have been running longer than the global max.
        run_time = datetime.utcnow() - job_start_time
        total_seconds = run_time.total_seconds()
        if total_seconds > teuth_config.max_job_time:
            hit_max_timeout = True
            log.warning("Job ran longer than {max}s. Killing...".format(
                max=teuth_config.max_job_time))
            # Each cleanup step is best-effort: a failure in one must not
            # prevent the later steps from running, so log and continue.
            try:
                # Kill processes but do not unlock yet so we can save
                # the logs, coredumps, etc.
                kill_job(job_info['name'], job_info['job_id'],
                         teuth_config.archive_base, job_config['owner'],
                         save_logs=True)
            except Exception:
                log.exception('Failed to kill job')
            try:
                transfer_archives(job_info['name'], job_info['job_id'],
                                  teuth_config.archive_base, job_config)
            except Exception:
                log.exception('Could not save logs')
            try:
                # This time remove everything and unlock the machines.
                kill_job(job_info['name'], job_info['job_id'],
                         teuth_config.archive_base, job_config['owner'])
            except Exception:
                log.exception('Failed to kill job and unlock machines')
        # Calling this without a status just updates the job's updated time.
        report.try_push_job_info(job_info)
        time.sleep(teuth_config.watchdog_interval)

    # We no longer support testing these old branches. Use an explicit
    # check rather than an `assert` statement, which is silently stripped
    # when running under `python -O` and would make this guard a no-op.
    if job_config.get('teuthology_branch') in (
            'argonaut', 'bobtail', 'cuttlefish', 'dumpling'):
        raise AssertionError(
            'testing branch %s is no longer supported' %
            job_config.get('teuthology_branch'))

    # Let's make sure that paddles knows the job is finished. We don't know
    # the status, but if it was a pass or fail it will have already been
    # reported to paddles. In that case paddles ignores the 'dead' status.
    # If the job was killed, paddles will use the 'dead' status.
    extra_info = dict(status='dead')
    if hit_max_timeout:
        extra_info['failure_reason'] = 'hit max job timeout'
    report.try_push_job_info(job_info, extra_info)
def run_with_watchdog(process, job_config):
    """
    Poll a running job's process, killing it if it exceeds the global
    maximum job run time, and keep paddles informed of its progress.

    While the process is alive, every ``teuth_config.watchdog_interval``
    seconds this checks elapsed run time against
    ``teuth_config.max_job_time`` (killing the job on timeout) and pings
    paddles to refresh the job's updated time. After the process exits,
    the job's result is reported: via the ``teuthology-report`` command
    for old branches that lack the reporting feature, or by pushing a
    'dead' status directly otherwise.

    :param process: the job's process; must expose ``poll()`` returning
                    None while still running (e.g. subprocess.Popen)
    :param job_config: dict with at least 'name', 'job_id', 'owner',
                       'worker_log', 'archive_path', and optionally
                       'teuthology_branch'
    """
    job_start_time = datetime.utcnow()
    # Only push the information that's relevant to the watchdog, to save db
    # load
    job_info = dict(
        name=job_config['name'],
        job_id=job_config['job_id'],
    )
    # Sleep once outside of the loop to avoid double-posting jobs
    time.sleep(teuth_config.watchdog_interval)
    symlink_worker_log(job_config['worker_log'], job_config['archive_path'])
    while process.poll() is None:
        # Kill jobs that have been running longer than the global max
        run_time = datetime.utcnow() - job_start_time
        # Manual day/second arithmetic; note this drops microseconds,
        # which is harmless at watchdog granularity.
        total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
        if total_seconds > teuth_config.max_job_time:
            log.warning("Job ran longer than {max}s. Killing...".format(
                max=teuth_config.max_job_time))
            kill_job(job_info['name'], job_info['job_id'],
                     teuth_config.archive_base, job_config['owner'])
        # calling this without a status just updates the jobs updated time
        report.try_push_job_info(job_info)
        time.sleep(teuth_config.watchdog_interval)

    # The job finished. Let's make sure paddles knows.
    branches_sans_reporting = ('argonaut', 'bobtail', 'cuttlefish',
                               'dumpling')
    if job_config.get('teuthology_branch') in branches_sans_reporting:
        # The job ran with a teuthology branch that may not have the
        # reporting feature. Let's call teuthology-report (which will be
        # from the master branch) to report the job manually.
        # NOTE(review): run_name is interpolated into a shell=True command
        # string — confirm job names can never contain shell metacharacters.
        cmd = "teuthology-report -v -D -r {run_name} -j {job_id}".format(
            run_name=job_info['name'],
            job_id=job_info['job_id'])
        try:
            log.info("Executing %s" % cmd)
            report_proc = subprocess.Popen(cmd, shell=True,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.STDOUT)
            # Relay the child's combined stdout/stderr into our log until
            # it exits.
            # NOTE(review): readlines() blocks until the pipe hits EOF, so
            # the poll()/sleep loop likely makes a single pass — confirm
            # whether incremental output relay was intended here.
            while report_proc.poll() is None:
                for line in report_proc.stdout.readlines():
                    log.info(line.strip())
                time.sleep(1)
            log.info("Reported results via the teuthology-report command")
        except Exception:
            log.exception("teuthology-report failed")
    else:
        # Let's make sure that paddles knows the job is finished. We don't
        # know the status, but if it was a pass or fail it will have
        # already been reported to paddles. In that case paddles ignores
        # the 'dead' status. If the job was killed, paddles will use the
        # 'dead' status.
        report.try_push_job_info(job_info, dict(status='dead'))
def run_with_watchdog(process, job_config):
    """
    Poll a running job's process, killing it if it exceeds the global
    maximum job run time, and keep paddles informed of its progress.

    While the process is alive, every ``teuth_config.watchdog_interval``
    seconds this checks elapsed run time against
    ``teuth_config.max_job_time`` (killing the job on timeout) and pings
    paddles to refresh the job's updated time. After the process exits,
    a final 'dead' status is pushed; paddles ignores it if a pass/fail
    was already reported, and uses it if the job was killed.

    :param process: the job's process; must expose ``poll()`` returning
                    None while still running (e.g. subprocess.Popen)
    :param job_config: dict with at least 'name', 'job_id', 'owner',
                       'worker_log', 'archive_path', and optionally
                       'teuthology_branch'
    :raises AssertionError: if the job ran on a no-longer-supported
                            teuthology branch
    """
    job_start_time = datetime.utcnow()
    # Only push the information that's relevant to the watchdog, to save
    # db load.
    job_info = dict(
        name=job_config['name'],
        job_id=job_config['job_id'],
    )
    # Sleep once outside of the loop to avoid double-posting jobs.
    time.sleep(teuth_config.watchdog_interval)
    symlink_worker_log(job_config['worker_log'], job_config['archive_path'])
    while process.poll() is None:
        # Kill jobs that have been running longer than the global max.
        run_time = datetime.utcnow() - job_start_time
        total_seconds = run_time.total_seconds()
        if total_seconds > teuth_config.max_job_time:
            log.warning("Job ran longer than {max}s. Killing...".format(
                max=teuth_config.max_job_time))
            kill_job(job_info['name'], job_info['job_id'],
                     teuth_config.archive_base, job_config['owner'])
        # Calling this without a status just updates the job's updated time.
        report.try_push_job_info(job_info)
        time.sleep(teuth_config.watchdog_interval)

    # We no longer support testing these old branches. Use an explicit
    # check rather than an `assert` statement, which is silently stripped
    # when running under `python -O` and would make this guard a no-op.
    if job_config.get('teuthology_branch') in (
            'argonaut', 'bobtail', 'cuttlefish', 'dumpling'):
        raise AssertionError(
            'testing branch %s is no longer supported' %
            job_config.get('teuthology_branch'))

    # Let's make sure that paddles knows the job is finished. We don't know
    # the status, but if it was a pass or fail it will have already been
    # reported to paddles. In that case paddles ignores the 'dead' status.
    # If the job was killed, paddles will use the 'dead' status.
    report.try_push_job_info(job_info, dict(status='dead'))