def check_logs(job_defs):
    """Check each tracked job's log for new output and detect stalls.

    Updates the module-level `job_log_dict`, which maps a job id to its
    accumulated log lines and the time the log last changed.

    Parameters
    ----------
    job_defs : iterable of dict
        Job definitions; each must carry 'jobId' and 'jobName'.

    Returns
    -------
    set
        The job ids whose logs have been unchanged for longer than the
        module-level `idle_log_timeout` (in seconds).
    """
    stalled_jobs = set()
    for job_def in job_defs:
        try:
            log_lines = get_job_log(job_def, write_file=False)
            jid = job_def['jobId']
            now = datetime.now()
            if jid not in job_log_dict:
                # First sighting of this job: start tracking its log.
                logger.info("Adding job %s to the log tracker at %s."
                            % (jid, now))
                job_log_dict[jid] = {'log': log_lines, 'check_time': now}
            elif len(job_log_dict[jid]['log']) == len(log_lines):
                # No new lines since the last check; warn, and flag the job
                # as stalled once it exceeds the idle timeout.
                check_dt = now - job_log_dict[jid]['check_time']
                logger.warning(('Job \'%s\' has not produced output for '
                                '%d seconds.') % (job_def['jobName'],
                                                  check_dt.seconds))
                if check_dt.seconds > idle_log_timeout:
                    logger.warning("Job \'%s\' has stalled."
                                   % job_def['jobName'])
                    stalled_jobs.add(jid)
            else:
                # Fresh output arrived: append the new lines and reset the
                # idle clock.
                old_log = job_log_dict[jid]['log']
                old_log += log_lines[len(old_log):]
                job_log_dict[jid]['check_time'] = now
        except Exception as e:
            # Transient sync issues can break a single check; log it and
            # keep checking the remaining jobs.
            logger.error("Failed to check log for: %s" % str(job_def))
            logger.exception(e)
    return stalled_jobs
def stash_logs(job_defs, success_jobs, failure_jobs, queue_name,
               method='local', job_name_prefix=None, tag='stash',
               ids_stashed=None):
    """Save the logs of completed (succeeded or failed) jobs.

    Parameters
    ----------
    job_defs : dict
        Maps a job id to a (tuple-like) job definition.
    success_jobs, failure_jobs : iterable
        Jobs that finished successfully / unsuccessfully, passed through
        `_get_job_ids_to_stash` to get the ids not yet stashed.
    queue_name : str
        Name of the batch queue, used in the S3 key layout.
    method : str
        Either 's3' or 'local'; anything else raises ValueError.
    job_name_prefix : str or None
        Prefix for the S3 key or the local log directory; defaults to
        'batch_<tag>' for local stashing.
    tag : str
        Label appended to S3 log file names.
    ids_stashed : set or None
        Ids already stashed; updated in place so callers can skip them
        on subsequent calls.
    """
    if ids_stashed is None:
        ids_stashed = set()
    success_ids = _get_job_ids_to_stash(success_jobs, ids_stashed)
    failure_ids = _get_job_ids_to_stash(failure_jobs, ids_stashed)
    if method == 's3':
        s3_client = boto3.client('s3')

        def stash_log(log_str, name_base):
            name = '%s_%s.log' % (name_base, tag)
            s3_client.put_object(
                Bucket=bucket_name,
                Key='reading_results/%s/logs/%s/%s' % (job_name_prefix,
                                                       queue_name, name),
                Body=log_str)
    elif method == 'local':
        if job_name_prefix is None:
            job_name_prefix = 'batch_%s' % tag
        dirname = '%s_job_logs' % job_name_prefix
        # Tolerate a directory left over from a previous stash pass;
        # plain os.mkdir would raise FileExistsError on re-runs.
        os.makedirs(dirname, exist_ok=True)

        def stash_log(log_str, name_base):
            with open(os.path.join(dirname, name_base + '.log'), 'w') as f:
                f.write(log_str)
    else:
        raise ValueError('Invalid method: %s' % method)

    for jobId, job_def_tpl in job_defs.items():
        if jobId not in success_ids and jobId not in failure_ids:
            continue  # Logs aren't done and ready to be loaded.
        try:
            job_def = dict(job_def_tpl)
            lines = get_job_log(job_def, write_file=False)
            if lines is None:
                logger.warning("No logs found for %s." % job_def['jobName'])
                continue
            log_str = ''.join(lines)
            base_name = job_def['jobName']
            if job_def['jobId'] in success_ids:
                base_name += '/SUCCESS'
            elif job_def['jobId'] in failure_ids:
                base_name += '/FAILED'
            else:
                logger.error("Job cannot be logged unless completed.")
                continue
            logger.info('Stashing ' + base_name)
            stash_log(log_str, base_name)
        except Exception as e:
            # Best-effort: a single bad job shouldn't stop the rest.
            logger.error("Failed to save logs for: %s" % str(job_def_tpl))
            logger.exception(e)
    # Record everything we attempted so callers can skip them next time.
    ids_stashed |= set(success_ids) | set(failure_ids)
    return
def stash_logs(job_defs, success_jobs, failure_jobs, queue_name,
               method='local', job_name_prefix=None, tag='stash',
               ids_stashed=None):
    """Stash the logs of finished jobs via the chosen backend.

    `method` selects 's3' or 'local' storage; anything else raises
    ValueError. `ids_stashed` is updated in place with every id whose
    log was considered, so repeated calls skip already-stashed jobs.
    """
    ids_stashed = set() if ids_stashed is None else ids_stashed
    success_ids = _get_job_ids_to_stash(success_jobs, ids_stashed)
    failure_ids = _get_job_ids_to_stash(failure_jobs, ids_stashed)

    # Build the log-writing callable for the requested backend.
    if method == 's3':
        s3_client = boto3.client('s3')

        def stash_log(log_str, name_base):
            name = '%s_%s.log' % (name_base, tag)
            key = 'reading_results/%s/logs/%s/%s' % (job_name_prefix,
                                                     queue_name, name)
            s3_client.put_object(Bucket=bucket_name, Key=key, Body=log_str)
    elif method == 'local':
        if job_name_prefix is None:
            job_name_prefix = 'batch_%s' % tag
        dirname = '%s_job_logs' % job_name_prefix
        os.mkdir(dirname)

        def stash_log(log_str, name_base):
            log_path = os.path.join(dirname, name_base + '.log')
            with open(log_path, 'w') as f:
                f.write(log_str)
    else:
        raise ValueError('Invalid method: %s' % method)

    for job_id, job_def_tpl in job_defs.items():
        finished = job_id in success_ids or job_id in failure_ids
        if not finished:
            continue  # Logs aren't done and ready to be loaded.
        try:
            job_def = dict(job_def_tpl)
            lines = get_job_log(job_def, write_file=False)
            if lines is None:
                logger.warning("No logs found for %s." % job_def['jobName'])
                continue
            log_str = ''.join(lines)
            base_name = job_def['jobName']
            if job_def['jobId'] in success_ids:
                base_name += '/SUCCESS'
            elif job_def['jobId'] in failure_ids:
                base_name += '/FAILED'
            else:
                logger.error("Job cannot be logged unless completed.")
                continue
            logger.info('Stashing ' + base_name)
            stash_log(log_str, base_name)
        except Exception as e:
            # Keep going: one bad job shouldn't block the rest.
            logger.error("Failed to save logs for: %s" % str(job_def_tpl))
            logger.exception(e)
    ids_stashed |= {jid for jids in [success_ids, failure_ids]
                    for jid in jids}
    return
def check_logs(job_defs):
    """Update the job_log_dict and return the set of stalled job ids.

    For every job in `job_defs`, pull the latest log lines and compare
    against the tracked copy in the module-level `job_log_dict`. A job
    whose log has not grown for more than `idle_log_timeout` seconds is
    reported as stalled.
    """
    stalled_jobs = set()
    # Check the status of all the jobs we're tracking.
    for job_def in job_defs:
        try:
            # Get the logs for this job.
            log_lines = get_job_log(job_def, write_file=False)

            # Get the job id.
            jid = job_def['jobId']

            now = datetime.now()
            if jid not in job_log_dict:
                # If the job is new, start tracking it.
                logger.info("Adding job %s to the log tracker at %s."
                            % (jid, now))
                job_log_dict[jid] = {
                    'log': log_lines,
                    'last change time': now
                }
            elif len(job_log_dict[jid]['log']) == len(log_lines):
                # If the job log hasn't changed, announce as such, and
                # check to see if it has been the same for longer than
                # stall time.
                check_dt = now - job_log_dict[jid]['last change time']
                logger.warning(('Job \'%s\' has not produced output for '
                                '%d seconds.') % (job_def['jobName'],
                                                  check_dt.seconds))
                if check_dt.seconds > idle_log_timeout:
                    logger.warning("Job \'%s\' has stalled."
                                   % job_def['jobName'])
                    stalled_jobs.add(jid)
            else:
                # If the job is known, and the logs have changed, update
                # the stored log and the "last change time".
                old_log = job_log_dict[jid]['log']
                old_log += log_lines[len(old_log):]
                job_log_dict[jid]['last change time'] = now
        except Exception as e:
            # Sometimes due to sync et al. issues, a part of this will fail.
            # Such things are usually transitory issues so we keep trying.
            logger.error("Failed to check log for: %s" % str(job_def))
            logger.exception(e)

    # Pass up the set of job id's for stalled jobs.
    return stalled_jobs
def stash_logs(job_defs, success_ids, failure_ids, queue_name,
               method='local', job_name_prefix=None, tag='stash'):
    """Save the logs of completed jobs either locally or to S3.

    Parameters
    ----------
    job_defs : iterable
        (Tuple-like) job definitions carrying 'jobId' and 'jobName'.
    success_ids, failure_ids : container
        Ids of jobs that succeeded / failed; used to label the log name.
    queue_name : str
        Batch queue name, used in the S3 key layout.
    method : str
        Either 's3' or 'local'.
    job_name_prefix : str or None
        Prefix for the S3 key or local log directory; defaults to
        'batch_<tag>' for local stashing.
    tag : str
        Label appended to S3 log file names.

    Raises
    ------
    ValueError
        If `method` is neither 's3' nor 'local'.
    """
    if method == 's3':
        s3_client = boto3.client('s3')

        def stash_log(log_str, name_base):
            name = '%s_%s.log' % (name_base, tag)
            s3_client.put_object(
                Bucket=bucket_name,
                Key='reading_results/%s/logs/%s/%s' % (job_name_prefix,
                                                       queue_name, name),
                Body=log_str)
    elif method == 'local':
        if job_name_prefix is None:
            job_name_prefix = 'batch_%s' % tag
        dirname = '%s_job_logs' % job_name_prefix
        os.mkdir(dirname)

        def stash_log(log_str, name_base):
            with open(os.path.join(dirname, name_base + '.log'), 'w') as f:
                f.write(log_str)
    else:
        # Fail fast: without this, an invalid method left `stash_log`
        # undefined and the loop below died with a NameError.
        raise ValueError('Invalid method: %s' % method)

    for job_def_tpl in job_defs:
        try:
            job_def = dict(job_def_tpl)
            lines = get_job_log(job_def, write_file=False)
            if lines is None:
                logger.warning("No logs found for %s." % job_def['jobName'])
                continue
            log_str = ''.join(lines)
            base_name = job_def['jobName']
            if job_def['jobId'] in success_ids:
                base_name += '_SUCCESS'
            elif job_def['jobId'] in failure_ids:
                base_name += '_FAILED'
            logger.info('Stashing ' + base_name)
            stash_log(log_str, base_name)
        except Exception as e:
            # Best-effort: a single bad job shouldn't stop the rest.
            logger.error("Failed to save logs for: %s" % str(job_def_tpl))
            logger.exception(e)
    return
def check_logs(job_defs):
    """Update the job_log_dict and report which jobs appear stalled."""
    stalled_jobs = set()
    for job_def in job_defs:
        try:
            # Fetch the latest log and look up our tracking entry.
            log_lines = get_job_log(job_def, write_file=False)
            jid = job_def['jobId']
            now = datetime.now()
            entry = job_log_dict.get(jid)
            if entry is None:
                # First time we've seen this job: start tracking it.
                logger.info("Adding job %s to the log tracker at %s."
                            % (jid, now))
                job_log_dict[jid] = {'log': log_lines,
                                     'last change time': now}
            elif len(entry['log']) == len(log_lines):
                # No new output; warn, and flag the job as stalled once
                # it has been idle longer than the timeout.
                check_dt = now - entry['last change time']
                logger.warning(('Job \'%s\' has not produced output for '
                                '%d seconds.') % (job_def['jobName'],
                                                  check_dt.seconds))
                if check_dt.seconds > idle_log_timeout:
                    logger.warning("Job \'%s\' has stalled."
                                   % job_def['jobName'])
                    stalled_jobs.add(jid)
            else:
                # Fresh output arrived: extend the stored log and reset
                # the change timestamp.
                entry['log'] += log_lines[len(entry['log']):]
                entry['last change time'] = now
        except Exception as e:
            # Transient sync issues happen; log and keep checking the rest.
            logger.error("Failed to check log for: %s" % str(job_def))
            logger.exception(e)
    return stalled_jobs
def check_logs(job_defs):
    """Track each job's log growth and return the ids of stalled jobs.

    Updates the module-level `job_log_dict` (job id -> stored log lines
    and the time of the last change). A job whose log has not grown for
    more than `idle_log_timeout` seconds is reported as stalled.
    """
    stalled_jobs = set()
    for job_def in job_defs:
        log_lines = get_job_log(job_def, write_file=False)
        jid = job_def['jobId']
        now = datetime.now()
        if jid not in job_log_dict:
            # New job: begin tracking its log.
            job_log_dict[jid] = {'log': log_lines, 'check_time': now}
        elif len(job_log_dict[jid]['log']) == len(log_lines):
            # Log length unchanged; flag the job once past the idle timeout.
            check_dt = now - job_log_dict[jid]['check_time']
            if check_dt.seconds > idle_log_timeout:
                logger.warning(('Job \'%s\' has not produced output for '
                                '%d seconds.') % (job_def['jobName'],
                                                  check_dt.seconds))
                stalled_jobs.add(jid)
        else:
            # New output arrived: store it and reset the idle clock.
            # Previously check_time was never refreshed here, so stall
            # time was measured from first sighting rather than from the
            # last change, falsely flagging long-running active jobs.
            old_log = job_log_dict[jid]['log']
            old_log += log_lines[len(old_log):]
            job_log_dict[jid]['check_time'] = now
    return stalled_jobs