def test_get_url_from_identifier_job(self):
    """A namespaced job identifier resolves to the job URL and the job type."""
    resolved = get_object_type_from_identifier(self.index, 'MASTER.namea')

    assert_equal(resolved.url, '/api/jobs/MASTER.namea')
    assert_equal(resolved.type, TronObjectType.job)
def test_get_url_from_identifier_action_run(self):
    """A four-part identifier resolves to the action run URL and type."""
    resolved = get_object_type_from_identifier(self.index, 'MASTER.nameb.7.run')

    assert_equal(resolved.url, '/api/jobs/MASTER.nameb/7/run')
    assert_equal(resolved.type, TronObjectType.action_run)
def compute_check_result_for_job(client, job):
    """Assemble the check-result keyword arguments for a single job.

    The result starts from a fixed ``name``/``source`` pair and is overlaid
    with the job's ``monitoring`` mapping. ``realert_every`` is filled in via
    ``guess_realert_every`` only when monitoring did not provide it, and
    ``check_every`` is always derived from ``_run_interval``.

    Args:
        client: Object providing ``index()`` and ``job(url, ...)``.
        job: Mapping from which ``name``, ``status`` and ``monitoring`` are read.

    Returns:
        dict: The assembled keyword arguments. A disabled job gets an OK
        ``output`` and ``status`` 0; otherwise the dict is updated with
        whatever ``compute_check_result_for_job_runs`` returns.
    """
    check = dict(
        name="check_tron_job.{}".format(job['name']),
        source="tron",
    )
    check.update(job['monitoring'])
    if 'realert_every' not in check:
        check['realert_every'] = guess_realert_every(job)
    check['check_every'] = "{}s".format(_run_interval)

    if job["status"] == "disabled":
        check["output"] = "OK: {} is disabled and won't be checked.".format(
            job['name'],
        )
        check["status"] = 0
        log.info(check["output"])
        return check

    # The job is not disabled, therefore we have to look at its run history
    tron_id = get_object_type_from_identifier(client.index(), job["name"])
    job_content = client.job(
        tron_id.url,
        count=20,
        include_action_runs=True,
    )
    check.update(
        compute_check_result_for_job_runs(
            job=job,
            job_content=job_content,
            client=client,
        )
    )
    # Only the first line of the (possibly long) output goes to the log.
    first_line = check["output"].split("\n")[0]
    log.info(first_line)
    return check
def compute_check_result_for_job(client, job):
    """Build the list of check-result keyword dicts for a single job.

    A disabled job yields one OK result. An enabled job has its runs fetched
    through ``client``; when the job's monitoring config enables
    ``PRECIOUS_JOB_ATTR`` the runs are grouped with
    ``sort_runs_by_interval(..., interval='day')`` and one result is produced
    per group, with the group key appended to the check name. Otherwise a
    single result covering all runs is produced.

    Args:
        client: Object providing ``index()`` and ``job(url, ...)``.
        job: Mapping from which ``name``, ``status`` and ``monitoring`` are read.

    Returns:
        list[dict]: One plain dict of keyword arguments per produced result.
    """
    kwargs = m(
        name="check_tron_job.{}".format(job['name']),
        source="tron",
    )
    monitoring = job['monitoring']

    # Consult the monitoring config itself: at this point ``kwargs`` only
    # holds ``name`` and ``source``, so testing ``kwargs`` would always
    # trigger the guess, even when the config provides its own value (which
    # is merged in below and wins either way).
    if 'realert_every' not in monitoring:
        kwargs = kwargs.set('realert_every', guess_realert_every(job))
    kwargs = kwargs.set('check_every', f"{_run_interval}s")

    # We want to prevent a monitoring config from setting the check_every
    # attribute, since one config should not dictate how often this script runs
    sensu_kwargs = (
        pmap(monitoring)
        .discard(PRECIOUS_JOB_ATTR)
        .discard('check_every')
    )
    kwargs = kwargs.update(sensu_kwargs)

    kwargs_list = []
    if job["status"] == "disabled":
        kwargs = kwargs.set(
            'output',
            f"OK: {job['name']} is disabled and won't be checked.",
        )
        kwargs = kwargs.set('status', 0)
        kwargs_list.append(kwargs)
    else:
        # The job is not disabled, therefore we have to look at its run history
        url_index = client.index()
        tron_id = get_object_type_from_identifier(url_index, job["name"])
        job_content = pmap(
            client.job(
                tron_id.url,
                include_action_runs=True,
            )
        )

        if monitoring.get(PRECIOUS_JOB_ATTR, False):
            dated_runs = sort_runs_by_interval(job_content, interval='day')
        else:
            # A single undated bucket holding every run.
            dated_runs = {'': job_content['runs']}

        for date, runs in dated_runs.items():
            results = compute_check_result_for_job_runs(
                job=job,
                job_content=job_content.set('runs', runs),
                client=client,
            )
            dated_kwargs = kwargs.update(results)
            if date:  # if empty date, leave job name alone
                dated_kwargs = dated_kwargs.set(
                    'name',
                    f"{kwargs['name']}-{date}",
                )
            kwargs_list.append(dated_kwargs)

    # Callers receive ordinary dicts rather than persistent maps.
    return [dict(kws) for kws in kwargs_list]
def test_get_url_from_identifier_service_instance(self):
    """A three-part service identifier resolves to the service instance."""
    resolved = get_object_type_from_identifier(self.index, 'MASTER.foo.1')

    assert_equal(resolved.url, '/api/services/MASTER.foo/1')
    assert_equal(resolved.type, TronObjectType.service_instance)
def test_get_url_from_identifier_service_no_namespace(self):
    """A bare service name resolves to the MASTER-namespaced service URL."""
    resolved = get_object_type_from_identifier(self.index, 'foo')

    assert_equal(resolved.url, '/api/services/MASTER.foo')
    assert_equal(resolved.type, TronObjectType.service)
def compute_check_result_for_job_runs(client, job, job_content, url_index, hide_stderr=False):
    """Turn a job's run history into a check result.

    Args:
        client: Provides ``cluster_name`` and ``action_runs(url, num_lines=...)``.
        job: Job mapping; ``name`` and ``monitoring`` are read.
        job_content: Job details including its runs, or ``None`` when there is
            nothing to inspect yet.
        url_index: Index passed through to ``get_object_type_from_identifier``.
        hide_stderr: When true, the stderr section of the report is blanked.

    Returns:
        dict: ``output`` (the human-readable report) and ``status``
        (0 for OK, 1 for WARN, 2 for CRIT, 3 for UNKNOWN).
    """
    cluster = client.cluster_name

    if job_content is None:
        return {
            "output": f"OK: {job['name']} was just added and hasn't run yet on {cluster}.",
            "status": 0,
        }

    relevant_job_run, last_state = get_relevant_run_and_state(job_content)
    if relevant_job_run is None:
        return {
            "output": (
                f"CRIT: {job['name']} hasn't had a successful "
                f"run yet on {cluster}.\n{pretty_print_job(job_content)}"
            ),
            "status": 2,
        }

    # if no run scheduled, no run_time available
    run_date = _timestamp_to_shortdate(relevant_job_run['run_time'])

    # A job_run is like MASTER.foo.1
    job_run_id = relevant_job_run['id']

    # A job action is like MASTER.foo.1.step1
    expected_runtimes = job_content.get('actions_expected_runtime', {})
    relevant_action = get_relevant_action(
        action_runs=relevant_job_run["runs"],
        last_state=last_state,
        actions_expected_runtime=expected_runtimes,
    )
    action_run_id = get_object_type_from_identifier(
        url_index,
        relevant_action['id'],
    )

    # Action run details are only fetched for the problem states.
    wants_details = last_state in (State.STUCK, State.FAILED, State.UNKNOWN)
    action_run_details = (
        client.action_runs(action_run_id.url, num_lines=10)
        if wants_details
        else {}
    )

    # Only the STUCK and FAILED branches replace this default.
    stderr = ""
    if last_state == State.SUCCEEDED:
        status = 0
        prefix = f"OK: The last job ({job_run_id}) run succeeded on {cluster}. Will watch future or in progress runs for the next failure"
    elif last_state == State.NO_RUNS_TO_CHECK:
        status = 0
        prefix = f"OK: The job {job['name']} is new and/or has no runs to check on {cluster}"
    elif last_state == State.SKIPPED:
        status = 0
        prefix = f"OK: The last job ({job_run_id}) run was skipped on {cluster}. Will watch future or in progress runs for the next failure"
    elif last_state == State.STUCK:
        pages = job['monitoring'].get("page_for_expected_runtime", False)
        level, status = ("CRIT", 2) if pages else ("WARN", 1)
        prefix = f"{level}: Job {job_run_id} exceeded expected runtime or still running when next job is scheduled on {cluster}"
        stderr = '\n'.join(
            action_run_details.get('stderr', ["(No stderr available)"])
        )
    elif last_state == State.FAILED:
        status = 2
        prefix = f"CRIT: The last job run ({job_run_id}) failed on {cluster}!"
        stderr = '\n'.join(
            action_run_details.get('stderr', ["(No stderr available)"])
        )
    elif last_state == State.UNKNOWN:
        status = 2
        prefix = f"CRIT: Job {job_run_id} has gone 'unknown' and might need manual intervention on {cluster}"
    else:
        status = 3
        prefix = f"UNKNOWN: Job {job_run_id} is in a state that check_tron_jobs doesn't understand"

    if hide_stderr:
        stderr = ""

    is_precious = job['monitoring'].get(PRECIOUS_JOB_ATTR, False)
    precious_runs_note = (
        f"Note: This alert is the run for {run_date}. A resolve event will not occur until a job run for this date succeeds.\n"
        if is_precious and status != 0
        else ''
    )

    # Sections are appended in the order they appear in the final report.
    sections = [
        f"{prefix}\n"
        f"{stderr}\n"
        f"The latest run, {relevant_job_run['id']} {relevant_job_run['state']}\n"
        f"{precious_runs_note}"
    ]
    if action_run_details:
        sections.append(
            "\nHere is the last action:\n"
            f"{pretty_print_actions(action_run_details)}\n\n"
        )
    sections.append(
        "And the job run view:\n"
        f"{pretty_print_job_run(relevant_job_run)}\n\n"
        "Here is the whole job view for context:\n"
        f"{pretty_print_job(job_content)}"
    )

    return {"output": "".join(sections), "status": status}
def test_get_url_from_identifier_job_no_namespace(self):
    """A bare job name resolves to the MASTER-namespaced job entry."""
    resolved = get_object_type_from_identifier(self.index, 'namea')

    expected_url = self.index['jobs']['MASTER.namea'] + '/'
    assert_equal(resolved.url, expected_url)
    assert_equal(resolved.type, TronObjectType.job)
    assert_equal(resolved.name, 'MASTER.namea')
def compute_check_result_for_job_runs(client, job, job_content):
    """Turn a job's run history into a check result.

    Args:
        client: Provides ``index()``, ``job(url, ...)`` and
            ``action_runs(url, num_lines=...)``.
        job: Job mapping; ``name`` and ``monitoring`` are read.
        job_content: Job details including its runs, or ``None`` when there is
            nothing to inspect yet.

    Returns:
        dict: ``output`` (the human-readable report) and ``status``
        (0 for OK, 1 for WARN, 2 for CRIT, 3 for UNKNOWN).
    """
    url_index = client.index()

    if job_content is None:
        return {
            "output": "OK: {} was just added and hasn't run yet.".format(
                job['name'],
            ),
            "status": 0,
        }

    relevant_job_run, last_state = get_relevant_run_and_state(job_content)
    if relevant_job_run is None:
        return {
            "output": (
                f"CRIT: {job['name']} hasn't had a successful "
                f"run yet.\n{pretty_print_job(job_content)}"
            ),
            "status": 2,
        }

    # if no run scheduled, no run_time available
    run_date = _timestamp_to_shortdate(relevant_job_run['run_time'])

    # A job_run is like MASTER.foo.1
    job_run_id = get_object_type_from_identifier(
        url_index,
        relevant_job_run['id'],
    )
    action_runs = client.job(job_run_id.url, include_action_runs=True)

    # A job action is like MASTER.foo.1.step1
    expected_runtimes = job_content.get('actions_expected_runtime', {})
    relevant_action = get_relevant_action(
        action_runs=action_runs["runs"],
        last_state=last_state,
        actions_expected_runtime=expected_runtimes,
    )
    action_run_id = get_object_type_from_identifier(
        url_index,
        relevant_action['id'],
    )
    action_run_details = client.action_runs(action_run_id.url, num_lines=10)

    # (state, headline, status) triples, checked in order; first match wins.
    outcomes = (
        (State.SUCCEEDED, "OK: The last job run succeeded", 0),
        (State.NO_RUNS_TO_CHECK, "OK: The job is 'new' and/or has no runs to check", 0),
        (State.SKIPPED, "OK: The last job run was skipped", 0),
        (State.STUCK, "WARN: Job exceeded expected runtime or still running when next job is scheduled", 1),
        (State.FAILED, "CRIT: The last job run failed!", 2),
        (State.UNKNOWN, "CRIT: Job has gone 'unknown' and might need manual intervention", 2),
    )
    prefix, status = next(
        (
            (headline, code)
            for state, headline, code in outcomes
            if last_state == state
        ),
        ("UNKNOWN: The job is in a state that check_tron_jobs doesn't understand", 3),
    )

    is_precious = job['monitoring'].get(PRECIOUS_JOB_ATTR, False)
    precious_runs_note = (
        f"Note: This alert is the run for {run_date}. A resolve event will not occur until a job run for this date succeeds.\n"
        if is_precious and status != 0
        else ''
    )

    report = "".join([
        f"{prefix}\n",
        f"{job['name']}'s latest run for {run_date} ({relevant_job_run['id']}) {relevant_job_run['state']}\n",
        f"{precious_runs_note}",
        "\nHere is the last action:\n",
        f"{pretty_print_actions(action_run_details)}\n\n",
        "And the job run view:\n",
        f"{pretty_print_job_run(relevant_job_run)}\n\n",
        "Here is the whole job view for context:\n",
        f"{pretty_print_job(job_content)}",
    ])
    return {"output": report, "status": status}
def compute_check_result_for_job_runs(client, job, job_content):
    """Turn a job's run history into a check result.

    Args:
        client: Provides ``index()``, ``job(url, ...)`` and
            ``action_runs(url, num_lines=...)``.
        job: Job mapping; ``name`` is read.
        job_content: Job details including its runs, or ``None`` when there is
            nothing to inspect yet.

    Returns:
        dict: ``output`` (the human-readable report) and ``status``
        (0 for OK, 1 for WARN, 2 for CRIT, 3 for UNKNOWN).
    """
    url_index = client.index()

    if job_content is None:
        return {
            "output": "OK: {} was just added and hasn't run yet.".format(
                job['name'],
            ),
            "status": 0,
        }

    relevant_job_run, last_state = get_relevant_run_and_state(job_content)
    if relevant_job_run is None:
        return {
            "output": (
                f"CRIT: {job['name']} hasn't had a successful "
                f"run yet.\n{pretty_print_job(job_content)}"
            ),
            "status": 2,
        }

    # A job_run is like MASTER.foo.1
    job_run_id = get_object_type_from_identifier(
        url_index,
        relevant_job_run['id'],
    )
    action_runs = client.job(job_run_id.url, include_action_runs=True)

    # A job action is like MASTER.foo.1.step1
    expected_runtimes = job_content.get('actions_expected_runtime', {})
    relevant_action = get_relevant_action(
        action_runs=action_runs["runs"],
        last_state=last_state,
        actions_expected_runtime=expected_runtimes,
    )
    action_run_id = get_object_type_from_identifier(
        url_index,
        relevant_action['id'],
    )
    action_run_details = client.action_runs(action_run_id.url, num_lines=10)

    # (state, headline, status) triples, checked in order; first match wins.
    outcomes = (
        (State.SUCCEEDED, "OK: The last job run succeeded", 0),
        (State.WAITING_FOR_FIRST_RUN, "OK: The job is 'new' and waiting for the first run", 0),
        (State.STUCK, "WARN: Job exceeded expected runtime or still running when next job is scheduled", 1),
        (State.FAILED, "CRIT: The last job run failed!", 2),
        (State.NOT_SCHEDULED, "CRIT: Job is not scheduled at all!", 2),
        (State.UNKNOWN, "CRIT: Job has gone 'unknown' and might need manual intervention", 2),
    )
    prefix, status = next(
        (
            (headline, code)
            for state, headline, code in outcomes
            if last_state == state
        ),
        ("UNKNOWN: The job is in a state that check_tron_jobs doesn't understand", 3),
    )

    template = (
        "{}\n"
        "{}'s last relevant run (run {}) {}.\n\n"
        "Here is the last action:"
        "{}\n\n"
        "And the job run view:\n"
        "{}\n\n"
        "Here is the whole job view for context:\n"
        "{}"
    )
    report = template.format(
        prefix,
        job['name'],
        relevant_job_run['id'],
        relevant_job_run['state'],
        pretty_print_actions(action_run_details),
        pretty_print_job_run(relevant_job_run),
        pretty_print_job(job_content),
    )
    return {"output": report, "status": status}
def test_get_url_from_identifier_job_no_namespace_not_master(self):
    """A bare job name can resolve to a job outside the MASTER namespace."""
    resolved = get_object_type_from_identifier(self.index, 'nameg')

    assert_equal(resolved.url, '/api/jobs/OTHER.nameg')
    assert_equal(resolved.type, TronObjectType.job)
def test_get_url_from_identifier_service(self):
    """A namespaced service identifier resolves to the indexed service URL."""
    resolved = get_object_type_from_identifier(self.index, 'MASTER.foo')

    expected_url = self.index['services']['MASTER.foo'] + '/'
    assert_equal(resolved.url, expected_url)
    assert_equal(resolved.type, TronObjectType.service)
def test_get_url_from_identifier_job_run(self):
    """A three-part job identifier resolves to the job run URL and type."""
    resolved = get_object_type_from_identifier(self.index, 'MASTER.nameb.7')

    expected_url = self.index['jobs']['MASTER.nameb'] + '/7'
    assert_equal(resolved.url, expected_url)
    assert_equal(resolved.type, TronObjectType.job_run)
def compute_check_result_for_job_runs(client, job, job_content):
    """Turn a job's run history into a check result.

    Args:
        client: Provides ``index()``, ``job(url, ...)`` and
            ``action_runs(url, num_lines=...)``.
        job: Job mapping; ``name`` is read.
        job_content: Job details including its runs, or ``None`` when there is
            nothing to inspect yet.

    Returns:
        dict: ``output`` (the human-readable report) and ``status``
        (0 for OK, 1 for WARN, 2 for CRIT, 3 for UNKNOWN).
    """
    url_index = client.index()

    if job_content is None:
        return {
            "output": "OK: {} was just added and hasn't run yet.".format(
                job['name'],
            ),
            "status": 0,
        }

    relevant_job_run, last_state = get_relevant_run_and_state(job_content)
    if relevant_job_run is None:
        return {
            "output": "CRIT: {} hasn't had a successful run yet.\n{}".format(
                job['name'],
                pretty_print_job(job_content),
            ),
            "status": 2,
        }

    # A job_run is like MASTER.foo.1
    job_run_id = get_object_type_from_identifier(
        url_index,
        relevant_job_run['id'],
    )
    action_runs = client.job(job_run_id.url, include_action_runs=True)

    # A job action is like MASTER.foo.1.step1
    relevant_action = get_relevant_action(action_runs["runs"], last_state)
    action_run_id = get_object_type_from_identifier(
        url_index,
        relevant_action['id'],
    )
    action_run_details = client.action_runs(action_run_id.url, num_lines=10)

    # Each branch picks the severity label, an optional note and the status.
    if last_state == State.SUCCEEDED or last_state == State.WAITING_FOR_FIRST_RUN:
        prefix, annotation, status = "OK", "", 0
    elif last_state == State.STUCK:
        prefix, annotation, status = (
            "WARN",
            "Job still running when next job is scheduled to run (stuck?)",
            1,
        )
    elif last_state == State.FAILED:
        prefix, annotation, status = "CRIT", "", 2
    elif last_state == State.NOT_SCHEDULED:
        prefix, annotation, status = "CRIT", "Job is not scheduled at all", 2
    else:
        prefix, annotation, status = "UNKNOWN", "", 3

    template = (
        "{}: {}\n"
        "{}'s last relevant run (run {}) {}.\n\n"
        "Here is the last action:"
        "{}\n\n"
        "And the job run view:\n"
        "{}\n\n"
        "Here is the whole job view for context:\n"
        "{}"
    )
    report = template.format(
        prefix,
        annotation,
        job['name'],
        relevant_job_run['id'],
        relevant_job_run['state'],
        pretty_print_actions(action_run_details),
        pretty_print_job_run(relevant_job_run),
        pretty_print_job(job_content),
    )
    return {"output": report, "status": status}