Beispiel #1
0
 def test_get_url_from_identifier_job(self):
     """A namespaced job name resolves to its job URL and the job type."""
     resolved = get_object_type_from_identifier(self.index, 'MASTER.namea')
     assert_equal(resolved.url, '/api/jobs/MASTER.namea')
     assert_equal(resolved.type, TronObjectType.job)
Beispiel #2
0
 def test_get_url_from_identifier_action_run(self):
     """A job.run-number.action name resolves to the action run URL and type."""
     resolved = get_object_type_from_identifier(self.index, 'MASTER.nameb.7.run')
     assert_equal(resolved.url, '/api/jobs/MASTER.nameb/7/run')
     assert_equal(resolved.type, TronObjectType.action_run)
Beispiel #3
0
def compute_check_result_for_job(client, job):
    """Build the monitoring check payload for a single job.

    The payload starts with a ``name`` derived from the job name and a
    ``source`` of ``"tron"``, then is overlaid with the job's own
    ``monitoring`` settings (which may therefore replace those two keys).
    ``realert_every`` is guessed only when the monitoring settings do not
    provide it, and ``check_every`` is always set from ``_run_interval``.

    Args:
        client: object exposing ``index()`` and ``job(url, ...)``.
        job: mapping with at least ``name``, ``monitoring`` and ``status``.

    Returns:
        dict: the check payload, including ``output`` and ``status``.
    """
    check = {
        "name": "check_tron_job.{}".format(job['name']),
        "source": "tron",
    }
    check.update(job['monitoring'])
    if 'realert_every' not in check:
        check['realert_every'] = guess_realert_every(job)
    # Assigned after the monitoring overlay, so this value always wins.
    check['check_every'] = "{}s".format(_run_interval)

    if job["status"] != "disabled":
        # An enabled job is judged by its run history.
        tron_id = get_object_type_from_identifier(client.index(), job["name"])
        job_content = client.job(
            tron_id.url,
            count=20,
            include_action_runs=True,
        )
        check.update(
            compute_check_result_for_job_runs(
                job=job,
                job_content=job_content,
                client=client,
            )
        )
        # Only the first line of the (multi-line) output is logged.
        first_line = check["output"].split("\n")[0]
        log.info(first_line)
        return check

    # Disabled jobs are reported as OK without inspecting any runs.
    check["output"] = "OK: {} is disabled and won't be checked.".format(
        job['name'],
    )
    check["status"] = 0
    log.info(check["output"])
    return check
Beispiel #4
0
 def test_get_url_from_identifier_job(self):
     """A namespaced job name resolves to its job URL and the job type."""
     resolved = get_object_type_from_identifier(self.index, 'MASTER.namea')
     assert_equal(resolved.url, '/api/jobs/MASTER.namea')
     assert_equal(resolved.type, TronObjectType.job)
Beispiel #5
0
 def test_get_url_from_identifier_action_run(self):
     """A job.run-number.action name resolves to the action run URL and type."""
     resolved = get_object_type_from_identifier(self.index, 'MASTER.nameb.7.run')
     assert_equal(resolved.url, '/api/jobs/MASTER.nameb/7/run')
     assert_equal(resolved.type, TronObjectType.action_run)
Beispiel #6
0
def compute_check_result_for_job(client, job):
    """Build the monitoring check payloads for a single job.

    A base payload (``name``, ``source``, ``check_every``, ``realert_every``)
    is overlaid with the job's ``monitoring`` settings, minus the
    ``PRECIOUS_JOB_ATTR`` and ``check_every`` keys.  A disabled job yields one
    OK payload.  Otherwise the job's runs are evaluated: once as a whole, or,
    when the monitoring settings enable ``PRECIOUS_JOB_ATTR``, once per day
    bucket, with the bucket's date appended to the check name.

    Args:
        client: object exposing ``index()`` and ``job(url, ...)``.
        job: mapping with at least ``name``, ``monitoring`` and ``status``.

    Returns:
        list[dict]: one plain-dict payload per emitted check.
    """
    monitoring = job['monitoring']

    # We want to prevent a monitoring config from setting the check_every
    # attribute, since one config should not dictate how often this script runs
    sensu_kwargs = (
        pmap(monitoring).discard(PRECIOUS_JOB_ATTR).discard('check_every')
    )

    kwargs = m(
        name=f"check_tron_job.{job['name']}",
        source="tron",
        check_every=f"{_run_interval}s",
    )
    # Guess a re-alert interval only when the monitoring config does not
    # supply one.  The check has to look at the config itself: the base
    # payload never contains the key, so testing it there is always true and
    # the guess would be computed just to be overwritten by the overlay below.
    if 'realert_every' not in sensu_kwargs:
        kwargs = kwargs.set('realert_every', guess_realert_every(job))
    kwargs = kwargs.update(sensu_kwargs)

    if job["status"] == "disabled":
        # Disabled jobs are reported as OK without inspecting any runs.
        disabled_kwargs = kwargs.set(
            'output',
            f"OK: {job['name']} is disabled and won't be checked.",
        ).set('status', 0)
        return [dict(disabled_kwargs)]

    # The job is not disabled, therefore we have to look at its run history
    url_index = client.index()
    tron_id = get_object_type_from_identifier(url_index, job["name"])
    job_content = pmap(
        client.job(
            tron_id.url,
            include_action_runs=True,
        )
    )

    if monitoring.get(PRECIOUS_JOB_ATTR, False):
        dated_runs = sort_runs_by_interval(job_content, interval='day')
    else:
        # A single bucket with an empty date keeps the plain check name.
        dated_runs = {'': job_content['runs']}

    kwargs_list = []
    for date, runs in dated_runs.items():
        results = compute_check_result_for_job_runs(
            job=job,
            job_content=job_content.set('runs', runs),
            client=client,
        )
        dated_kwargs = kwargs.update(results)
        if date:  # if empty date, leave job name alone
            dated_kwargs = dated_kwargs.set(
                'name', f"{kwargs['name']}-{date}"
            )
        kwargs_list.append(dated_kwargs)

    # Callers receive ordinary dicts rather than persistent maps.
    return [dict(kws) for kws in kwargs_list]
Beispiel #7
0
 def test_get_url_from_identifier_service_instance(self):
     """A service.instance-number name resolves to the service instance URL and type."""
     resolved = get_object_type_from_identifier(
         self.index,
         'MASTER.foo.1',
     )
     assert_equal(resolved.url, '/api/services/MASTER.foo/1')
     assert_equal(resolved.type, TronObjectType.service_instance)
Beispiel #8
0
 def test_get_url_from_identifier_service_no_namespace(self):
     """A bare service name resolves to the MASTER-namespaced service URL."""
     resolved = get_object_type_from_identifier(
         self.index,
         'foo',
     )
     assert_equal(resolved.url, '/api/services/MASTER.foo')
     assert_equal(resolved.type, TronObjectType.service)
def compute_check_result_for_job_runs(client,
                                      job,
                                      job_content,
                                      url_index,
                                      hide_stderr=False):
    """Derive the ``output`` text and ``status`` code from a job's runs.

    Args:
        client: object exposing ``cluster_name`` and ``action_runs(url, ...)``.
        job: mapping with ``name`` and ``monitoring``.
        job_content: the job view, or ``None`` when there is nothing to
            inspect yet.
        url_index: index handed to ``get_object_type_from_identifier``.
        hide_stderr: when true, the stderr section of the output is blanked.

    Returns:
        dict: ``output`` (multi-line report) and ``status`` (0 for the OK
        messages, 1 for WARN, 2 for CRIT, 3 for an unrecognised state).
    """
    cluster = client.cluster_name

    if job_content is None:
        return {
            "output": f"OK: {job['name']} was just added and hasn't run yet on {cluster}.",
            "status": 0,
        }

    relevant_job_run, last_state = get_relevant_run_and_state(job_content)
    if relevant_job_run is None:
        return {
            "output": (
                f"CRIT: {job['name']} hasn't had a successful "
                f"run yet on {cluster}.\n{pretty_print_job(job_content)}"
            ),
            "status": 2,
        }

    # if no run scheduled, no run_time available; past this point there is one
    relevant_job_run_date = _timestamp_to_shortdate(
        relevant_job_run['run_time'],
    )

    # A job_run is like MASTER.foo.1
    job_run_id = relevant_job_run['id']

    # A job action is like MASTER.foo.1.step1
    expected_runtimes = job_content.get('actions_expected_runtime', {})
    relevant_action = get_relevant_action(
        action_runs=relevant_job_run["runs"],
        last_state=last_state,
        actions_expected_runtime=expected_runtimes,
    )
    action_run_id = get_object_type_from_identifier(
        url_index,
        relevant_action['id'],
    )

    # Action details are only fetched for the problem states.
    action_run_details = {}
    if last_state in (State.STUCK, State.FAILED, State.UNKNOWN):
        action_run_details = client.action_runs(
            action_run_id.url,
            num_lines=10,
        )

    # stderr stays empty unless a branch below fills it in.
    stderr = ""
    if last_state == State.SUCCEEDED:
        status = 0
        prefix = f"OK: The last job ({job_run_id}) run succeeded on {cluster}. Will watch future or in progress runs for the next failure"
    elif last_state == State.NO_RUNS_TO_CHECK:
        status = 0
        prefix = f"OK: The job {job['name']} is new and/or has no runs to check on {cluster}"
    elif last_state == State.SKIPPED:
        status = 0
        prefix = f"OK: The last job ({job_run_id}) run was skipped on {cluster}. Will watch future or in progress runs for the next failure"
    elif last_state == State.STUCK:
        # The monitoring config decides whether a stuck job pages (CRIT)
        # or only warns.
        pages = job['monitoring'].get("page_for_expected_runtime", False)
        level, status = ("CRIT", 2) if pages else ("WARN", 1)
        prefix = f"{level}: Job {job_run_id} exceeded expected runtime or still running when next job is scheduled on {cluster}"
        stderr = '\n'.join(
            action_run_details.get('stderr', ["(No stderr available)"]),
        )
    elif last_state == State.FAILED:
        status = 2
        prefix = f"CRIT: The last job run ({job_run_id}) failed on {cluster}!"
        stderr = '\n'.join(
            action_run_details.get('stderr', ["(No stderr available)"]),
        )
    elif last_state == State.UNKNOWN:
        status = 2
        prefix = f"CRIT: Job {job_run_id} has gone 'unknown' and might need manual intervention on {cluster}"
    else:
        status = 3
        prefix = f"UNKNOWN: Job {job_run_id} is in a state that check_tron_jobs doesn't understand"

    if hide_stderr:
        stderr = ""

    is_precious = job['monitoring'].get(PRECIOUS_JOB_ATTR, False)
    precious_runs_note = (
        f"Note: This alert is the run for {relevant_job_run_date}. A resolve event will not occur until a job run for this date succeeds.\n"
        if is_precious and status != 0 else ''
    )

    sections = [
        f"{prefix}\n",
        f"{stderr}\n",
        f"The latest run, {relevant_job_run['id']} {relevant_job_run['state']}\n",
        precious_runs_note,
    ]
    if action_run_details:
        sections.append("\nHere is the last action:\n")
        sections.append(f"{pretty_print_actions(action_run_details)}\n\n")
    sections.append("And the job run view:\n")
    sections.append(f"{pretty_print_job_run(relevant_job_run)}\n\n")
    sections.append("Here is the whole job view for context:\n")
    sections.append(f"{pretty_print_job(job_content)}")

    return {"output": "".join(sections), "status": status}
Beispiel #10
0
 def test_get_url_from_identifier_service_instance(self):
     """A service.instance-number name resolves to the service instance URL and type."""
     resolved = get_object_type_from_identifier(
         self.index,
         'MASTER.foo.1',
     )
     assert_equal(resolved.url, '/api/services/MASTER.foo/1')
     assert_equal(resolved.type, TronObjectType.service_instance)
Beispiel #11
0
 def test_get_url_from_identifier_job_no_namespace(self):
     """A bare job name resolves to the MASTER-namespaced job entry."""
     resolved = get_object_type_from_identifier(
         self.index,
         'namea',
     )
     # The expected URL is the index entry for the job plus a trailing slash.
     expected_url = self.index['jobs']['MASTER.namea'] + '/'
     assert_equal(resolved.url, expected_url)
     assert_equal(resolved.type, TronObjectType.job)
     assert_equal(resolved.name, 'MASTER.namea')
Beispiel #12
0
def compute_check_result_for_job_runs(client, job, job_content):
    """Derive the ``output`` text and ``status`` code from a job's runs.

    Args:
        client: object exposing ``index()``, ``job(url, ...)`` and
            ``action_runs(url, ...)``.
        job: mapping with ``name`` and ``monitoring``.
        job_content: the job view, or ``None`` when there is nothing to
            inspect yet.

    Returns:
        dict: ``output`` (multi-line report) and ``status`` (0 for the OK
        messages, 1 for WARN, 2 for CRIT, 3 for an unrecognised state).
    """
    url_index = client.index()

    if job_content is None:
        return {
            "output": f"OK: {job['name']} was just added and hasn't run yet.",
            "status": 0,
        }

    relevant_job_run, last_state = get_relevant_run_and_state(job_content)
    if relevant_job_run is None:
        return {
            "output": (
                f"CRIT: {job['name']} hasn't had a successful "
                f"run yet.\n{pretty_print_job(job_content)}"
            ),
            "status": 2,
        }

    # if no run scheduled, no run_time available; past this point there is one
    run_date = _timestamp_to_shortdate(relevant_job_run['run_time'])

    # A job_run is like MASTER.foo.1
    run_identifier = get_object_type_from_identifier(
        url_index,
        relevant_job_run['id'],
    )
    run_with_actions = client.job(
        run_identifier.url,
        include_action_runs=True,
    )

    # A job action is like MASTER.foo.1.step1
    expected_runtimes = job_content.get('actions_expected_runtime', {})
    relevant_action = get_relevant_action(
        action_runs=run_with_actions["runs"],
        last_state=last_state,
        actions_expected_runtime=expected_runtimes,
    )
    action_identifier = get_object_type_from_identifier(
        url_index,
        relevant_action['id'],
    )
    action_run_details = client.action_runs(
        action_identifier.url,
        num_lines=10,
    )

    # Ordered (state, message, status) rows; the first matching state wins.
    verdicts = (
        (State.SUCCEEDED, "OK: The last job run succeeded", 0),
        (State.NO_RUNS_TO_CHECK, "OK: The job is 'new' and/or has no runs to check", 0),
        (State.SKIPPED, "OK: The last job run was skipped", 0),
        (State.STUCK, "WARN: Job exceeded expected runtime or still running when next job is scheduled", 1),
        (State.FAILED, "CRIT: The last job run failed!", 2),
        (State.UNKNOWN, "CRIT: Job has gone 'unknown' and might need manual intervention", 2),
    )
    prefix, status = next(
        (
            (message, code)
            for state, message, code in verdicts
            if last_state == state
        ),
        ("UNKNOWN: The job is in a state that check_tron_jobs doesn't understand", 3),
    )

    is_precious = job['monitoring'].get(PRECIOUS_JOB_ATTR, False)
    precious_runs_note = (
        f"Note: This alert is the run for {run_date}. A resolve event will not occur until a job run for this date succeeds.\n"
        if is_precious and status != 0 else ''
    )

    sections = [
        f"{prefix}\n",
        f"{job['name']}'s latest run for {run_date} ({relevant_job_run['id']}) {relevant_job_run['state']}\n",
        precious_runs_note,
        "\nHere is the last action:\n",
        f"{pretty_print_actions(action_run_details)}\n\n",
        "And the job run view:\n",
        f"{pretty_print_job_run(relevant_job_run)}\n\n",
        "Here is the whole job view for context:\n",
        f"{pretty_print_job(job_content)}",
    ]
    return {"output": "".join(sections), "status": status}
Beispiel #13
0
def compute_check_result_for_job_runs(client, job, job_content):
    """Derive the ``output`` text and ``status`` code from a job's runs.

    Args:
        client: object exposing ``index()``, ``job(url, ...)`` and
            ``action_runs(url, ...)``.
        job: mapping with ``name``.
        job_content: the job view, or ``None`` when there is nothing to
            inspect yet.

    Returns:
        dict: ``output`` (multi-line report) and ``status`` (0 for the OK
        messages, 1 for WARN, 2 for CRIT, 3 for an unrecognised state).
    """
    url_index = client.index()

    if job_content is None:
        return {
            "output": f"OK: {job['name']} was just added and hasn't run yet.",
            "status": 0,
        }

    relevant_job_run, last_state = get_relevant_run_and_state(job_content)
    if relevant_job_run is None:
        return {
            "output": (
                f"CRIT: {job['name']} hasn't had a successful "
                f"run yet.\n{pretty_print_job(job_content)}"
            ),
            "status": 2,
        }

    # A job_run is like MASTER.foo.1
    run_identifier = get_object_type_from_identifier(
        url_index,
        relevant_job_run['id'],
    )
    run_with_actions = client.job(
        run_identifier.url,
        include_action_runs=True,
    )

    # A job action is like MASTER.foo.1.step1
    expected_runtimes = job_content.get('actions_expected_runtime', {})
    relevant_action = get_relevant_action(
        action_runs=run_with_actions["runs"],
        last_state=last_state,
        actions_expected_runtime=expected_runtimes,
    )
    action_identifier = get_object_type_from_identifier(
        url_index,
        relevant_action['id'],
    )
    action_run_details = client.action_runs(
        action_identifier.url,
        num_lines=10,
    )

    # Ordered (state, message, status) rows; the first matching state wins.
    verdicts = (
        (State.SUCCEEDED, "OK: The last job run succeeded", 0),
        (State.WAITING_FOR_FIRST_RUN, "OK: The job is 'new' and waiting for the first run", 0),
        (State.STUCK, "WARN: Job exceeded expected runtime or still running when next job is scheduled", 1),
        (State.FAILED, "CRIT: The last job run failed!", 2),
        (State.NOT_SCHEDULED, "CRIT: Job is not scheduled at all!", 2),
        (State.UNKNOWN, "CRIT: Job has gone 'unknown' and might need manual intervention", 2),
    )
    prefix, status = next(
        (
            (message, code)
            for state, message, code in verdicts
            if last_state == state
        ),
        ("UNKNOWN: The job is in a state that check_tron_jobs doesn't understand", 3),
    )

    sections = [
        f"{prefix}\n",
        f"{job['name']}'s last relevant run (run {relevant_job_run['id']}) {relevant_job_run['state']}.\n\n",
        # No newline follows this heading; the action view is appended directly.
        "Here is the last action:",
        f"{pretty_print_actions(action_run_details)}\n\n",
        "And the job run view:\n",
        f"{pretty_print_job_run(relevant_job_run)}\n\n",
        "Here is the whole job view for context:\n",
        f"{pretty_print_job(job_content)}",
    ]
    return {"output": "".join(sections), "status": status}
Beispiel #14
0
 def test_get_url_from_identifier_job_no_namespace_not_master(self):
     """A bare job name can resolve to a job under a non-MASTER namespace."""
     resolved = get_object_type_from_identifier(
         self.index,
         'nameg',
     )
     assert_equal(resolved.url, '/api/jobs/OTHER.nameg')
     assert_equal(resolved.type, TronObjectType.job)
Beispiel #15
0
 def test_get_url_from_identifier_service(self):
     """A namespaced service name resolves to its index entry and the service type."""
     resolved = get_object_type_from_identifier(
         self.index,
         'MASTER.foo',
     )
     # The expected URL is the index entry for the service plus a trailing slash.
     expected_url = self.index['services']['MASTER.foo'] + '/'
     assert_equal(resolved.url, expected_url)
     assert_equal(resolved.type, TronObjectType.service)
Beispiel #16
0
 def test_get_url_from_identifier_service_no_namespace(self):
     """A bare service name resolves to the MASTER-namespaced service URL."""
     resolved = get_object_type_from_identifier(
         self.index,
         'foo',
     )
     assert_equal(resolved.url, '/api/services/MASTER.foo')
     assert_equal(resolved.type, TronObjectType.service)
Beispiel #17
0
 def test_get_url_from_identifier_job_run(self):
     """A job.run-number name resolves to the job run URL and type."""
     resolved = get_object_type_from_identifier(
         self.index,
         'MASTER.nameb.7',
     )
     # The expected URL is the job's index entry followed by the run number.
     expected_url = self.index['jobs']['MASTER.nameb'] + '/7'
     assert_equal(resolved.url, expected_url)
     assert_equal(resolved.type, TronObjectType.job_run)
Beispiel #18
0
 def test_get_url_from_identifier_job_no_namespace_not_master(self):
     """A bare job name can resolve to a job under a non-MASTER namespace."""
     resolved = get_object_type_from_identifier(
         self.index,
         'nameg',
     )
     assert_equal(resolved.url, '/api/jobs/OTHER.nameg')
     assert_equal(resolved.type, TronObjectType.job)
Beispiel #19
0
def compute_check_result_for_job_runs(client, job, job_content):
    """Derive the ``output`` text and ``status`` code from a job's runs.

    Args:
        client: object exposing ``index()``, ``job(url, ...)`` and
            ``action_runs(url, ...)``.
        job: mapping with ``name``.
        job_content: the job view, or ``None`` when there is nothing to
            inspect yet.

    Returns:
        dict: ``output`` (multi-line report) and ``status`` (0 for OK,
        1 for WARN, 2 for CRIT, 3 for UNKNOWN).
    """
    url_index = client.index()

    if job_content is None:
        return {
            "output": "OK: {} was just added and hasn't run yet.".format(
                job['name'],
            ),
            "status": 0,
        }

    relevant_job_run, last_state = get_relevant_run_and_state(job_content)
    if relevant_job_run is None:
        return {
            "output": "CRIT: {} hasn't had a successful run yet.\n{}".format(
                job['name'],
                pretty_print_job(job_content),
            ),
            "status": 2,
        }

    # A job_run is like MASTER.foo.1
    run_identifier = get_object_type_from_identifier(
        url_index,
        relevant_job_run['id'],
    )
    run_with_actions = client.job(
        run_identifier.url,
        include_action_runs=True,
    )

    # A job action is like MASTER.foo.1.step1
    relevant_action = get_relevant_action(run_with_actions["runs"], last_state)
    action_identifier = get_object_type_from_identifier(
        url_index,
        relevant_action['id'],
    )
    action_run_details = client.action_runs(
        action_identifier.url,
        num_lines=10,
    )

    # Ordered (state, prefix, annotation, status) rows; first match wins.
    verdicts = (
        (State.SUCCEEDED, "OK", "", 0),
        (State.WAITING_FOR_FIRST_RUN, "OK", "", 0),
        (State.STUCK, "WARN", "Job still running when next job is scheduled to run (stuck?)", 1),
        (State.FAILED, "CRIT", "", 2),
        (State.NOT_SCHEDULED, "CRIT", "Job is not scheduled at all", 2),
    )
    prefix, annotation, status = next(
        (
            (label, note, code)
            for state, label, note, code in verdicts
            if last_state == state
        ),
        ("UNKNOWN", "", 3),
    )

    # No newline follows the "last action" heading; the action view is
    # appended directly after it.
    report = (
        "{prefix}: {annotation}\n"
        "{job_name}'s last relevant run (run {run_id}) {run_state}.\n\n"
        "Here is the last action:"
        "{actions_view}\n\n"
        "And the job run view:\n"
        "{run_view}\n\n"
        "Here is the whole job view for context:\n"
        "{job_view}"
    ).format(
        prefix=prefix,
        annotation=annotation,
        job_name=job['name'],
        run_id=relevant_job_run['id'],
        run_state=relevant_job_run['state'],
        actions_view=pretty_print_actions(action_run_details),
        run_view=pretty_print_job_run(relevant_job_run),
        job_view=pretty_print_job(job_content),
    )
    return {"output": report, "status": status}