Example #1
0
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)

    try:
        service_job_mapping = build_service_job_mapping(client, configured_jobs)

        for service_instance, chronos_job in service_job_mapping.items():
            service, instance = service_instance[0], service_instance[1]
            try:
                chronos_job_config = load_chronos_job_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
            except utils.NoDeploymentsAvailable:
                log.info("Skipping %s because no deployments are available" % service)
                continue
            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=chronos_job_config,
                chronos_job=chronos_job,
                client=client,
            )
            if sensu_status is not None:
                send_event(chronos_job_config, sensu_status, sensu_output)
    except (chronos.ChronosAPIError) as e:
        log.error("CRITICAL: Unable to contact Chronos! Error: %s" % e)
        sys.exit(2)
Example #2
0
def main(args):
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()

    # get those jobs listed in configs
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)

    service_job_mapping = build_service_job_mapping(client, configured_jobs)
    for service_instance, job_state_pairs in service_job_mapping.items():
        service, instance = service_instance[0], service_instance[1]
        sensu_output, sensu_status = sensu_message_status_for_jobs(service, instance, job_state_pairs)
        monitoring_overrides = compose_monitoring_overrides_for_service(
            cluster=system_paasta_config.get_cluster(),
            service=service,
            instance=instance,
            soa_dir=args.soa_dir
        )
        send_event_to_sensu(
            service=service,
            instance=instance,
            monitoring_overrides=monitoring_overrides,
            status_code=sensu_status,
            message=sensu_output,
            soa_dir=args.soa_dir,
        )
Example #3
0
def main(args):
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()

    # get those jobs listed in configs
    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(
        soa_dir=args.soa_dir)

    service_job_mapping = build_service_job_mapping(client, configured_jobs)
    for service_instance, job_state_pairs in service_job_mapping.items():
        service, instance = service_instance[0], service_instance[1]
        sensu_output, sensu_status = sensu_message_status_for_jobs(
            service, instance, job_state_pairs)
        monitoring_overrides = compose_monitoring_overrides_for_service(
            cluster=system_paasta_config.get_cluster(),
            service=service,
            instance=instance,
            soa_dir=args.soa_dir)
        send_event_to_sensu(
            service=service,
            instance=instance,
            monitoring_overrides=monitoring_overrides,
            status_code=sensu_status,
            message=sensu_output,
            soa_dir=args.soa_dir,
        )
Example #4
0
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)
    mesos_results = get_mesos_status(mesos_state, verbosity=args.verbose,
                                     humanize_output=args.humanize)

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [('marathon is not configured to run here', True)]

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except ChronosNotConfigured:
        chronos_results = [('chronos is not configured to run here', True)]

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    print_results_for_healthchecks(mesos_summary, mesos_ok, mesos_results, args.verbose)
    print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not all([mesos_ok, marathon_ok, chronos_ok]):
        sys.exit(2)
    else:
        sys.exit(0)
Example #5
0
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)
    mesos_results = get_mesos_status(mesos_state, verbosity=args.verbose,
                                     humanize_output=args.humanize)

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [('marathon is not configured to run here', True)]

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except ChronosNotConfigured:
        chronos_results = [('chronos is not configured to run here', True)]

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    print_results_for_healthchecks(mesos_summary, mesos_ok, mesos_results, args.verbose)
    print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not all([mesos_ok, marathon_ok, chronos_ok]):
        sys.exit(2)
    else:
        sys.exit(0)
Example #6
0
def main():

    args = parse_args()

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    # get_chronos_jobs_for_cluster returns (service, job)
    expected_service_jobs = chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)

    # filter jobs not related to paasta
    # and decompose into (service, instance, tag) tuples
    paasta_jobs = filter_paasta_jobs(deployed_job_names(client))
    running_service_jobs = [chronos_tools.decompose_job_id(job) for job in paasta_jobs]

    to_delete = jobs_to_delete(expected_service_jobs, running_service_jobs)

    # recompose the job ids again for deletion
    to_delete_job_ids = [chronos_tools.compose_job_id(*job) for job in to_delete]

    task_responses = cleanup_tasks(client, to_delete_job_ids)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)

    job_responses = cleanup_jobs(client, to_delete_job_ids)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)

    if len(to_delete) == 0:
        print 'No Chronos Jobs to remove'
    else:
        if len(task_successes) > 0:
            print format_list_output("Successfully Removed Tasks (if any were running) for:",
                                     [job[0] for job in task_successes])

        # if there are any failures, print and exit appropriately
        if len(task_failures) > 0:
            print format_list_output("Failed to Delete Tasks for:", [job[0] for job in task_failures])

        if len(job_successes) > 0:
            print format_list_output("Successfully Removed Jobs:", [job[0] for job in job_successes])

        # if there are any failures, print and exit appropriately
        if len(job_failures) > 0:
            print format_list_output("Failed to Delete Jobs:", [job[0] for job in job_failures])

        if len(job_failures) > 0 or len(task_failures) > 0:
            sys.exit(1)
Example #7
0
def chronos_instance_status(instance_status, service, instance, verbose):
    cstatus = {}
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=settings.cluster,
        soa_dir=settings.soa_dir,
    )
    cstatus['desired_state'] = job_config.get_desired_state()
    job_type = chronos_tools.get_job_type(job_config.config_dict)
    if job_type == chronos_tools.JobType.Scheduled:
        schedule_type = 'schedule'
        schedule = job_config.get_schedule()
        epsilon = job_config.get_epsilon()
        time_zone = job_config.get_schedule_time_zone()
        if time_zone == 'null' or time_zone is None:
            time_zone = 'UTC'
        cstatus['schedule'] = {}
        cstatus['schedule']['schedule'] = schedule
        cstatus['schedule']['epsilon'] = epsilon
        cstatus['schedule']['time_zone'] = time_zone
    elif job_type == chronos_tools.JobType.Dependent:
        schedule_type = 'parents'
        parents = job_config.get_parents()
        cstatus['parents'] = parents
    else:
        schedule_type = 'unknown'
    cstatus['schedule_type'] = schedule_type
    cstatus['status'] = {}
    if verbose:
        running_task_count = len(
            select_tasks_by_id(
                a_sync.block(get_cached_list_of_running_tasks_from_frameworks),
                job_config.get_job_name(),
            ),
        )
        cstatus['status']['mesos_state'] = 'running' if running_task_count else 'not_running'
    cstatus['status']['disabled_state'] = 'not_scheduled' if job_config.get_disabled() else 'scheduled'
    cstatus['status']['chronos_state'] = chronos_tools.get_chronos_status_for_job(client, service, instance)
    cstatus['command'] = job_config.get_cmd()
    last_time, last_status = chronos_tools.get_status_last_run(job_config.config_dict)
    if last_status == chronos_tools.LastRunState.Success:
        last_status = 'success'
    elif last_status == chronos_tools.LastRunState.Fail:
        last_status = 'fail'
    elif last_status == chronos_tools.LastRunState.NotRun:
        last_status = 'not_run'
    else:
        last_status = ''
    if last_status == 'not_run' or last_status == '':
        last_time = 'never'
    cstatus['last_status'] = {}
    cstatus['last_status']['result'] = last_status
    cstatus['last_status']['time'] = last_time

    return cstatus
Example #8
0
def main():

    args = parse_args()

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    # get_chronos_jobs_for_cluster returns (service, job)
    expected_service_jobs = chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)

    # filter jobs not related to paasta
    # and decompose into (service, instance, tag) tuples
    paasta_jobs = filter_paasta_jobs(deployed_job_names(client))
    running_service_jobs = [chronos_tools.decompose_job_id(job) for job in paasta_jobs]

    to_delete = jobs_to_delete(expected_service_jobs, running_service_jobs)

    # recompose the job ids again for deletion
    to_delete_job_ids = [chronos_tools.compose_job_id(*job) for job in to_delete]

    task_responses = cleanup_tasks(client, to_delete_job_ids)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)

    job_responses = cleanup_jobs(client, to_delete_job_ids)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)

    if len(to_delete) == 0:
        print 'No Chronos Jobs to remove'
    else:
        if len(task_successes) > 0:
            print format_list_output("Successfully Removed Tasks (if any were running) for:",
                                     [job[0] for job in task_successes])

        # if there are any failures, print and exit appropriately
        if len(task_failures) > 0:
            print format_list_output("Failed to Delete Tasks for:", [job[0] for job in task_failures])

        if len(job_successes) > 0:
            print format_list_output("Successfully Removed Jobs:", [job[0] for job in job_successes])

        # if there are any failures, print and exit appropriately
        if len(job_failures) > 0:
            print format_list_output("Failed to Delete Jobs:", [job[0] for job in job_failures])

        if len(job_failures) > 0 or len(task_failures) > 0:
            sys.exit(1)
Example #9
0
def chronos_instance_status(service: str, instance: str,
                            verbose: int) -> Dict[str, Any]:
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    return {
        "output":
        chronos_serviceinit.status_chronos_jobs(client, service, instance,
                                                settings.cluster,
                                                settings.soa_dir, verbose)
    }
Example #10
0
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(
        cluster, soa_dir=soa_dir)

    try:
        service_job_mapping = build_service_job_mapping(
            client, configured_jobs)

        for service_instance, job_state_pairs in service_job_mapping.items():
            service, instance = service_instance[0], service_instance[1]
            try:
                chronos_job_config = load_chronos_job_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
            except utils.NoDeploymentsAvailable:
                paasta_print(
                    utils.PaastaColors.cyan(
                        "Skipping %s because no deployments are available" %
                        service))
                continue
            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=chronos_job_config,
                service=service,
                instance=instance,
                cluster=cluster,
                job_state_pairs=job_state_pairs)
            if sensu_status is not None:
                monitoring_overrides = compose_monitoring_overrides_for_service(
                    chronos_job_config=chronos_job_config, soa_dir=soa_dir)
                send_event(
                    service=service,
                    instance=instance,
                    monitoring_overrides=monitoring_overrides,
                    status_code=sensu_status,
                    message=sensu_output,
                    soa_dir=soa_dir,
                )
    except (chronos.ChronosAPIError) as e:
        paasta_print(
            utils.PaastaColors.red(
                "CRITICAL: Unable to contact Chronos! Error: %s" % e))
        sys.exit(2)
Example #11
0
def main():
    args = parse_args()

    cluster = load_system_paasta_config().get_cluster()

    service, instance = chronos_tools.decompose_job_id(args.service_instance)

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = load_system_paasta_config()

    chronos_job_config = chronos_tools.load_chronos_job_config(
        service,
        instance,
        system_paasta_config.get_cluster(),
        soa_dir=args.soa_dir)

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=args.soa_dir,
        )

    except (NoDeploymentsAvailable, NoDockerImageError) as e:
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        print error_msg
        raise e
    except chronos_tools.UnknownChronosJobError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n"
            % (args.service_instance, cluster) + "Error was: %s" % str(e))
        print error_msg
        raise e
    except chronos_tools.InvalidParentError as e:
        raise e

    # complete_job_config is a formatted version
    # of the job, so the command is fornatted in the context
    # of 'now'
    # replace it with the 'original' cmd so it can be
    # re rendered
    original_command = chronos_job_config.get_cmd()
    complete_job_config['command'] = original_command
    clone = clone_job(
        complete_job_config,
        datetime.datetime.strptime(args.execution_date, "%Y-%m-%dT%H:%M:%S"))
    client.add(clone)
Example #12
0
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)

    try:
        service_job_mapping = build_service_job_mapping(client, configured_jobs)

        for service_instance, job_state_pairs in service_job_mapping.items():
            service, instance = service_instance[0], service_instance[1]
            try:
                chronos_job_config = load_chronos_job_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
            except utils.NoDeploymentsAvailable:
                paasta_print(utils.PaastaColors.cyan("Skipping %s because no deployments are available" % service))
                continue
            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=chronos_job_config,
                service=service,
                instance=instance,
                cluster=cluster,
                job_state_pairs=job_state_pairs
            )
            if sensu_status is not None:
                monitoring_overrides = compose_monitoring_overrides_for_service(
                    chronos_job_config=chronos_job_config,
                    soa_dir=soa_dir
                )
                send_event(
                    service=service,
                    instance=instance,
                    monitoring_overrides=monitoring_overrides,
                    status_code=sensu_status,
                    message=sensu_output,
                    soa_dir=soa_dir,
                )
    except (chronos.ChronosAPIError) as e:
        paasta_print(utils.PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
        sys.exit(2)
Example #13
0
def main():
    args = parse_args()

    cluster = load_system_paasta_config().get_cluster()

    service, instance = chronos_tools.decompose_job_id(args.service_instance)

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = load_system_paasta_config()

    chronos_job_config = chronos_tools.load_chronos_job_config(
        service, instance, system_paasta_config.get_cluster(), soa_dir=args.soa_dir)

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=args.soa_dir,
        )

    except (NoDeploymentsAvailable, NoDockerImageError) as e:
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        print error_msg
        raise e
    except chronos_tools.UnknownChronosJobError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e))
        print error_msg
        raise e
    except chronos_tools.InvalidParentError as e:
        raise e

    # complete_job_config is a formatted version
    # of the job, so the command is fornatted in the context
    # of 'now'
    # replace it with the 'original' cmd so it can be
    # re rendered
    original_command = chronos_job_config.get_cmd()
    complete_job_config['command'] = original_command
    clone = clone_job(complete_job_config, datetime.datetime.strptime(args.execution_date, "%Y-%m-%dT%H:%M:%S"))
    client.add(clone)
def check_chronos_jobs():
    config = load_chronos_config()
    if not config:
        paasta_print("UNKNOWN: Failed to load chronos config")
        sys.exit(3)
    client = get_chronos_client(config)

    try:
        result = assert_chronos_scheduled_jobs(client)
    except (ChronosAPIError) as e:
        paasta_print("CRITICAL: Unable to connect to Chronos: %s" % e.message)
        sys.exit(2)

    if result.healthy:
        paasta_print("OK: " + result.message)
        sys.exit(0)
    else:
        paasta_print("CRITICAL: " + result.message)
        sys.exit(2)
Example #15
0
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)

    try:
        service_job_mapping = build_service_job_mapping(client, configured_jobs)

        for service_instance, job_state_pairs in service_job_mapping.items():
            service, instance = service_instance[0], service_instance[1]
            chronos_job_config = load_chronos_job_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
            sensu_output, sensu_status = sensu_message_status_for_jobs(
                chronos_job_config=chronos_job_config,
                service=service,
                instance=instance,
                cluster=cluster,
                job_state_pairs=job_state_pairs
            )
            monitoring_overrides = compose_monitoring_overrides_for_service(
                chronos_job_config=chronos_job_config,
                soa_dir=soa_dir
            )
            send_event(
                service=service,
                instance=instance,
                monitoring_overrides=monitoring_overrides,
                status_code=sensu_status,
                message=sensu_output,
                soa_dir=soa_dir,
            )
    except (ServerNotFoundError, chronos.ChronosAPIError, socket_error) as e:
        print(utils.PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
        sys.exit(2)
Example #16
0
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(cluster, soa_dir=soa_dir)

    service_job_mapping = build_service_job_mapping(client, configured_jobs)
    for service_instance, job_state_pairs in service_job_mapping.items():
        service, instance = service_instance[0], service_instance[1]
        chronos_job_config = load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
        sensu_output, sensu_status = sensu_message_status_for_jobs(
            chronos_job_config=chronos_job_config,
            service=service,
            instance=instance,
            cluster=cluster,
            job_state_pairs=job_state_pairs
        )
        monitoring_overrides = compose_monitoring_overrides_for_service(
            chronos_job_config=chronos_job_config,
            soa_dir=soa_dir
        )
        send_event(
            service=service,
            instance=instance,
            monitoring_overrides=monitoring_overrides,
            status_code=sensu_status,
            message=sensu_output,
            soa_dir=soa_dir,
        )
Example #17
0
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    configured_jobs = chronos_tools.get_chronos_jobs_for_cluster(
        cluster, soa_dir=soa_dir)

    service_job_mapping = build_service_job_mapping(client, configured_jobs)
    for service_instance, job_state_pairs in service_job_mapping.items():
        service, instance = service_instance[0], service_instance[1]
        chronos_job_config = load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
        sensu_output, sensu_status = sensu_message_status_for_jobs(
            chronos_job_config=chronos_job_config,
            service=service,
            instance=instance,
            cluster=cluster,
            job_state_pairs=job_state_pairs)
        monitoring_overrides = compose_monitoring_overrides_for_service(
            chronos_job_config=chronos_job_config, soa_dir=soa_dir)
        send_event(
            service=service,
            instance=instance,
            monitoring_overrides=monitoring_overrides,
            status_code=sensu_status,
            message=sensu_output,
            soa_dir=soa_dir,
        )
Example #18
0
def main():
    args = parse_args()

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    service, instance = chronos_tools.decompose_job_id(args.service_instance)

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    related_jobs = chronos_tools.get_related_jobs_configs(cluster,
                                                          service,
                                                          instance,
                                                          soa_dir=args.soa_dir)
    if not related_jobs:
        error_msg = "No deployment found for {} in cluster {}. Has Jenkins run for it?".format(
            args.service_instance,
            cluster,
        )
        paasta_print(error_msg)
        raise NoDeploymentsAvailable

    if not args.run_all_related_jobs:
        # Strip all the configuration for the related services
        # those information will not be used by the rest of the flow
        related_jobs = {
            (service, instance): related_jobs[(service, instance)],
        }

    complete_job_configs = {}
    for (srv, inst) in related_jobs:
        try:
            complete_job_configs.update(
                {
                    (srv, inst):
                    chronos_tools.create_complete_config(
                        service=srv,
                        job_name=inst,
                        soa_dir=args.soa_dir,
                    ),
                }, )
        except (NoDeploymentsAvailable, NoDockerImageError) as e:
            error_msg = "No deployment found for {} in cluster {}. Has Jenkins run for it?".format(
                chronos_tools.compose_job_id(srv, inst),
                cluster,
            )
            paasta_print(error_msg)
            raise e
        except NoConfigurationForServiceError as e:
            error_msg = (
                "Could not read chronos configuration file for {} in cluster {}\nError was: {}"
                .format(
                    chronos_tools.compose_job_id(srv, inst),
                    cluster,
                    str(e),
                ))
            paasta_print(error_msg)
            raise e
        except chronos_tools.InvalidParentError as e:
            raise e

    if not args.run_all_related_jobs:
        sorted_jobs = [(service, instance)]
    else:
        sorted_jobs = chronos_tools.topological_sort_related_jobs(
            cluster, service, instance, soa_dir=args.soa_dir)

    timestamp = datetime.datetime.utcnow().isoformat()

    chronos_to_add = []
    for (service, instance) in sorted_jobs:
        # complete_job_config is a formatted version of the job,
        # so the command is formatted in the context of 'now'
        # replace it with the 'original' cmd so it can be re rendered
        chronos_job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=args.soa_dir,
        )
        original_command = chronos_job_config.get_cmd()
        complete_job_config = complete_job_configs[(service, instance)]
        complete_job_config['command'] = original_command
        clone = clone_job(
            chronos_job=complete_job_config,
            timestamp=timestamp,
            force_disabled=args.force_disabled,
        )
        # modify the command to run commands for a given date
        clone = modify_command_for_date(
            chronos_job=clone,
            date=datetime.datetime.strptime(args.execution_date,
                                            "%Y-%m-%dT%H:%M:%S"),
            verbose=args.verbose,
        )

        if not args.run_all_related_jobs and chronos_tools.get_job_type(
                clone) == chronos_tools.JobType.Dependent:
            # If the job is a dependent job and we want to re-run only the specific instance
            # remove the parents and update the schedule to start the job as soon as possible
            clone = set_default_schedule(remove_parents(clone))

        chronos_to_add.append(clone)

    for job_to_add in chronos_to_add:
        client.add(job_to_add)
Example #19
0
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = get_mesos_state_status(
        mesos_state=mesos_state,
    )
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [HealthCheckResult(message='Marathon is not configured to run here', healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except (ServerNotFoundError, socket_error) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [HealthCheckResult(message='Chronos is not configured to run here', healthy=True)]

    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    if args.verbose == 0:
        print mesos_summary
        print marathon_summary
        print chronos_summary
    elif args.verbose == 1:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave['hostname'], mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
        print "Master paasta_tools version: {0}".format(__version__)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
Example #20
0
def perform_command(command, service, instance, cluster, verbose, soa_dir):
    """Performs a start/stop/restart/status on an instance
    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :returns: A unix-style return code
    """
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    complete_job_config = chronos_tools.create_complete_config(service,
                                                               instance,
                                                               soa_dir=soa_dir)
    job_id = complete_job_config["name"]

    if command == "start":
        start_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "stop":
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
            include_temporary=True)
        stop_chronos_job(service,
                         instance,
                         client,
                         cluster,
                         matching_jobs,
                         emergency=True)
    elif command == "restart":
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        restart_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            matching_jobs=matching_jobs,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "status":
        # Verbose mode shows previous versions.
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        sorted_matching_jobs = chronos_tools.sort_jobs(matching_jobs)
        job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
        paasta_print(
            status_chronos_jobs(client, sorted_matching_jobs, job_config,
                                verbose))
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
Example #21
0
def main():

    args = parse_args()
    soa_dir = args.soa_dir

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    running_jobs = set(deployed_job_names(client))

    expected_service_jobs = {chronos_tools.compose_job_id(*job) for job in
                             chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)}

    all_tmp_jobs = set(filter_tmp_jobs(filter_paasta_jobs(running_jobs)))
    expired_tmp_jobs = set(filter_expired_tmp_jobs(client, all_tmp_jobs, cluster=cluster, soa_dir=soa_dir))
    valid_tmp_jobs = all_tmp_jobs - expired_tmp_jobs

    to_delete = running_jobs - expected_service_jobs - valid_tmp_jobs

    task_responses = cleanup_tasks(client, to_delete)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)

    job_responses = cleanup_jobs(client, to_delete)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)
            try:
                (service, instance) = chronos_tools.decompose_job_id(response[0])
                monitoring_tools.send_event(
                    check_name=check_chronos_job_name(service, instance),
                    service=service,
                    overrides={},
                    soa_dir=soa_dir,
                    status=pysensu_yelp.Status.OK,
                    output="This instance was removed and is no longer supposed to be scheduled.",
                )
            except InvalidJobNameError:
                # If we deleted some bogus job with a bogus jobid that could not be parsed,
                # Just move on, no need to send any kind of paasta event.
                pass

    if len(to_delete) == 0:
        paasta_print('No Chronos Jobs to remove')
    else:
        if len(task_successes) > 0:
            paasta_print(format_list_output(
                "Successfully Removed Tasks (if any were running) for:",
                [job[0] for job in task_successes],
            ))

        # if there are any failures, print and exit appropriately
        if len(task_failures) > 0:
            paasta_print(format_list_output("Failed to Delete Tasks for:", [job[0] for job in task_failures]))

        if len(job_successes) > 0:
            paasta_print(format_list_output("Successfully Removed Jobs:", [job[0] for job in job_successes]))

        # if there are any failures, print and exit appropriately
        if len(job_failures) > 0:
            paasta_print(format_list_output("Failed to Delete Jobs:", [job[0] for job in job_failures]))

        if len(job_failures) > 0 or len(task_failures) > 0:
            sys.exit(1)
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    try:
        mesos_state = get_mesos_state_from_leader()
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print (PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = get_mesos_state_status(mesos_state=mesos_state)
    metrics = get_mesos_stats()
    mesos_metrics_status = get_mesos_metrics_health(mesos_metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status

    # Check to see if Marathon should be running here by checking for config
    try:
        marathon_config = marathon_tools.load_marathon_config()
    except MarathonNotConfigured:
        marathon_results = [HealthCheckResult(message="Marathon is not configured to run here", healthy=True)]

    # Check to see if Chronos should be running here by checking for config
    try:
        chronos_config = load_chronos_config()
    except PaastaNotConfiguredError:
        chronos_results = [HealthCheckResult(message="Chronos is not configured to run here", healthy=True)]

    if marathon_config:
        marathon_client = get_marathon_client(marathon_config)
        try:
            marathon_results = get_marathon_status(marathon_client)
        except MarathonError as e:
            print (PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = get_chronos_status(chronos_client)
        except ServerNotFoundError as e:
            print (PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)

    mesos_ok = all(status_for_results(all_mesos_results))
    marathon_ok = all(status_for_results(marathon_results))
    chronos_ok = all(status_for_results(chronos_results))

    mesos_summary = generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    if args.verbose == 0:
        print mesos_summary
        print marathon_summary
        print chronos_summary
    elif args.verbose == 1:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    elif args.verbose == 2:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent("Resources Grouped by %s" % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict["total"], free=resource_info_dict["free"]
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
                )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
        print marathon_summary
        print_results_for_healthchecks(marathon_ok, marathon_results, args.verbose)
        print chronos_summary
        print_results_for_healthchecks(chronos_ok, chronos_results, args.verbose)
    else:
        print mesos_summary
        print_results_for_healthchecks(mesos_ok, all_mesos_results, args.verbose)
        for grouping in args.groupings:
            print_with_indent("Resources Grouped by %s" % grouping, 2)
            resource_info_dict = get_resource_utilization_by_grouping(key_func_for_attribute(grouping), mesos_state)
            all_rows = [[grouping.capitalize(), "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = resource_utillizations_from_resource_info(
                    total=resource_info_dict["total"], free=resource_info_dict["free"]
                )
                healthcheck_utilization_pairs = [
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
                )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        print_with_indent("Per Slave Utilization", 2)
        slave_resource_dict = get_resource_utilization_by_grouping(lambda slave: slave["hostname"], mesos_state)
        all_rows = [["Hostname", "CPU (free/total)", "RAM (free/total)", "Disk (free/total)"]]

        # print info about slaves here. Note that we don't make modifications to
        # the healthy_exit variable here, because we don't care about a single slave
        # having high usage.
        for attribute_value, resource_info_dict in slave_resource_dict.items():
            table_rows = []
            resource_utilizations = resource_utillizations_from_resource_info(
                total=resource_info_dict["total"], free=resource_info_dict["free"]
            )
            healthcheck_utilization_pairs = [
                healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization, args.threshold)
                for utilization in resource_utilizations
            ]
            table_rows.append(
                get_table_rows_for_resource_info_dict(attribute_value, healthcheck_utilization_pairs, args.humanize)
            )
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
Example #23
0
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import unicode_literals

from paasta_tools import chronos_tools

if __name__ == '__main__':
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    jobs = [job['name'] for job in client.list()]

    for job in jobs:
        client.delete(job)
Example #24
0
def main(argv=None):
    marathon_config = None
    chronos_config = None
    args = parse_args(argv)

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state, )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics, mesos_state=mesos_state)
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(
        metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(
                marathon_client)
        except (MarathonError, InternalServerError, ValueError) as e:
            # catch ValueError until marathon-python/pull/167 is merged and this is handled upstream
            paasta_print(
                PaastaColors.red(
                    ("CRITICAL: Unable to contact Marathon cluster at {}!"
                     "Is the cluster healthy?".format(
                         marathon_config["url"]))))
            sys.exit(2)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message='Marathon is not configured to run here', healthy=True)
        ]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message='Chronos is not configured to run here', healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check(
        "Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function, mesos_state)
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)',
                'Disk (used/total)'
            ]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items(
            ):
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy
                                   for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    metastatus_lib.get_table_rows_for_resource_info_dict(
                        attribute_value, healthcheck_utilization_pairs,
                        args.humanize))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = reduce(lambda x, y: x + [(y)],
                           get_autoscaling_info_for_all_resources(), [headers])

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'], mesos_state)
            all_rows = [[
                'Hostname', 'CPU (used/total)', 'RAM (used//total)',
                'Disk (used//total)'
            ]]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items(
            ):
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(
                    metastatus_lib.get_table_rows_for_resource_info_dict(
                        attribute_value, healthcheck_utilization_pairs,
                        args.humanize))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok,
                                                  chronos_results,
                                                  args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def main():
    configure_log()
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance, spacer=chronos_tools.INTERNAL_SPACER)
        print 'service, instance'
        print service, instance
    except InvalidJobNameError:
        log.error("Invalid service instance '%s' specified. Format is service%sinstance."
                  % (args.service_instance, SPACER))
        sys.exit(1)

    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = load_system_paasta_config().get_cluster()

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        send_event(
            service=service,
            instance=None,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.UnknownChronosJobError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e))
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.InvalidParentError:
        log.warn("Skipping %s.%s: Parent job could not be found" % (service, instance))
        sys.exit(0)

    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        complete_job_config=complete_job_config,
        client=client,
    )
    sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
Example #26
0
def main():
    configure_log()
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance, spacer=chronos_tools.INTERNAL_SPACER)
        print 'service, instance'
        print service, instance
    except InvalidJobNameError:
        log.error("Invalid service instance '%s' specified. Format is service%sinstance."
                  % (args.service_instance, SPACER))
        sys.exit(1)

    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = load_system_paasta_config().get_cluster()

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.UnknownChronosJobError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e))
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.InvalidParentError:
        log.warn("Skipping %s.%s: Parent job could not be found" % (service, instance))
        sys.exit(0)

    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        complete_job_config=complete_job_config,
        client=client,
    )
    sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
Example #27
0
def main():
    configure_log()
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error("Invalid service instance '%s' specified. Format is service%sinstance."
                  % (args.service_instance, SPACER))
        sys.exit(1)

    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = load_system_paasta_config().get_cluster()

    try:
        chronos_job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        error_msg = "No deployments found for %s in cluster %s" % (args.service_instance, cluster)
        send_event(
            service=service,
            instance=None,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        # exit 0 because the event was sent to the right team and this is not an issue with Paasta itself
        sys.exit(0)
    except chronos_tools.InvalidChronosConfigError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e))
        log.error(error_msg)
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        # exit 0 because the event was sent to the right team and this is not an issue with Paasta itself
        sys.exit(0)

    complete_job_config = chronos_tools.create_complete_config(
        service=service,
        job_name=instance,
        soa_dir=soa_dir,
    )
    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        chronos_job_config=chronos_job_config,
        complete_job_config=complete_job_config,
        client=client,
    )
    sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
Example #28
0
def main(argv: Optional[List[str]] = None) -> None:
    chronos_config = None
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(
        get_marathon_clients(marathon_servers))

    try:
        mesos_state = a_sync.block(master.state)
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL:  %s" % '\n'.join(e.args)))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message='Chronos is not configured to run here',
                healthy=True,
            )
        ]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check(
        "Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print(f"Master paasta_tools version: {__version__}")
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)
    if args.verbose > 1:
        print_with_indent(
            'Resources Grouped by %s' % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state,
        )
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [[
                str(x) for x in asi
            ] for asi in get_autoscaling_info_for_all_resources(mesos_state)]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok,
                                                  chronos_results,
                                                  args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
Example #29
0
def main(argv=None):
    chronos_config = None
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

    try:
        mesos_state = master.state
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
            marathon_clients=marathon_clients,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)',
                'GPU (used/total)', 'Agent count',
            ]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ) + [str(resource_info_dict['slave_count'])])
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = functools.reduce(
                lambda x, y: x + [(y)],
                get_autoscaling_info_for_all_resources(mesos_state),
                [headers],
            )

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'],
                mesos_state,
            )
            all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used//total)', 'Disk (used//total)', 'GPU (used/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
Example #30
0
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    try:
        service, instance, _, __ = decompose_job_id(
            args.service_instance, spacer=chronos_tools.INTERNAL_SPACER)
    except InvalidJobNameError:
        log.error(
            "Invalid service instance '%s' specified. Format is service%sinstance."
            % (args.service_instance, SPACER))
        sys.exit(1)

    client = chronos_tools.get_chronos_client(
        chronos_tools.load_chronos_config())
    cluster = load_system_paasta_config().get_cluster()

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for {} in cluster {}. Has Jenkins run for it?".format(
            args.service_instance,
            cluster,
        )
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except NoConfigurationForServiceError as e:
        error_msg = (
            f"Could not read chronos configuration file for {args.service_instance} in cluster {cluster}\n"
            + "Error was: %s" % str(e))
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except NoSlavesAvailableError as e:
        error_msg = (
            f"There are no PaaSTA slaves that can run {args.service_instance} in cluster {cluster}\n"
            +
            "Double check the cluster and the configured constraints/pool/whitelist.\n"
            "Error was: %s" % str(e))
        send_event(
            service=service,
            instance=instance,
            soa_dir=soa_dir,
            status=pysensu_yelp.Status.CRITICAL,
            output=error_msg,
        )
        log.error(error_msg)
        sys.exit(0)
    except chronos_tools.InvalidParentError:
        log.warn(
            f"Skipping {service}.{instance}: Parent job could not be found")
        sys.exit(0)

    modified_config = config_with_historical_stats(
        chronos_client=client,
        service=service,
        instance=instance,
        job_config=complete_job_config,
    )

    status, output = setup_job(
        service=service,
        instance=instance,
        cluster=cluster,
        complete_job_config=modified_config,
        client=client,
    )

    sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
    send_event(
        service=service,
        instance=instance,
        soa_dir=soa_dir,
        status=sensu_status,
        output=output,
    )
    # We exit 0 because the script finished ok and the event was sent to the right team.
    sys.exit(0)
Example #31
0
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()

    chronos_config = None
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs["use_mesos_cache"] = True

        master = get_mesos_master(**master_kwargs)

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(
            get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(mesos_master=master,
                                                  mesos_state=mesos_state)
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            paasta_print(PaastaColors.red("CRITICAL:  %s" % "\n".join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message="Marathon is not configured to run here", healthy=True)
        ]
        all_mesos_results = [
            metastatus_lib.HealthCheckResult(
                message="Mesos is not configured to run here", healthy=True)
        ]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [
            metastatus_lib.HealthCheckResult(
                message="Kubernetes is not configured to run here",
                healthy=True)
        ]

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Chronos! Error: %s" % e))
            raise FatalError(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message="Chronos is not configured to run here", healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    kube_summary = metastatus_lib.generate_summary_for_check(
        "Kubernetes", kube_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check(
        "Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print(f"Master paasta_tools version: {__version__}")
    paasta_print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)
    if args.verbose > 1 and mesos_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [[
                str(x) for x in asi
            ] for asi in get_autoscaling_info_for_all_resources(mesos_state)]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent("Per Slave Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(kube_summary, kube_ok,
                                                  kube_results, args.verbose)
    if args.verbose > 1 and kube_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_kube(
            groupings=args.groupings,
            threshold=args.threshold,
            kube_client=kube_client)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("No autoscaling resources for Kubernetes", 2)

        if args.verbose >= 3:
            print_with_indent("Per Node Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok,
                                                  chronos_results,
                                                  args.verbose)

    if not healthy_exit:
        raise FatalError(2)
Example #32
0
def main():

    args = parse_args()
    soa_dir = args.soa_dir

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    running_jobs = set(deployed_job_names(client))

    expected_service_jobs = set([chronos_tools.compose_job_id(*job) for job in
                                 chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)])

    all_tmp_jobs = set(filter_tmp_jobs(filter_paasta_jobs(running_jobs)))
    expired_tmp_jobs = set(filter_expired_tmp_jobs(client, all_tmp_jobs))
    valid_tmp_jobs = all_tmp_jobs - expired_tmp_jobs

    to_delete = running_jobs - expected_service_jobs - valid_tmp_jobs

    task_responses = cleanup_tasks(client, to_delete)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)

    job_responses = cleanup_jobs(client, to_delete)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)
            try:
                (service, instance) = chronos_tools.decompose_job_id(response[0])
                send_event(
                    service=service,
                    instance=instance,
                    monitoring_overrides={},
                    soa_dir=soa_dir,
                    status_code=pysensu_yelp.Status.OK,
                    message="This instance was removed and is no longer supposed to be scheduled.",
                )
            except InvalidJobNameError:
                # If we deleted some bogus job with a bogus jobid that could not be parsed,
                # Just move on, no need to send any kind of paasta event.
                pass

    if len(to_delete) == 0:
        print 'No Chronos Jobs to remove'
    else:
        if len(task_successes) > 0:
            print format_list_output("Successfully Removed Tasks (if any were running) for:",
                                     [job[0] for job in task_successes])

        # if there are any failures, print and exit appropriately
        if len(task_failures) > 0:
            print format_list_output("Failed to Delete Tasks for:", [job[0] for job in task_failures])

        if len(job_successes) > 0:
            print format_list_output("Successfully Removed Jobs:", [job[0] for job in job_successes])

        # if there are any failures, print and exit appropriately
        if len(job_failures) > 0:
            print format_list_output("Failed to Delete Jobs:", [job[0] for job in job_failures])

        if len(job_failures) > 0 or len(task_failures) > 0:
            sys.exit(1)
Example #33
0
def perform_command(command, service, instance, cluster, verbose, soa_dir):
    """Performs a start/stop/restart/status on an instance
    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :returns: A unix-style return code
    """
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    complete_job_config = chronos_tools.create_complete_config(service, instance, soa_dir=soa_dir)
    job_id = complete_job_config["name"]

    if command == "start":
        start_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "stop":
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
            include_temporary=True
        )
        stop_chronos_job(service, instance, client, cluster, matching_jobs, emergency=True)
    elif command == "restart":
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        restart_chronos_job(
            service=service,
            instance=instance,
            job_id=job_id,
            client=client,
            cluster=cluster,
            matching_jobs=matching_jobs,
            job_config=job_config,
            complete_job_config=complete_job_config,
            emergency=True,
        )
    elif command == "status":
        # Verbose mode shows previous versions.
        matching_jobs = chronos_tools.lookup_chronos_jobs(
            service=service,
            instance=instance,
            client=client,
            include_disabled=True,
        )
        sorted_matching_jobs = chronos_tools.sort_jobs(matching_jobs)
        job_config = chronos_tools.load_chronos_job_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
        paasta_print(status_chronos_jobs(client, sorted_matching_jobs, job_config, verbose))
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
Example #34
0
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(mesos_metrics=metrics,
                                                                                mesos_state=mesos_state)
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [metastatus_lib.HealthCheckResult(message='Marathon is not configured to run here',
                                                             healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(message='Chronos is not configured to run here',
                                                            healthy=True)]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    print "Master paasta_tools version: {0}".format(__version__)
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(grouping_function,
                                                                                     mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization,
                                                                                                         args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(lambda slave: slave['hostname'],
                                                                                      mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization,
                                                                                                         args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
Example #35
0
 def chronos(self) -> ChronosClient:
     if self._chronos is None:
         chronos_config = chronos_tools.load_chronos_config()
         self._chronos = chronos_tools.get_chronos_client(
             chronos_config, cached=self._cached)
     return self._chronos