Beispiel #1
0
def setup_paasta_api():
    """Populate the global ``settings`` object used by the PaaSTA API.

    Configures logging (DEBUG when the ``PAASTA_API_DEBUG`` environment
    variable is set, WARNING otherwise), disables the YAML file cache, loads
    the system PaaSTA config, and stores the cluster name, Marathon servers
    and Marathon clients on ``settings``. Finally installs an in-memory HTTP
    cache for outgoing API calls.
    """
    if os.environ.get("PAASTA_API_DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    # pyinotify is a better solution than turning off file caching completely
    service_configuration_lib.disable_yaml_cache()

    settings.system_paasta_config = load_system_paasta_config()
    settings.cluster = settings.system_paasta_config.get_cluster()

    # Resolve the servers once and build the clients from them. The clients
    # used to be built twice, with the first result immediately overwritten.
    settings.marathon_servers = marathon_tools.get_marathon_servers(
        system_paasta_config=settings.system_paasta_config,
    )
    settings.marathon_clients = marathon_tools.get_marathon_clients(
        marathon_servers=settings.marathon_servers,
        cached=False,
    )

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=5)
Beispiel #2
0
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Run one autoscaling pass over the services configured for this cluster.

    Only logs a warning and returns when autoscaling is paused or when the
    autoscaling lock is already held.

    :param soa_dir: The SOA config directory to read service configs from
    """
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    try:
        with create_autoscaling_lock():
            paasta_config = load_system_paasta_config()
            services_to_scale = get_configs_of_services_to_scale(
                cluster=paasta_config.get_cluster(),
                soa_dir=soa_dir,
            )

            servers = get_marathon_servers(paasta_config)
            clients = get_marathon_clients(servers)
            apps_and_clients = get_marathon_apps_with_clients(
                clients.get_all_clients(),
                embed_tasks=True,
            )
            running_mesos_tasks = get_all_running_tasks()
            if not services_to_scale:
                return

            with ZookeeperPool():
                for service_config in services_to_scale:
                    try:
                        marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                            [app for app, _ in apps_and_clients],
                            running_mesos_tasks,
                            service_config,
                        )
                        autoscale_marathon_instance(
                            service_config,
                            list(marathon_tasks.values()),
                            mesos_tasks,
                        )
                    except Exception as err:
                        # A failure for one service is logged and the loop
                        # moves on to the next service.
                        write_to_log(
                            config=service_config,
                            line='Caught Exception %s' % err,
                        )
    except LockHeldException:
        log.warning(
            "Skipping autoscaling run for services because the lock is held")
def autoscale_service_configs(
    service_configs: Sequence[MarathonServiceConfig],
    system_paasta_config: SystemPaastaConfig,
) -> None:
    """Autoscale each of the given Marathon service configs.

    Returns early, after logging a warning, when autoscaling is paused.
    Exceptions raised for an individual service are written to that service's
    log at debug level and do not stop the remaining services.

    :param service_configs: The service configs to autoscale
    :param system_paasta_config: The system config used to locate Marathon
        and passed through to the autoscaling helpers
    """
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    servers = get_marathon_servers(system_paasta_config)
    clients = get_marathon_clients(servers)
    apps_and_clients = get_marathon_apps_with_clients(
        clients.get_all_clients(),
        embed_tasks=True,
    )
    running_mesos_tasks = a_sync.block(get_all_running_tasks)

    with ZookeeperPool():
        for service_config in service_configs:
            try:
                marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                    [app for app, _ in apps_and_clients],
                    running_mesos_tasks,
                    service_config,
                    system_paasta_config,
                )
                autoscale_marathon_instance(
                    service_config,
                    system_paasta_config,
                    list(marathon_tasks.values()),
                    mesos_tasks,
                )
            except Exception as err:
                write_to_log(
                    config=service_config,
                    line="Caught Exception %s" % err,
                    level="debug",
                )
Beispiel #4
0
 def setup(self) -> None:
     """Load the system config and store Marathon handles on this instance.

     Sets ``marathon_servers``, ``marathon_clients`` and ``max_failures``.
     """
     paasta_config = load_system_paasta_config()
     self.marathon_servers = marathon_tools.get_marathon_servers(paasta_config)
     self.marathon_clients = marathon_tools.get_marathon_clients(
         self.marathon_servers,
     )
     failure_limit = paasta_config.get_deployd_max_service_instance_failures()
     self.max_failures = failure_limit
Beispiel #5
0
 def marathon(self) -> marathon_tools.MarathonClients:
     """Return the Marathon clients, building them on first access.

     The clients are stored in ``self._marathon`` and reused afterwards.
     """
     if self._marathon is not None:
         return self._marathon

     paasta_config = load_system_paasta_config()
     servers = marathon_tools.get_marathon_servers(paasta_config)
     self._marathon = marathon_tools.get_marathon_clients(servers, cached=True)
     return self._marathon
Beispiel #6
0
 def setup(self) -> None:
     """Load the system config and store Marathon servers and clients on self."""
     paasta_config = load_system_paasta_config()
     self.marathon_servers = marathon_tools.get_marathon_servers(paasta_config)
     self.marathon_clients = marathon_tools.get_marathon_clients(
         self.marathon_servers,
     )
Beispiel #7
0
def main() -> None:
    """Deploy every marathon service instance named on the command line.

    Exits with status 1 if at least one service.instance failed to deploy
    (or had an invalid name), 0 otherwise. Steps, in order:

    - Load the marathon configuration
    - Connect to marathon
    - For each service.instance:
        - Parse the service instance name
        - Deploy/bounce the service via ``deploy_marathon_service``
    """
    args = parse_args()
    soa_dir = args.soa_dir
    log_level = logging.DEBUG if args.verbose else logging.WARNING
    logging.basicConfig(level=log_level)

    # Setting up transparent cache for http API calls
    requests_cache.install_cache("setup_marathon_jobs", backend="memory")

    paasta_config = load_system_paasta_config()
    servers = marathon_tools.get_marathon_servers(paasta_config)
    clients = marathon_tools.get_marathon_clients(servers)
    apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients.get_all_clients(),
        embed_tasks=True,
    )

    failed = 0
    for job_name in args.service_instance_list:
        try:
            service, instance, _, __ = decompose_job_id(job_name)
        except InvalidJobNameError:
            log.error(
                "Invalid service instance specified. Format is service%sinstance."
                % SPACER
            )
            failed += 1
            continue

        # The first element of the result is truthy when the deploy failed.
        deploy_result = deploy_marathon_service(
            service, instance, clients, soa_dir, apps_with_clients
        )
        if deploy_result[0]:
            failed += 1

    requests_cache.uninstall_cache()

    log.debug(
        "%d out of %d service.instances failed to deploy."
        % (failed, len(args.service_instance_list))
    )

    sys.exit(1 if failed else 0)
def get_mesos_tasks_and_slaves(
    system_paasta_config: SystemPaastaConfig,
) -> Tuple[Sequence[MarathonTask], List[Any]]:
    """Collect the tasks from every Marathon client plus the Mesos slaves.

    :param system_paasta_config: The system config used to locate Marathon
    :returns: A tuple of (all Marathon tasks, Mesos slaves)
    """
    servers = get_marathon_servers(system_paasta_config)
    marathon_clients = get_marathon_clients(servers)
    every_client: Sequence[MarathonClient] = marathon_clients.get_all_clients()
    tasks: List[MarathonTask] = [
        task
        for marathon_client in every_client
        for task in marathon_client.list_tasks()
    ]
    slaves = a_sync.block(get_slaves)
    return tasks, slaves
Beispiel #9
0
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: if the fraction of running apps that
        would be killed exceeds kill_threshold and force is not set
    """
    log.info("Loading marathon configuration")
    system_paasta_config = load_system_paasta_config()
    log.info("Connecting to marathon")
    clients = marathon_tools.get_marathon_clients(
        marathon_tools.get_marathon_servers(system_paasta_config))

    valid_services = get_services_for_cluster(instance_type='marathon',
                                              soa_dir=soa_dir)
    all_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients.get_all_clients())

    app_ids_with_clients = []
    for (app, client) in all_apps_with_clients:
        try:
            app_id = marathon_tools.deformat_job_id(app.id.lstrip('/'))
        except InvalidJobNameError:
            # Logger.warn is a deprecated alias; use warning() like the rest
            # of the code base.
            log.warning(
                "%s doesn't conform to paasta naming conventions? Skipping." %
                app.id)
            continue
        app_ids_with_clients.append((app_id, client))

    # Anything running whose (service, instance) is no longer configured for
    # this cluster is a candidate for deletion.
    apps_to_kill = [((service, instance, git_sha, config_sha), client)
                    for (service, instance, git_sha,
                         config_sha), client in app_ids_with_clients
                    if (service, instance) not in valid_services]

    log.debug("Running apps: %s" % app_ids_with_clients)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    # Only compute the ratio when something is running, which also avoids a
    # division by zero.
    if app_ids_with_clients:
        above_kill_threshold = float(len(apps_to_kill)) / float(
            len(app_ids_with_clients)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold, )
            raise DontKillEverythingError
    for id_tuple, client in apps_to_kill:
        app_id = marathon_tools.format_job_id(*id_tuple)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
Beispiel #10
0
def paasta_sysdig(args):
    """Print, or remotely run, the sysdig command for a service instance.

    Without ``args.local`` the same paasta command is re-run with ``--local``
    on a Mesos master over ssh; its output is split on the first ``:`` into a
    slave host and a command, which is then run on that slave over ssh.

    With ``args.local`` the instance status is looked up, a slave is picked,
    and the formatted command (including a Marathon URL with embedded
    credentials) is printed.

    :param args: Parsed command line arguments (cluster, service, instance,
        host, local)
    """
    system_paasta_config = load_system_paasta_config()

    if not args.local:
        master_host = get_any_mesos_master(
            cluster=args.cluster,
            system_paasta_config=system_paasta_config,
        )
        forwarded_args = ' '.join(sys.argv[1:])
        remote_cmd = (
            'ssh -At -o StrictHostKeyChecking=no -o LogLevel=QUIET {0} '
            '"sudo paasta {1} --local"'
        ).format(master_host, forwarded_args)
        return_code, output = _run(remote_cmd)
        if return_code != 0:
            paasta_print(output)
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        slave_cmd = "ssh -tA {} '{}'".format(slave, command.strip())
        subprocess.call(shlex.split(slave_cmd))
        return

    status = get_status_for_instance(
        cluster=args.cluster,
        service=args.service,
        instance=args.instance,
    )
    slave = pick_slave_from_status(status=status, host=args.host)

    job_config = load_marathon_service_config(
        service=args.service,
        instance=args.instance,
        cluster=args.cluster,
    )

    marathon_clients = get_marathon_clients(
        get_marathon_servers(system_paasta_config),
    )

    # Unfortunately, sysdig seems to only be able to take one marathon URL, so
    # hopefully the service in question is not currently moving between shards.
    client = marathon_clients.get_current_client_for_service(
        job_config=job_config,
    )
    marathon_url = client.servers[0]
    marathon_user, marathon_pass = client.auth

    mesos_url = get_mesos_master().host
    parsed_url = urlparse(marathon_url)
    netloc_with_creds = "{}:{}@{}".format(
        marathon_user,
        marathon_pass,
        parsed_url.netloc,
    )
    creds_url = parsed_url._replace(netloc=netloc_with_creds)
    sysdig_command = format_mesos_command(
        slave,
        status.marathon.app_id,
        mesos_url,
        creds_url.geturl(),
    )
    paasta_print(sysdig_command)
Beispiel #11
0
def setup_paasta_api():
    """Populate the global ``settings`` object used by the PaaSTA API.

    Configures logging (DEBUG when ``PAASTA_API_DEBUG`` is set, WARNING
    otherwise), disables the YAML file cache, loads the system PaaSTA config
    and stores the cluster, Marathon servers/clients and Kubernetes client on
    ``settings``. The cluster name comes from the ``PAASTA_API_CLUSTER``
    environment variable when it is set to a non-empty value, otherwise from
    the system config. Finally installs an in-memory HTTP cache.
    """
    if os.environ.get("PAASTA_API_DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    # pyinotify is a better solution than turning off file caching completely
    service_configuration_lib.disable_yaml_cache()

    settings.system_paasta_config = load_system_paasta_config()
    if os.environ.get("PAASTA_API_CLUSTER"):
        settings.cluster = os.environ.get("PAASTA_API_CLUSTER")
    else:
        settings.cluster = settings.system_paasta_config.get_cluster()

    # Resolve the servers once and build the clients from them. The clients
    # used to be built twice, with the first result immediately overwritten.
    settings.marathon_servers = marathon_tools.get_marathon_servers(
        system_paasta_config=settings.system_paasta_config)
    settings.marathon_clients = marathon_tools.get_marathon_clients(
        marathon_servers=settings.marathon_servers,
        cached=False,
    )

    # The Kubernetes client is optional: any failure to build it leaves
    # settings.kubernetes_client as None rather than aborting start-up.
    try:
        settings.kubernetes_client = kubernetes_tools.KubeClient()
    except FileNotFoundError:
        log.info('Kubernetes not found')
        settings.kubernetes_client = None
    except Exception:
        log.exception('Error while initializing KubeClient')
        settings.kubernetes_client = None

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api",
                                 backend="memory",
                                 expire_after=5)
Beispiel #12
0
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: MarathonClients = None,
    system_paasta_config: SystemPaastaConfig = None,
) -> Marathon_Dashboard:
    """Build a mapping of cluster -> list of service/instance/shard_url items.

    :param cluster: The cluster to build the dashboard for
    :param soa_dir: The SOA config directory to read from
    :param marathon_clients: Marathon clients to use; built from the system
        config when None
    :param system_paasta_config: System config to use; loaded when None
    :returns: ``{cluster: [{'service', 'instance', 'shard_url'}, ...]}``
    """
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(
        system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(
            marathon_servers=marathon_servers, cached=False)
    if not instances:
        # Nothing to report; as before, dashboard links are not consulted.
        return dashboard

    # The dashboard links do not depend on the service, so look them up once.
    # A cluster without an entry simply has no 'Marathon RO' links.
    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')

    # Map each shard's first URL to its dashboard link. zip() stops at the
    # shorter sequence, so a links list shorter than the shard list no longer
    # raises IndexError; setdefault keeps the first shard on duplicate URLs.
    link_by_shard_url: Dict[str, str] = {}
    static_link = None
    if isinstance(marathon_links, list):
        for shard, link in zip(marathon_servers.current, marathon_links):
            link_by_shard_url.setdefault(shard.url[0], link)
    elif isinstance(marathon_links, str):
        # A single string means every instance shares the same link.
        static_link = marathon_links.split(' ')[0]

    for service_instance in instances:
        service: str = service_instance[0]
        instance: str = service_instance[1]
        service_config: MarathonServiceConfig = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=cluster,
            load_deployments=False,
            soa_dir=soa_dir,
        )
        client: MarathonClient = marathon_clients.get_current_client_for_service(
            job_config=service_config)
        shard_url: str = client.servers[0]
        if static_link is not None:
            shard_url = static_link
        else:
            # Fall back to the raw shard URL when no link is configured.
            shard_url = link_by_shard_url.get(shard_url, shard_url)
        service_info: Marathon_Dashboard_Item = {
            'service': service,
            'instance': instance,
            'shard_url': shard_url,
        }
        dashboard[cluster].append(service_info)
    return dashboard
def test_list_instances():
    """marathon_dashboard returns an empty list for the configured cluster.

    Uses a fake three-shard Marathon configuration whose dashboard links are
    registered under a different cluster name than ``settings.cluster``.
    """
    settings.cluster = 'fake_cluster'
    shard_urls = [
        "http://marathon:8080",
        "http://marathon1:8080",
        "http://marathon2:8080",
    ]
    config_dict = {
        "marathon_servers": [
            {
                "user": "******",
                "password": "******",
                "url": [shard_url],
            }
            for shard_url in shard_urls
        ],
        "dashboard_links": {
            "testcluster": {
                "Marathon RO": [
                    "http://accessible-marathon",
                    "http://accessible-marathon1",
                    "http://accessible-marathon2",
                ],
            },
        },
    }
    paasta_config = SystemPaastaConfig(config=config_dict, directory='unused')
    servers = marathon_tools.get_marathon_servers(paasta_config)
    settings.marathon_clients = marathon_tools.get_marathon_clients(
        marathon_servers=servers,
        cached=False,
    )
    request = testing.DummyRequest()

    settings.system_paasta_config = paasta_config
    response = marathon_dashboard(request)
    assert response == {settings.cluster: []}
Beispiel #14
0
def main():
    """Print marathon service instances, one job id per line, then exit 0.

    With ``--minimal`` only the instances reported by
    ``get_service_instances_that_need_bouncing`` are printed; otherwise every
    marathon instance configured for the cluster is printed.
    """
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    if not args.minimal:
        configured = get_services_for_cluster(
            cluster=cluster,
            instance_type="marathon",
            soa_dir=soa_dir,
        )
        service_instances = [
            compose_job_id(name, instance) for name, instance in configured
        ]
    else:
        paasta_config = load_system_paasta_config()
        clients = get_marathon_clients(get_marathon_servers(paasta_config))
        service_instances = get_service_instances_that_need_bouncing(
            marathon_clients=clients,
            soa_dir=soa_dir,
        )
    print("\n".join(service_instances))
    sys.exit(0)
Beispiel #15
0
def check_mesos_no_duplicate_frameworks():
    """Check the Mesos framework count against the known Marathon frameworks.

    Prints an ``OK:`` line and exits 0 when ``assert_framework_count`` reports
    a healthy result; prints a ``CRITICAL:`` line and exits 2 when it does
    not, or when the Mesos master is unavailable.
    """
    master = get_mesos_master()
    try:
        state = master.state
    except MasterNotAvailableException as e:
        # Python 3 exceptions have no ``message`` attribute; formatting the
        # exception itself keeps this path from failing with AttributeError.
        paasta_print("CRITICAL: %s" % e)
        sys.exit(2)

    system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = get_marathon_clients(marathon_servers)
    marathon_framework_ids = get_marathon_framework_ids(marathon_clients)
    result = assert_framework_count(
        state=state,
        marathon_framework_ids=marathon_framework_ids,
    )
    if result.healthy:
        paasta_print("OK: " + result.message)
        sys.exit(0)
    else:
        paasta_print("CRITICAL: %s" % result.message)
        sys.exit(2)
Beispiel #16
0
def main():
    """Check replication for every deployed marathon instance in this cluster.

    Instances without a docker image are treated as not deployed and are
    skipped with a debug log line.
    """
    args = parse_args()

    log_level = logging.DEBUG if args.verbose else logging.WARNING
    logging.basicConfig(level=log_level)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    servers = marathon_tools.get_marathon_servers(system_paasta_config)
    clients = marathon_tools.get_marathon_clients(servers)
    all_tasks = [
        task
        for marathon_client in clients.get_all_clients()
        for task in marathon_client.list_tasks()
    ]
    mesos_slaves = a_sync.block(get_slaves)
    replication_checker = MesosSmartstackReplicationChecker(
        mesos_slaves,
        system_paasta_config,
    )

    for service in list_services(soa_dir=args.soa_dir):
        config_loader = PaastaServiceConfigLoader(
            service=service,
            soa_dir=args.soa_dir,
        )
        marathon_configs = config_loader.instance_configs(
            cluster=cluster,
            instance_type_class=marathon_tools.MarathonServiceConfig,
        )
        for instance_config in marathon_configs:
            if not instance_config.get_docker_image():
                log.debug(
                    '%s is not deployed. Skipping replication monitoring.' %
                    instance_config.job_id,
                )
                continue
            check_service_replication(
                instance_config=instance_config,
                all_tasks=all_tasks,
                smartstack_replication_checker=replication_checker,
            )
Beispiel #17
0
def setup_marathon_clients():
    """Return ``(clients, marathon_servers, system_paasta_config)``.

    The system config comes from ``setup_system_paasta_config``; the servers
    and clients are derived from it.
    """
    paasta_config = setup_system_paasta_config()
    servers = marathon_tools.get_marathon_servers(paasta_config)
    return (
        marathon_tools.get_marathon_clients(servers),
        servers,
        paasta_config,
    )
Beispiel #18
0
def create_marathon_dashboard(
        cluster: str,
        soa_dir: str=DEFAULT_SOA_DIR,
        marathon_clients: MarathonClients=None,
        system_paasta_config: SystemPaastaConfig=None,
) -> Marathon_Dashboard:
    """Build a mapping of cluster -> list of service/instance/shard_url items.

    :param cluster: The cluster to build the dashboard for
    :param soa_dir: The SOA config directory to read from
    :param marathon_clients: Marathon clients to use; built from the system
        config when None
    :param system_paasta_config: System config to use; loaded when None
    :returns: ``{cluster: [{'service', 'instance', 'shard_url'}, ...]}``
    """
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(marathon_servers=marathon_servers, cached=False)

    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')

    # e.g. 'http://10.64.97.75:5052': 'http://marathon-norcal-prod.yelpcorp.com'
    shard_url_to_marathon_link_dict: Dict[str, str] = {}
    if isinstance(marathon_links, list):
        # Sanity check and log error if necessary
        if len(marathon_links) != len(marathon_servers.current):
            log.error('len(marathon_links) != len(marathon_servers.current). This may be a cause of concern')
        # zip() stops at the shorter sequence, so a mismatch that was just
        # logged cannot also raise IndexError; unmatched shards fall back to
        # their raw URL below.
        for shard, marathon_link in zip(marathon_servers.current, marathon_links):
            shard_url_to_marathon_link_dict[shard.url[0]] = marathon_link
    elif isinstance(marathon_links, str):
        # In this case, the shard url will be the same for every service instance
        static_shard_url = marathon_links.split(' ')[0]
        return {cluster: [{'service': si[0], 'instance': si[1], 'shard_url': static_shard_url} for si in instances]}

    # Setup with service as key since will instantiate 1 PSCL per service
    service_instances_dict: Dict[str, Set[str]] = defaultdict(set)
    for si in instances:
        service_instances_dict[si[0]].add(si[1])

    for service, instance_set in service_instances_dict.items():
        pscl = PaastaServiceConfigLoader(
            service=service,
            soa_dir=soa_dir,
            load_deployments=False,
        )
        for marathon_service_config in pscl.instance_configs(cluster, MarathonServiceConfig):
            # Take the instance name from the config being processed. The
            # previous code reused a variable left over from the grouping
            # loop, so every item reported the last instance seen.
            instance = marathon_service_config.get_instance()
            if instance not in instance_set:
                continue
            client: MarathonClient = \
                marathon_clients.get_current_client_for_service(job_config=marathon_service_config)
            ip_url: str = client.servers[0]
            # Convert to a marathon link if possible else default to the original IP address
            shard_url: str = shard_url_to_marathon_link_dict.get(ip_url, ip_url)
            service_info: Marathon_Dashboard_Item = {
                'service': service,
                'instance': instance,
                'shard_url': shard_url,
            }
            dashboard[cluster].append(service_info)
    return dashboard
Beispiel #19
0
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    """Run the Mesos, Marathon and Kubernetes health checks and print results.

    Checks for a backend that is not available are replaced by a single
    healthy placeholder result. With ``args.verbose > 1`` resource
    utilisation tables are printed as well, and with ``args.verbose >= 3``
    per-slave / per-node tables.

    :param argv: Command line arguments, passed to ``parse_args``
    :raises FatalError: with code 2 when the Mesos master cannot be reached,
        or when the Mesos/Marathon checks or any printed grouped utilisation
        table report an unhealthy state
    """
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()

    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs["use_mesos_cache"] = True

        master = get_mesos_master(**master_kwargs)

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(
            get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(mesos_master=master,
                                                  mesos_state=mesos_state)
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            paasta_print(PaastaColors.red("CRITICAL:  %s" % "\n".join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message="Marathon is not configured to run here", healthy=True)
        ]
        all_mesos_results = [
            metastatus_lib.HealthCheckResult(
                message="Mesos is not configured to run here", healthy=True)
        ]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [
            metastatus_lib.HealthCheckResult(
                message="Kubernetes is not configured to run here",
                healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    kube_summary = metastatus_lib.generate_summary_for_check(
        "Kubernetes", kube_ok)

    # NOTE(review): kube_ok is not part of the exit status here, as in the
    # original code -- confirm whether that is intentional.
    healthy_exit = all([mesos_ok, marathon_ok])

    paasta_print(f"Master paasta_tools version: {__version__}")
    paasta_print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)
    if args.verbose > 1 and mesos_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, mesos_utilization_ok = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state)
        # Combine with the earlier result instead of overwriting it, so a
        # healthy utilisation table cannot mask failed health checks.
        healthy_exit = bool(healthy_exit and mesos_utilization_ok)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [[
                str(x) for x in asi
            ] for asi in get_autoscaling_info_for_all_resources(mesos_state)]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent("Per Slave Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(kube_summary, kube_ok,
                                                  kube_results, args.verbose)
    if args.verbose > 1 and kube_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, kube_utilization_ok = utilization_table_by_grouping_from_kube(
            groupings=args.groupings,
            threshold=args.threshold,
            kube_client=kube_client)
        # Same as for Mesos: accumulate rather than overwrite.
        healthy_exit = bool(healthy_exit and kube_utilization_ok)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("No autoscaling resources for Kubernetes", 2)

        if args.verbose >= 3:
            print_with_indent("Per Node Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    if not healthy_exit:
        raise FatalError(2)
Beispiel #20
0
def get_marathon_clients_from_config() -> MarathonClients:
    """Build the Marathon clients described by the system PaaSTA config."""
    paasta_config = load_system_paasta_config()
    return get_marathon_clients(get_marathon_servers(paasta_config))
Beispiel #21
0
def main(argv: Optional[List[str]] = None) -> None:
    """Run the Mesos, Marathon and Chronos health checks and exit with the verdict.

    Prints a summary per subsystem; with ``args.verbose > 1`` it also prints
    resource-utilization tables (and, at ``>= 3``, a per-slave breakdown).
    The function never returns normally: it always ends in ``sys.exit``.

    Exit status is 0 only when every healthcheck passed and, if the grouped
    utilization table was computed, that table was reported healthy as well;
    otherwise it is 2.

    :param argv: command-line arguments, handed to ``parse_args``
    """
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(
        get_marathon_clients(marathon_servers))

    try:
        mesos_state = a_sync.block(master.state)
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL:  %s" % '\n'.join(e.args)))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except chronos.ChronosAPIError as e:
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message='Chronos is not configured to run here',
                healthy=True,
            )
        ]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check(
        "Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print(f"Master paasta_tools version: {__version__}")
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)
    if args.verbose > 1:
        print_with_indent(
            'Resources Grouped by %s' % ", ".join(args.groupings), 2)
        all_rows, utilization_healthy = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state,
        )
        # Fold the utilization verdict into the overall result rather than
        # replacing it; otherwise a failed Mesos/Marathon/Chronos check would
        # be forgotten whenever verbose output is requested.
        healthy_exit = healthy_exit and utilization_healthy
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [[
                str(x) for x in asi
            ] for asi in get_autoscaling_info_for_all_resources(mesos_state)]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            per_slave_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in per_slave_rows:
                row.pop()

            for line in format_table(per_slave_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok,
                                                  chronos_results,
                                                  args.verbose)

    sys.exit(0 if healthy_exit else 2)
Beispiel #22
0
def perform_command(command,
                    service,
                    instance,
                    cluster,
                    verbose,
                    soa_dir,
                    app_id=None,
                    delta=None,
                    clients=None):
    """Run a ``restart`` or ``status`` command against one Marathon instance.

    :param command: 'restart' or 'status'; any other value raises
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level; values above 0 print extra detail
    :param soa_dir: forwarded as ``soa_dir`` to the configuration loaders
    :param app_id: Marathon app id; taken from the job config when falsy
    :param delta: not read anywhere in this function
    :param clients: Marathon clients object; created from the system config
        when None
    :returns: A unix-style return code: 1 when the Docker image is missing
        from deployments, 0 otherwise
    :raises NotImplementedError: when ``command`` is neither 'restart' nor
        'status'
    """
    system_config = load_system_paasta_config()
    job_config = marathon_tools.load_marathon_service_config(
        service, instance, cluster, soa_dir=soa_dir)

    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            paasta_print(
                "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?"
                % compose_job_id(service, instance))
            return 1

    expected_instances = job_config.get_instances()
    proxy_port = marathon_tools.get_proxy_port_for_instance(
        service, instance, cluster, soa_dir=soa_dir)

    if clients is None:
        # NOTE(review): other call sites in this file pass the config to a
        # module-level get_marathon_servers(); confirm the method form used
        # here yields the type get_marathon_clients expects.
        clients = marathon_tools.get_marathon_clients(
            system_config.get_marathon_servers())
    marathon_client = clients.get_current_client_for_service(job_config)

    if command == 'restart':
        restart_marathon_job(service, instance, app_id, marathon_client,
                             cluster)
        return 0

    if command != 'status':
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)

    # Everything below is the 'status' report, printed section by section.
    paasta_print(
        status_desired_state(service, instance, marathon_client, job_config))
    paasta_print(
        status_marathon_job(service, instance, app_id, expected_instances,
                            marathon_client))
    dashboards = get_marathon_dashboard_links(clients, system_config)
    tasks, verbose_output = status_marathon_job_verbose(
        service, instance, clients, cluster, soa_dir, job_config, dashboards)
    be_verbose = verbose > 0
    if be_verbose:
        paasta_print(verbose_output)

    paasta_print(status_mesos_tasks(service, instance, expected_instances))
    if be_verbose:
        tail_lines = calculate_tail_lines(verbose_level=verbose)
        paasta_print(
            status_mesos_tasks_verbose(
                job_id=app_id,
                get_short_task_id=get_short_task_id,
                tail_lines=tail_lines,
            ))

    # The smartstack section is printed only when a proxy port was found.
    if proxy_port is None:
        return 0

    expected_backends = marathon_tools.get_expected_instance_count_for_namespace(
        service,
        instance,
        cluster,
    )
    paasta_print(
        status_smartstack_backends(
            service=service,
            instance=instance,
            cluster=cluster,
            job_config=job_config,
            tasks=tasks,
            expected_count=expected_backends,
            soa_dir=soa_dir,
            verbose=be_verbose,
            synapse_port=system_config.get_synapse_port(),
            synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
            system_deploy_blacklist=system_config.get_deploy_blacklist(),
            system_deploy_whitelist=system_config.get_deploy_whitelist(),
        ))
    return 0
Beispiel #23
0
def main(argv=None):
    """Run the Mesos, Marathon and Chronos health checks and exit with the verdict.

    Prints a summary per subsystem; with ``args.verbose > 1`` it also prints
    one resource-utilization table per requested grouping (and, at ``>= 3``,
    a per-slave table). The function never returns normally: it always ends
    in ``sys.exit``.

    Exit status is 0 only when every healthcheck passed and no row of the
    grouped utilization tables was reported unhealthy; otherwise it is 2.
    Per-slave rows are informational and never affect the exit status.

    :param argv: command-line arguments, handed to ``parse_args``
    """
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

    try:
        mesos_state = master.state
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
            marathon_clients=marathon_clients,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        # Format the exception itself: BaseException has no ``message``
        # attribute on Python 3, so ``e.message`` would raise AttributeError
        # (assuming the exception class does not define one itself).
        paasta_print(PaastaColors.red("CRITICAL:  %s" % e))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except chronos.ChronosAPIError as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_by_value = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)',
                'GPU (used/total)', 'Agent count',
            ]]
            table_rows = []
            for attribute_value, resource_info in resource_info_by_value.items():
                healthcheck_utilization_pairs = _healthcheck_utilization_pairs(
                    resource_info,
                    args.threshold,
                )
                # Only ever flip the verdict to unhealthy. Assigning the
                # per-row result directly would let the last row (and the
                # last grouping) overwrite every earlier failure, including
                # the Mesos/Marathon/Chronos checks above.
                if not all(pair[0].healthy for pair in healthcheck_utilization_pairs):
                    healthy_exit = False
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ) + [str(resource_info['slave_count'])])
            all_rows.extend(sorted(table_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            # Header row followed by one row per autoscaling resource.
            table = [headers] + list(get_autoscaling_info_for_all_resources(mesos_state))

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'],
                mesos_state,
            )
            # NOTE(review): the doubled slashes in the RAM/Disk headers look
            # like typos; left unchanged because they are user-visible output.
            all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used//total)', 'Disk (used//total)', 'GPU (used/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            slave_rows = []
            for hostname, resource_info in slave_resource_dict.items():
                slave_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    hostname,
                    _healthcheck_utilization_pairs(resource_info, args.threshold),
                    args.humanize,
                ))
            # Sort once over all rows; sorting inside the loop only ever saw
            # a single row and therefore never ordered anything.
            all_rows.extend(sorted(slave_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    sys.exit(0 if healthy_exit else 2)


def _healthcheck_utilization_pairs(resource_info_dict, threshold):
    """Return one (healthcheck result, utilization) pair per utilization entry.

    :param resource_info_dict: mapping read for its 'total' and 'free' entries
    :param threshold: threshold forwarded to the per-utilization healthcheck
    """
    resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
        total=resource_info_dict['total'],
        free=resource_info_dict['free'],
    )
    return [
        metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
            utilization,
            threshold,
        )
        for utilization in resource_utilizations
    ]