Example #1
0
def check_mesos_no_duplicate_frameworks():
    master = get_mesos_master()
    try:
        state = master.state
    except MasterNotAvailableException as e:
        paasta_print("CRITICAL: %s" % e.message)
        sys.exit(2)

    marathon_clients = marathon_tools.get_list_of_marathon_clients()
    try:
        framework_ids = get_marathon_framework_ids(marathon_clients)
    except (MarathonError, ValueError) as e:
        paasta_print(
            "CRITICAL: Unable to contact Marathon cluster: {}".format(e))
        sys.exit(2)

    result = assert_framework_count(
        state=state,
        marathon_framework_ids=framework_ids,
    )
    if result.healthy:
        paasta_print("OK: " + result.message)
        sys.exit(0)
    else:
        paasta_print("CRITICAL: %s" % result.message)
        sys.exit(2)
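
Example #1 follows the usual Nagios-style check convention: print a status line and exit 0 for OK or 2 for CRITICAL. A minimal standalone sketch of that pattern, using an illustrative HealthCheckResult stand-in rather than the real paasta_tools type:

import sys
from typing import NamedTuple


class HealthCheckResult(NamedTuple):
    # Illustrative stand-in for the (message, healthy) result used above.
    message: str
    healthy: bool


def report_and_exit(result: HealthCheckResult) -> None:
    """Print a Nagios-style status line and exit 0 (OK) or 2 (CRITICAL)."""
    if result.healthy:
        print("OK: " + result.message)
        sys.exit(0)
    print("CRITICAL: " + result.message)
    sys.exit(2)


report_and_exit(HealthCheckResult(message="marathon framework count looks sane", healthy=True))
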
Example #2
0
def main(hostnames):
    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        paasta_print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)
    slaves = [slave for slave in mesos_state.get('slaves', []) if slave['hostname'] in hostnames]
    tasks = get_all_tasks_from_state(mesos_state, include_orphans=True)
    filtered_tasks = filter_tasks_for_slaves(slaves, tasks)
    resource_info_dict = calculate_resource_utilization_for_slaves(slaves, filtered_tasks)
    resource_utilizations = resource_utillizations_from_resource_info(
        total=resource_info_dict['total'],
        free=resource_info_dict['free'],
    )
    output = {}
    for metric in resource_utilizations:
        utilization = metric.total - metric.free
        if int(metric.total) == 0:
            utilization_perc = 100
        else:
            utilization_perc = utilization / float(metric.total) * 100
        output[metric.metric] = {
            'total': metric.total,
            'used': utilization,
            'perc': utilization_perc,
        }
    print(json.dumps(output))
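
The per-metric arithmetic in Example #2 is: used = total - free, with the percentage guarded against a zero total. A self-contained sketch of the same calculation on plain numbers (the Utilization tuple here is illustrative, not the real ResourceUtilization type):

from typing import NamedTuple


class Utilization(NamedTuple):
    metric: str
    total: float
    free: float


def summarize(resource_utilizations):
    output = {}
    for metric in resource_utilizations:
        used = metric.total - metric.free
        # Report 100% rather than dividing by zero when there is no capacity at all.
        perc = 100 if int(metric.total) == 0 else used / float(metric.total) * 100
        output[metric.metric] = {"total": metric.total, "used": used, "perc": perc}
    return output


print(summarize([Utilization("cpus", 32, 8), Utilization("gpus", 0, 0)]))
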
Example #3
0
    def scale_resource(self, current_capacity, target_capacity):
        """Scales an AWS resource based on current and target capacity
        If scaling up we just set target capacity and let AWS take care of the rest
        If scaling down we pick the slaves we'd prefer to kill, put them in maintenance
        mode and drain them (via paasta_maintenance and setup_marathon_jobs). We then kill
        them once they are running 0 tasks or once a timeout is reached

        :param current_capacity: integer current resource capacity
        :param target_capacity: target resource capacity
        """
        target_capacity = int(target_capacity)
        delta = target_capacity - current_capacity
        if delta == 0:
            self.log.info("Already at target capacity: {}".format(target_capacity))
            return
        elif delta > 0:
            self.log.info("Increasing resource capacity to: {}".format(target_capacity))
            self.set_capacity(target_capacity)
            return
        elif delta < 0:
            mesos_state = get_mesos_master().state_summary()
            slaves_list = get_mesos_task_count_by_slave(mesos_state, pool=self.resource['pool'])
            filtered_slaves = self.filter_aws_slaves(slaves_list)
            killable_capacity = sum([slave.instance_weight for slave in filtered_slaves])
            amount_to_decrease = delta * -1
            if amount_to_decrease > killable_capacity:
                self.log.error(
                    "Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!"
                )
                return
            self.downscale_aws_resource(
                filtered_slaves=filtered_slaves,
                current_capacity=current_capacity,
                target_capacity=target_capacity)
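
The whole decision in scale_resource hinges on the sign of delta = target - current: zero is a no-op, positive just raises capacity, and negative only proceeds if the killable slaves can cover the decrease. A standalone sketch of that branch with illustrative names and no AWS or Mesos calls:

def plan_scaling_action(current_capacity, target_capacity, killable_capacity):
    """Return the action the autoscaler above would take for these inputs."""
    delta = int(target_capacity) - current_capacity
    if delta == 0:
        return "noop: already at target capacity"
    if delta > 0:
        return "scale up: set capacity to {}".format(target_capacity)
    amount_to_decrease = delta * -1
    if amount_to_decrease > killable_capacity:
        # Mirrors the "didn't find enough candidates to kill" bail-out above.
        return "abort: not enough killable capacity"
    return "scale down: drain and terminate {} units of capacity".format(amount_to_decrease)


print(plan_scaling_action(current_capacity=10, target_capacity=7, killable_capacity=5))
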
Example #4
0
def paasta_sysdig(args):
    if not args.local:
        mesos_master = get_any_mesos_master(cluster=args.cluster)
        ssh_cmd = 'ssh -At -o LogLevel=QUIET {0} "sudo paasta {1} --local"'.format(mesos_master, ' '.join(sys.argv[1:]))
        return_code, output = _run(ssh_cmd)
        if return_code != 0:
            print output
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        subprocess.call(shlex.split("ssh -tA {0} '{1}'".format(slave, command.strip())))
        return
    status = get_status_for_instance(cluster=args.cluster,
                                     service=args.service,
                                     instance=args.instance)
    slave = pick_slave_from_status(status=status,
                                   host=args.host)
    marathon_config = load_marathon_config()
    marathon_url = marathon_config.get_url()[0]
    marathon_user = marathon_config.get_username()
    marathon_pass = marathon_config.get_password()
    mesos_url = get_mesos_master().host
    marathon_parsed_url = urlparse(marathon_url)
    marathon_creds_url = marathon_parsed_url._replace(netloc="{0}:{1}@{2}".format(marathon_user, marathon_pass,
                                                                                  marathon_parsed_url.netloc))
    print format_mesos_command(slave, status.marathon.app_id, mesos_url, marathon_creds_url.geturl())
Example #5
0
 def metrics_provider(self):
     if not self.asg:
         self.log.warning("ASG {} not found, removing config file".format(
             self.resource['id']))
         self.cleanup_cancelled_config(self.resource['id'],
                                       self.config_folder,
                                       dry_run=self.dry_run)
         return 0, 0
     if self.is_aws_launching_instances():
         self.log.warning(
             "ASG still launching new instances so we won't make any"
             "changes this time.")
         return 0, 0
     expected_instances = len(self.instances)
     if expected_instances == 0:
         self.log.warning(
             "This ASG has no instances, delta should be 1 to "
             "launch first instance unless max/min capacity override")
         return self.get_asg_delta(1)
     mesos_state = get_mesos_master().state
     slaves = self.get_aws_slaves(mesos_state)
     error = self.get_mesos_utilization_error(
         slaves=slaves,
         mesos_state=mesos_state,
         expected_instances=expected_instances)
     return self.get_asg_delta(error)
Example #6
0
def unreserve_all_resources(hostnames):
    """Dynamically unreserve all available resources on the specified hosts
    :param hostnames: list of hostnames to unreserve resources on
    """
    mesos_state = get_mesos_master().state_summary()
    components = hostnames_to_components(hostnames)
    hosts = components_to_hosts(components)
    known_slaves = [
        slave for slave in mesos_state['slaves'] if slave['hostname'] in hosts
    ]
    for slave in known_slaves:
        hostname = slave['hostname']
        log.info("Unreserving all resources on %s" % hostname)
        slave_id = slave['id']
        resources = []
        for role in slave['reserved_resources']:
            for resource in ['disk', 'mem', 'cpus']:
                reserved_resource = slave['reserved_resources'][role][resource]
                resources.append(
                    Resource(name=resource, amount=reserved_resource))
        try:
            unreserve(slave_id=slave_id, resources=resources)
        except HTTPError:
            raise HTTPError(
                "Failed unreserving all of the resources on %s (%s). Aborting."
                % (hostname, slave_id))
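
Unreserving in Example #6 flattens every role's reservation on a slave into a list of Resource objects to hand back to Mesos. A small standalone sketch of that step, with a plain namedtuple standing in for the real Resource type and a made-up slave dict:

from collections import namedtuple

Resource = namedtuple("Resource", ["name", "amount"])  # illustrative stand-in


def resources_to_unreserve(slave):
    resources = []
    for role in slave["reserved_resources"]:
        for resource in ["disk", "mem", "cpus"]:
            amount = slave["reserved_resources"][role][resource]
            resources.append(Resource(name=resource, amount=amount))
    return resources


slave = {"reserved_resources": {"maintenance": {"disk": 50, "mem": 128, "cpus": 2}}}
print(resources_to_unreserve(slave))
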
Example #7
0
def downscale_spot_fleet_request(resource, filtered_slaves, current_capacity,
                                 target_capacity, pool_settings, dry_run):
    while True:
        filtered_sorted_slaves = sort_slaves_to_kill(filtered_slaves)
        if len(filtered_sorted_slaves) == 0:
            break
        log.info("SFR slave kill preference: {0}".format(
            [slave['hostname'] for slave in filtered_sorted_slaves]))
        filtered_sorted_slaves.reverse()
        slave_to_kill = filtered_sorted_slaves.pop()
        instance_capacity = slave_to_kill['instance_weight']
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            log.info(
                "Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                " close to our target as we can get".format(
                    slave_to_kill['instance_id'],
                    slave_to_kill['instance_weight'], target_capacity))
            break
        try:
            gracefully_terminate_slave(resource=resource,
                                       slave_to_kill=slave_to_kill,
                                       pool_settings=pool_settings,
                                       current_capacity=current_capacity,
                                       new_capacity=new_capacity,
                                       dry_run=dry_run)
        except HTTPError:
            # Something wrong draining host so try next host
            continue
        except FailSetSpotCapacity:
            break
        current_capacity = new_capacity
        mesos_state = get_mesos_master().state_summary()
        filtered_slaves = get_mesos_task_count_by_slave(
            mesos_state, slaves_list=filtered_sorted_slaves)
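
The loop in Example #7 keeps terminating the least-preferred slave until removing the next one would drop capacity below the target. A plain-Python sketch of that stopping rule, using dicts shaped like the SFR slaves above with made-up weights:

def pick_slaves_to_kill(slaves, current_capacity, target_capacity):
    """Select slaves to terminate without ever going below target_capacity.

    ``slaves`` is assumed to already be ordered from most to least preferred to kill.
    """
    to_kill = []
    for slave in slaves:
        new_capacity = current_capacity - slave["instance_weight"]
        if new_capacity < target_capacity:
            # Killing this slave would overshoot the target, so stop here.
            break
        to_kill.append(slave["hostname"])
        current_capacity = new_capacity
    return to_kill, current_capacity


slaves = [
    {"hostname": "host1", "instance_weight": 2},
    {"hostname": "host2", "instance_weight": 2},
    {"hostname": "host3", "instance_weight": 2},
]
print(pick_slaves_to_kill(slaves, current_capacity=10, target_capacity=7))  # (['host1'], 8)
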
Example #8
0
def paasta_sysdig(args):
    if not args.local:
        mesos_master = get_any_mesos_master(cluster=args.cluster)
        ssh_cmd = 'ssh -At -o LogLevel=QUIET {0} "sudo paasta {1} --local"'.format(
            mesos_master, ' '.join(sys.argv[1:]))
        return_code, output = _run(ssh_cmd)
        if return_code != 0:
            print output
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        subprocess.call(
            shlex.split("ssh -tA {0} '{1}'".format(slave, command.strip())))
        return
    status = get_status_for_instance(cluster=args.cluster,
                                     service=args.service,
                                     instance=args.instance)
    slave = pick_slave_from_status(status=status, host=args.host)
    marathon_config = load_marathon_config()
    marathon_url = marathon_config.get_url()[0]
    marathon_user = marathon_config.get_username()
    marathon_pass = marathon_config.get_password()
    mesos_url = get_mesos_master().host
    marathon_parsed_url = urlparse(marathon_url)
    marathon_creds_url = marathon_parsed_url._replace(
        netloc="{0}:{1}@{2}".format(marathon_user, marathon_pass,
                                    marathon_parsed_url.netloc))
    print format_mesos_command(slave, status.marathon.app_id, mesos_url,
                               marathon_creds_url.geturl())
Example #9
0
def check_registration(threshold_percentage):
    mesos_state = get_mesos_master().state
    autoscaling_resources = load_system_paasta_config(
    ).get_cluster_autoscaling_resources()
    for resource in autoscaling_resources.values():
        print("Checking %s" % resource['id'])
        try:
            scaler = get_scaler(resource['type'])(resource=resource,
                                                  pool_settings=None,
                                                  config_folder=None,
                                                  dry_run=True)
        except KeyError:
            print("Couldn't find a metric provider for resource of type: {}".
                  format(resource['type']))
            continue
        if len(scaler.instances) == 0:
            print("No instances for this resource")
            continue
        else:
            slaves = scaler.get_aws_slaves(mesos_state)
            percent_registered = float(
                float(len(slaves)) / float(len(scaler.instances))) * 100
            if percent_registered < float(threshold_percentage):
                print(
                    "CRIT: Only found {}% of instances in {} registered in mesos. "
                    "Please check for puppet or AMI baking problems!".format(
                        percent_registered, resource['id']))
                return False
    print(
        "OK: Found more than {}% of instances registered for all paasta resources in this "
        "superregion".format(threshold_percentage))
    return True
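
The registration check above reduces to a single ratio: registered Mesos slaves over expected cloud instances, compared against a threshold. The nested float casts are redundant in Python 3; a minimal standalone equivalent of the comparison:

def registration_ok(num_slaves, num_instances, threshold_percentage):
    """True when at least threshold_percentage of instances are registered in Mesos."""
    if num_instances == 0:
        return True  # nothing expected for this resource, nothing to check
    percent_registered = num_slaves / num_instances * 100
    return percent_registered >= threshold_percentage


print(registration_ok(num_slaves=18, num_instances=20, threshold_percentage=75))  # True
print(registration_ok(num_slaves=10, num_instances=20, threshold_percentage=75))  # False
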
Example #10
0
def unreserve_all_resources(hostnames):
    """Dynamically unreserve all available resources on the specified hosts
    :param hostnames: list of hostnames to unreserve resources on
    """
    mesos_state = a_sync.block(get_mesos_master().state_summary)
    components = hostnames_to_components(hostnames)
    hosts = components_to_hosts(components)
    known_slaves = [
        slave for slave in mesos_state["slaves"] if slave["hostname"] in hosts
    ]
    for slave in known_slaves:
        hostname = slave["hostname"]
        log.info("Unreserving all resources on %s" % hostname)
        slave_id = slave["id"]
        resources = []
        if MAINTENANCE_ROLE in slave["reserved_resources"]:
            for resource in ["disk", "mem", "cpus", "gpus"]:
                reserved_resource = slave["reserved_resources"][
                    MAINTENANCE_ROLE][resource]
                resources.append(
                    Resource(name=resource, amount=reserved_resource))
            try:
                unreserve(slave_id=slave_id, resources=resources)
            except HTTPError:
                raise HTTPError(
                    f"Failed unreserving all of the resources on {hostname} ({slave_id}). Aborting."
                )
Example #11
0
def main(hostnames: Sequence[str]) -> None:
    master = get_mesos_master()
    try:
        mesos_state = block(master.state)
    except MasterNotAvailableException as e:
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)
    slaves = [
        slave for slave in mesos_state.get("slaves", [])
        if slave["hostname"] in hostnames
    ]
    tasks = get_all_tasks_from_state(mesos_state, include_orphans=True)
    filtered_tasks = filter_tasks_for_slaves(slaves, tasks)
    resource_info_dict = calculate_resource_utilization_for_slaves(
        slaves, filtered_tasks)
    resource_utilizations = resource_utillizations_from_resource_info(
        total=resource_info_dict["total"], free=resource_info_dict["free"])
    output = {}
    for metric in resource_utilizations:
        utilization = metric.total - metric.free
        if int(metric.total) == 0:
            utilization_perc = 100
        else:
            utilization_perc = utilization / float(metric.total) * 100
        output[metric.metric] = {
            "total": metric.total,
            "used": utilization,
            "perc": utilization_perc,
        }
    print(json.dumps(output))
Example #12
0
 def metrics_provider(self):
     if not self.sfr or self.sfr['SpotFleetRequestState'] == 'cancelled':
         self.log.error("SFR not found, removing config file.".format(
             self.resource['id']))
         self.cleanup_cancelled_config(self.resource['id'],
                                       self.config_folder,
                                       dry_run=self.dry_run)
         return 0, 0
     elif self.sfr['SpotFleetRequestState'] in [
             'cancelled_running', 'active'
     ]:
         expected_instances = len(self.instances)
         if expected_instances == 0:
             self.log.warning(
                 "No instances found in SFR, this shouldn't be possible so we "
                 "do nothing")
             return 0, 0
         mesos_state = get_mesos_master().state
         slaves = self.get_aws_slaves(mesos_state)
         error = self.get_mesos_utilization_error(
             slaves=slaves,
             mesos_state=mesos_state,
             expected_instances=expected_instances)
     elif self.sfr['SpotFleetRequestState'] in [
             'submitted', 'modifying', 'cancelled_terminating'
     ]:
         self.log.warning(
             "Not scaling an SFR in state: {} so {}, skipping...".format(
                 self.sfr['SpotFleetRequestState'], self.resource['id']))
         return 0, 0
     else:
         self.log.error("Unexpected SFR state: {} for {}".format(
             self.sfr['SpotFleetRequestState'], self.resource['id']))
         raise ClusterAutoscalingError
     if self.is_aws_launching_instances(
     ) and self.sfr['SpotFleetRequestState'] == 'active':
         self.log.warning(
             "AWS hasn't reached the TargetCapacity that is currently set. We won't make any "
             "changes this time as we should wait for AWS to launch more instances first."
         )
         return 0, 0
     current, target = self.get_spot_fleet_delta(error)
     if self.sfr['SpotFleetRequestState'] == 'cancelled_running':
         self.resource['min_capacity'] = 0
         slaves = self.get_pool_slaves(mesos_state)
         pool_error = self.get_mesos_utilization_error(
             slaves=slaves, mesos_state=mesos_state)
         if pool_error > 0:
             self.log.info(
                 "Not scaling cancelled SFR %s because we are under provisioned"
                 % (self.resource['id']))
             return 0, 0
         current, target = self.get_spot_fleet_delta(-1)
         if target == 1:
             target = 0
     return current, target
Example #13
0
def _clean_up_paasta_native_frameworks(context):
    clear_mesos_tools_cache()
    # context.etc_paasta signals that we actually have configured the mesos-cli.json; without this, we don't know where
    # to connect to clean up paasta native frameworks.
    if hasattr(context, 'etc_paasta'):
        for framework in mesos_tools.get_mesos_master().frameworks(active_only=True):
            if framework.name.startswith('paasta '):
                paasta_print("cleaning up framework %s" % framework.name)
                try:
                    mesos_tools.terminate_framework(framework.id)
                except requests.exceptions.HTTPError as e:
                    paasta_print("Got exception when terminating framework %s: %s" % (framework.id, e))
Example #14
0
def remote_run_stop(args):
    _, service, cluster, _, instance, _ = extract_args(args)
    if args.framework_id is None and args.run_id is None:
        paasta_print(
            PaastaColors.red(
                "Must provide either run id or framework id to stop."))
        emit_counter_metric('paasta.remote_run.stop.failed', service, instance)
        sys.exit(1)

    frameworks = [
        f for f in get_all_frameworks(active_only=True)
        if re.search(f'^paasta-remote {service}.{instance}', f.name)
    ]
    framework_id = args.framework_id
    if framework_id is None:
        if re.search(r'\s', args.run_id):
            paasta_print(
                PaastaColors.red("Run id must not contain whitespace."))
            emit_counter_metric('paasta.remote_run.stop.failed', service,
                                instance)
            sys.exit(1)

        found = [
            f for f in frameworks
            if re.search(' %s$' % args.run_id, f.name) is not None
        ]
        if len(found) > 0:
            framework_id = found[0].id
        else:
            paasta_print(
                PaastaColors.red("Framework with run id %s not found." %
                                 args.run_id))
            emit_counter_metric('paasta.remote_run.stop.failed', service,
                                instance)
            sys.exit(1)
    else:
        found = [f for f in frameworks if f.id == framework_id]
        if len(found) == 0:
            paasta_print(
                PaastaColors.red(
                    "Framework id %s does not match any %s.%s remote-run. Check status to find the correct id."
                    % (framework_id, service, instance), ), )
            emit_counter_metric('paasta.remote_run.stop.failed', service,
                                instance)
            sys.exit(1)

    paasta_print("Tearing down framework %s." % framework_id)
    mesos_master = get_mesos_master()
    teardown = mesos_master.teardown(framework_id)
    if teardown.status_code == 200:
        paasta_print(PaastaColors.green("OK"))
    else:
        paasta_print(teardown.text)
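
remote_run_stop resolves the framework purely by name: first every framework named "paasta-remote <service>.<instance> ...", then the one whose name ends with the given run id. A standalone sketch of that filtering over plain strings, mirroring the two regexes above (the names and run id here are made up):

import re


def find_framework_by_run_id(framework_names, service, instance, run_id):
    """Return the first framework name matching the service/instance and ending with run_id."""
    candidates = [
        name for name in framework_names
        if re.search(f'^paasta-remote {service}.{instance}', name)
    ]
    matches = [name for name in candidates if re.search(' %s$' % run_id, name)]
    return matches[0] if matches else None


names = [
    "paasta-remote example_service.main 20190101T000000 abc123",
    "paasta-remote other_service.canary 20190101T000000 def456",
]
print(find_framework_by_run_id(names, "example_service", "main", "abc123"))
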
Example #15
0
def paasta_sysdig(args):
    system_paasta_config = load_system_paasta_config()

    if not args.local:
        mesos_master = get_any_mesos_master(
            cluster=args.cluster, system_paasta_config=system_paasta_config)
        ssh_cmd = ('ssh -At -o StrictHostKeyChecking=no -o LogLevel=QUIET {0} '
                   '"sudo paasta {1} --local"').format(mesos_master,
                                                       ' '.join(sys.argv[1:]))
        return_code, output = _run(ssh_cmd)
        if return_code != 0:
            paasta_print(output)
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        subprocess.call(
            shlex.split("ssh -tA {} '{}'".format(slave, command.strip())))
        return
    status = get_status_for_instance(
        cluster=args.cluster,
        service=args.service,
        instance=args.instance,
    )
    slave = pick_slave_from_status(
        status=status,
        host=args.host,
    )

    job_config = load_marathon_service_config(
        service=args.service,
        instance=args.instance,
        cluster=args.cluster,
    )

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = get_marathon_clients(marathon_servers)

    # Unfortunately, sysdig seems to only be able to take one marathon URL, so hopefully the service in question is not
    # currently moving between shards.
    client = marathon_clients.get_current_client_for_service(
        job_config=job_config, )
    marathon_url = client.servers[0]
    marathon_user, marathon_pass = client.auth

    mesos_url = get_mesos_master().host
    marathon_parsed_url = urlparse(marathon_url)
    marathon_creds_url = marathon_parsed_url._replace(netloc="{}:{}@{}".format(
        marathon_user,
        marathon_pass,
        marathon_parsed_url.netloc,
    ))
    paasta_print(
        format_mesos_command(slave, status.marathon.app_id, mesos_url,
                             marathon_creds_url.geturl()))
Example #16
0
    def downscale_aws_resource(self, filtered_slaves, current_capacity,
                               target_capacity):
        killed_slaves = 0
        while True:
            filtered_sorted_slaves = ec2_fitness.sort_by_ec2_fitness(
                filtered_slaves)[::-1]
            if len(filtered_sorted_slaves) == 0:
                self.log.info(
                    "ALL slaves killed so moving on to next resource!")
                break
            self.log.info("Resource slave kill preference: {}".format(
                [slave.hostname for slave in filtered_sorted_slaves]))
            slave_to_kill = filtered_sorted_slaves.pop(0)
            instance_capacity = slave_to_kill.instance_weight
            new_capacity = current_capacity - instance_capacity
            if new_capacity < target_capacity:
                self.log.info(
                    "Terminating instance {} with weight {} would take us below our target of {},"
                    " so this is as close to our target as we can get".format(
                        slave_to_kill.instance_id,
                        slave_to_kill.instance_weight, target_capacity))
                if self.resource[
                        'type'] == 'aws_spot_fleet_request' and killed_slaves == 0:
                    self.log.info(
                        "This is a SFR so we must kill at least one slave to prevent the autoscaler "
                        "getting stuck whilst scaling down gradually")
                else:
                    break
            try:
                self.gracefully_terminate_slave(
                    slave_to_kill=slave_to_kill,
                    current_capacity=current_capacity,
                    new_capacity=new_capacity)
                killed_slaves += 1
            except HTTPError:
                # Something wrong draining host so try next host
                continue
            except FailSetResourceCapacity:
                break

            current_capacity = new_capacity
            mesos_state = get_mesos_master().state_summary()
            if filtered_sorted_slaves:
                task_counts = get_mesos_task_count_by_slave(
                    mesos_state,
                    slaves_list=[{
                        'task_counts': slave.task_counts
                    } for slave in filtered_sorted_slaves])
                for i, slave in enumerate(filtered_sorted_slaves):
                    slave.task_counts = task_counts[i]['task_counts']
            filtered_slaves = filtered_sorted_slaves
Example #17
0
def spotfleet_metrics_provider(spotfleet_request_id, resource, pool_settings):
    mesos_state = get_mesos_master().state
    sfr = get_sfr(spotfleet_request_id, region=resource['region'])
    if not sfr or not sfr['SpotFleetRequestState'] == 'active':
        log.error(
            "Ignoring SFR {0} that does not exist or is not active.".format(
                spotfleet_request_id))
        return 0, 0
    sfr['ActiveInstances'] = get_spot_fleet_instances(
        spotfleet_request_id, region=resource['region'])
    resource['sfr'] = sfr
    desired_instances = len(sfr['ActiveInstances'])
    instance_ips = get_sfr_instance_ips(sfr, region=resource['region'])
    slaves = {
        slave['id']: slave
        for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips
        and slave['attributes'].get('pool', 'default') == resource['pool']
    }
    current_instances = len(slaves)
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" %
             (float(float(current_instances) / float(desired_instances)) * 100,
              current_instances, desired_instances))
    if float(current_instances) / desired_instances < (
            1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = (
            "We currently have %d instances active in mesos out of a desired %d.\n"
            "Refusing to scale because we either need to wait for the requests to be "
            "filled, or the new instances are not healthy for some reason.\n"
            "(cowardly refusing to go past %.2f%% missing instances)") % (
                current_instances, desired_instances,
                MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)

    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'],
        mesos_state)[resource['pool']]

    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    utilization = 1.0 - min([
        float(float(pair[0]) / float(pair[1]))
        for pair in zip(free_pool_resources, total_pool_resources)
    ])
    target_utilization = pool_settings.get('target_utilization',
                                           DEFAULT_TARGET_UTILIZATION)
    error = utilization - target_utilization
    current, target = get_spot_fleet_delta(resource, error)
    return current, target
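
The autoscaling signal in Example #17 comes from pool-wide headroom: utilization is one minus the smallest free/total ratio across resources, and the error is its distance from the target utilization. The same arithmetic on plain numbers (the default target value here is an assumption for the sketch):

DEFAULT_TARGET_UTILIZATION = 0.8  # assumed value for illustration


def utilization_error(free, total, target_utilization=DEFAULT_TARGET_UTILIZATION):
    """free/total are parallel sequences of per-resource amounts (e.g. cpus, mem, disk)."""
    utilization = 1.0 - min(f / t for f, t in zip(free, total))
    return utilization - target_utilization


# cpus are the tightest resource (only 20% free), so utilization is 0.8 and the error is 0.
print(utilization_error(free=[8, 512, 400], total=[40, 1024, 500]))
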
Example #18
0
def check_registration(threshold_percentage):
    try:
        mesos_state = block(get_mesos_master().state)
    except MasterNotAvailableException as e:
        print("Could not find Mesos Master: %s" % e.message)
        sys.exit(1)

    config = load_system_paasta_config()
    autoscaling_resources = config.get_cluster_autoscaling_resources()
    for resource in autoscaling_resources.values():
        print("Checking %s" % resource["id"])
        try:
            scaler = get_scaler(resource["type"])(
                resource=resource,
                pool_settings=None,
                config_folder=None,
                dry_run=True,
                utilization_error=0.0,
                max_increase=0.0,
                max_decrease=0.0,
            )
        except KeyError:
            print("Couldn't find a metric provider for resource of type: {}".
                  format(resource["type"]))
            continue
        if len(scaler.instances) == 0:
            print("No instances for this resource")
            continue
        elif scaler.is_new_autoscaling_resource():
            # See OPS-13784
            threshold = config.get_monitoring_config().get(
                "check_registered_slave_threshold")
            print(f"Autoscaling resource was created within last {threshold}"
                  " seconds and would probably fail this check")
            continue
        else:
            slaves = scaler.get_aws_slaves(mesos_state)
            percent_registered = (
                float(float(len(slaves)) / float(len(scaler.instances))) * 100)
            if percent_registered < float(threshold_percentage):
                print(
                    "CRIT: Only found {}% of instances in {} registered in mesos. "
                    "Please check for puppet or AMI baking problems!".format(
                        percent_registered, resource["id"]))
                return False
    print(
        "OK: Found more than {}% of instances registered for all paasta resources in this "
        "superregion".format(threshold_percentage))
    return True
Example #19
0
def scale_aws_spot_fleet_request(resource, current_capacity, target_capacity,
                                 pool_settings, dry_run):
    """Scales a spot fleet request by delta to reach target capacity
    If scaling up we just set target capacity and let AWS take care of the rest
    If scaling down we pick the slaves we'd prefer to kill, put them in maintenance
    mode and drain them (via paasta_maintenance and setup_marathon_jobs). We then kill
    them once they are running 0 tasks or once a timeout is reached

    :param resource: resource to scale
    :param current_capacity: integer current SFR capacity
    :param target_capacity: target SFR capacity
    :param pool_settings: pool settings dict with timeout settings
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    target_capacity = int(target_capacity)
    current_capacity = int(current_capacity)
    delta = target_capacity - current_capacity
    sfr_id = resource['id']
    if delta == 0:
        log.info("Already at target capacity: {0}".format(target_capacity))
        return
    elif delta > 0:
        log.info(
            "Increasing spot fleet capacity to: {0}".format(target_capacity))
        set_spot_fleet_request_capacity(sfr_id,
                                        target_capacity,
                                        dry_run,
                                        region=resource['region'])
        return
    elif delta < 0:
        mesos_state = get_mesos_master().state_summary()
        slaves_list = get_mesos_task_count_by_slave(mesos_state,
                                                    pool=resource['pool'])
        filtered_slaves = filter_sfr_slaves(slaves_list, resource)
        killable_capacity = sum(
            [slave['instance_weight'] for slave in filtered_slaves])
        amount_to_decrease = delta * -1
        if amount_to_decrease > killable_capacity:
            log.error(
                "Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!"
            )
            return
        downscale_spot_fleet_request(resource=resource,
                                     filtered_slaves=filtered_slaves,
                                     current_capacity=current_capacity,
                                     target_capacity=target_capacity,
                                     pool_settings=pool_settings,
                                     dry_run=dry_run)
Example #20
0
def spotfleet_metrics_provider(spotfleet_request_id, resource, pool_settings, config_folder, dry_run=False):
    sfr = get_sfr(spotfleet_request_id, region=resource['region'])
    if not sfr or sfr['SpotFleetRequestState'] == 'cancelled':
        log.error("SFR not found, removing config file.".format(spotfleet_request_id))
        cleanup_cancelled_sfr_config(spotfleet_request_id, config_folder, dry_run=dry_run)
        return 0, 0
    elif sfr['SpotFleetRequestState'] in ['cancelled_running', 'active']:
        sfr['ActiveInstances'] = get_spot_fleet_instances(spotfleet_request_id, region=resource['region'])
        resource['sfr'] = sfr
        desired_instances = len(sfr['ActiveInstances'])
        mesos_state = get_mesos_master().state
        slaves = get_sfr_slaves(resource, mesos_state)
        error = get_mesos_utilization_error(spotfleet_request_id,
                                            resource=resource,
                                            pool_settings=pool_settings,
                                            slaves=slaves,
                                            mesos_state=mesos_state,
                                            desired_instances=desired_instances)
    elif sfr['SpotFleetRequestState'] in ['submitted', 'modifying', 'cancelled_terminating']:
        log.warning("Not scaling an SFR in state: {0} so {1}, skipping...".format(sfr['SpotFleetRequestState'],
                                                                                  spotfleet_request_id))
        return 0, 0
    else:
        log.error("Unexpected SFR state: {0} for {1}".format(sfr['SpotFleetRequestState'],
                                                             spotfleet_request_id))
        raise ClusterAutoscalingError
    if is_aws_launching_sfr_instances(sfr) and sfr['SpotFleetRequestState'] == 'active':
        log.warning("AWS hasn't reached the TargetCapacity that is currently set. We won't make any "
                    "changes this time as we should wait for AWS to launch more instances first.")
        return 0, 0
    current, target = get_spot_fleet_delta(resource, error)
    if sfr['SpotFleetRequestState'] == 'cancelled_running':
        resource['min_capacity'] = 0
        slaves = get_pool_slaves(resource, mesos_state)
        pool_error = get_mesos_utilization_error(spotfleet_request_id,
                                                 resource=resource,
                                                 pool_settings=pool_settings,
                                                 slaves=slaves,
                                                 mesos_state=mesos_state)
        if pool_error > 0:
            log.info("Not scaling cancelled SFR {0} because we are under provisioned".format(spotfleet_request_id))
            return 0, 0
        current, target = get_spot_fleet_delta(resource, -1)
        if target == 1:
            target = 0
    return current, target
Example #22
0
def check_mesos_active_frameworks() -> None:
    options = parse_args()
    expected = options.expected.split(',')
    master = get_mesos_master()
    try:
        state = block(master.state)
    except MasterNotAvailableException as e:
        paasta_print("CRITICAL: %s" % e.args[0])
        sys.exit(2)

    result = assert_frameworks_exist(state, expected)
    if result.healthy:
        paasta_print("OK: " + result.message)
        sys.exit(0)
    else:
        paasta_print(result.message)
        sys.exit(2)
Example #23
0
def check_mesos_no_duplicate_frameworks() -> None:
    options = parse_args()
    check = options.check.split(",")
    master = get_mesos_master()
    try:
        state = block(master.state)
    except MasterNotAvailableException as e:
        print("CRITICAL: %s" % e.args[0])
        sys.exit(2)

    result = assert_no_duplicate_frameworks(state, check)
    if result.healthy:
        print("OK: " + result.message)
        sys.exit(0)
    else:
        print(result.message)
        sys.exit(2)
Example #24
0
def _clean_up_paasta_native_frameworks(context):
    clear_mesos_tools_cache()
    # context.etc_paasta signals that we actually have configured the mesos-cli.json; without this, we don't know where
    # to connect to clean up paasta native frameworks.
    if hasattr(context, "etc_paasta"):
        for framework in a_sync.block(
                mesos_tools.get_mesos_master().frameworks, active_only=True):
            if framework.name.startswith(
                    "paasta_native ") or framework.name == getattr(
                        context, "framework_name", ""):
                print("cleaning up framework %s" % framework.name)
                try:
                    mesos_tools.terminate_framework(framework.id)
                except requests.exceptions.HTTPError as e:
                    print(
                        f"Got exception when terminating framework {framework.id}: {e}"
                    )
Example #25
0
def autoscale_local_cluster(config_folder, dry_run=False, log_level=None):
    log.debug("Sleep 20s to throttle AWS API calls")
    time.sleep(20)
    if dry_run:
        log.info("Running in dry_run mode, no changes should be made")
    system_config = load_system_paasta_config()
    autoscaling_resources = system_config.get_cluster_autoscaling_resources()
    autoscaling_draining_enabled = system_config.get_cluster_autoscaling_draining_enabled(
    )
    all_pool_settings = system_config.get_resource_pool_settings()
    mesos_state = get_mesos_master().state
    utilization_errors = get_all_utilization_errors(autoscaling_resources,
                                                    all_pool_settings,
                                                    mesos_state)
    autoscaling_scalers = defaultdict(list)
    for identifier, resource in autoscaling_resources.items():
        pool_settings = all_pool_settings.get(resource['pool'], {})
        try:
            scaler = get_scaler(resource['type'])(
                resource=resource,
                pool_settings=pool_settings,
                config_folder=config_folder,
                dry_run=dry_run,
                log_level=log_level,
                utilization_error=utilization_errors[(resource['region'],
                                                      resource['pool'])],
                draining_enabled=autoscaling_draining_enabled,
            )
            autoscaling_scalers[(resource['region'],
                                 resource['pool'])].append(scaler)
        except KeyError:
            log.warning(
                "Couldn't find a metric provider for resource of type: {}".
                format(resource['type']))
            continue
        log.debug("Sleep 3s to throttle AWS API calls")
        time.sleep(3)
    filtered_autoscaling_scalers = filter_scalers(autoscaling_scalers,
                                                  utilization_errors)
    sorted_autoscaling_scalers = sort_scalers(filtered_autoscaling_scalers)
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(
        run_parallel_scalers(sorted_autoscaling_scalers, mesos_state))
    event_loop.close()
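
autoscale_local_cluster buckets scalers by (region, pool) so resources that share a pool are scaled against the same utilization error. A minimal standalone sketch of that grouping with made-up resource definitions:

from collections import defaultdict

autoscaling_resources = {
    "sfr-west-default": {"type": "aws_spot_fleet_request", "region": "us-west-2", "pool": "default"},
    "asg-west-default": {"type": "aws_autoscaling_group", "region": "us-west-2", "pool": "default"},
    "sfr-east-batch": {"type": "aws_spot_fleet_request", "region": "us-east-1", "pool": "batch"},
}

autoscaling_scalers = defaultdict(list)
for identifier, resource in autoscaling_resources.items():
    # The real code appends a scaler object built by get_scaler(); the identifier
    # stands in for it in this sketch.
    autoscaling_scalers[(resource["region"], resource["pool"])].append(identifier)

print(dict(autoscaling_scalers))
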
Example #26
0
def downscale_spot_fleet_request(resource, filtered_slaves, current_capacity, target_capacity, pool_settings, dry_run):
    killed_slaves = 0
    while True:
        filtered_sorted_slaves = sort_slaves_to_kill(filtered_slaves)
        if len(filtered_sorted_slaves) == 0:
            log.info("ALL slaves killed so moving on to next pool!")
            break
        log.info("SFR slave kill preference: {0}".format([slave['hostname'] for slave in filtered_sorted_slaves]))
        filtered_sorted_slaves.reverse()
        slave_to_kill = filtered_sorted_slaves.pop()
        instance_capacity = slave_to_kill['instance_weight']
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            log.info("Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                     " close to our target as we can get".format(slave_to_kill['instance_id'],
                                                                 slave_to_kill['instance_weight'],
                                                                 target_capacity))
            if killed_slaves == 0:
                log.info("This is a SFR so we must kill at least one slave to prevent the autoscaler "
                         "getting stuck whilst scaling down gradually")
            else:
                break
        try:
            gracefully_terminate_slave(resource=resource,
                                       slave_to_kill=slave_to_kill,
                                       pool_settings=pool_settings,
                                       current_capacity=current_capacity,
                                       new_capacity=new_capacity,
                                       dry_run=dry_run)
            killed_slaves += 1
        except HTTPError:
            # Something wrong draining host so try next host
            continue
        except FailSetSpotCapacity:
            break
        current_capacity = new_capacity
        mesos_state = get_mesos_master().state_summary()
        if filtered_sorted_slaves:
            filtered_slaves = get_mesos_task_count_by_slave(mesos_state, slaves_list=filtered_sorted_slaves)
        else:
            filtered_slaves = filtered_sorted_slaves
Example #27
0
def resources_utilization(request):
    master = get_mesos_master()
    mesos_state = block(master.state)

    groupings = request.swagger_data.get('groupings', ['superregion'])
    # swagger actually makes the key None if it's not set
    if groupings is None:
        groupings = ['superregion']
    grouping_function = metastatus_lib.key_func_for_attribute_multi(groupings)
    sorting_function = metastatus_lib.sort_func_for_attributes(groupings)

    filters = request.swagger_data.get('filter', [])
    filters = parse_filters(filters)
    filter_funcs = [
        metastatus_lib.make_filter_slave_func(attr, vals)
        for attr, vals in filters.items()
    ]

    resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=grouping_function,
        mesos_state=mesos_state,
        filters=filter_funcs,
        sort_func=sorting_function,
    )

    response_body = []
    for k, v in resource_info_dict.items():
        group = {'groupings': {}}
        for grouping, value in k:
            group['groupings'][grouping] = value
        for resource, value in v['total']._asdict().items():
            group[resource] = {'total': value}
        for resource, value in v['free']._asdict().items():
            group[resource]['free'] = value
        for resource in v['free']._fields:
            group[resource][
                'used'] = group[resource]['total'] - group[resource]['free']

        response_body.append(group)

    return Response(json_body=response_body, status_code=200)
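
The response body in Example #27 is assembled from two namedtuples per group, total and free, with used derived field by field via _asdict() and _fields. A self-contained sketch of that trick with a hypothetical ResourceSet tuple:

from collections import namedtuple

ResourceSet = namedtuple("ResourceSet", ["cpus", "mem", "disk"])  # illustrative stand-in

total = ResourceSet(cpus=40, mem=1024, disk=500)
free = ResourceSet(cpus=8, mem=512, disk=400)

group = {"groupings": {"superregion": "example-superregion"}}
for resource, value in total._asdict().items():
    group[resource] = {"total": value}
for resource, value in free._asdict().items():
    group[resource]["free"] = value
for resource in free._fields:
    group[resource]["used"] = group[resource]["total"] - group[resource]["free"]

print(group)
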
Example #28
0
def unreserve_all_resources(hostnames):
    """Dynamically unreserve all available resources on the specified hosts
    :param hostnames: list of hostnames to unreserve resources on
    """
    mesos_state = get_mesos_master().state_summary()
    components = hostnames_to_components(hostnames)
    hosts = components_to_hosts(components)
    known_slaves = [slave for slave in mesos_state['slaves'] if slave['hostname'] in hosts]
    for slave in known_slaves:
        hostname = slave['hostname']
        log.info("Unreserving all resources on %s" % hostname)
        slave_id = slave['id']
        resources = []
        for role in slave['reserved_resources']:
            for resource in ['disk', 'mem', 'cpus']:
                reserved_resource = slave['reserved_resources'][role][resource]
                resources.append(Resource(name=resource, amount=reserved_resource))
        try:
            unreserve(slave_id=slave_id, resources=resources)
        except HTTPError:
            raise HTTPError("Failed unreserving all of the resources on %s (%s). Aborting." % (hostname, slave_id))
Example #29
0
def check_registration(threshold_percentage):
    mesos_state = get_mesos_master().state
    autoscaling_resources = load_system_paasta_config().get_cluster_autoscaling_resources()
    for resource in autoscaling_resources.values():
        if resource['type'] == 'aws_spot_fleet_request':
            resource['sfr'] = get_sfr(resource['id'], region=resource['region'])
            instances = get_spot_fleet_instances(resource['id'], region=resource['region'])
            resource['sfr']['ActiveInstances'] = instances
            slaves = get_sfr_slaves(resource, mesos_state)
            if len(instances) == 0:
                continue
            else:
                percent_registered = float(float(len(slaves)) / float(len(instances))) * 100
                if percent_registered < float(threshold_percentage):
                    print "CRIT: Only found {0}% of instances in {1} registered in mesos. "\
                          "Please check for puppet or AMI baking problems!".format(percent_registered,
                                                                                   resource['id'])
                    return False
    print "OK: Found more than {0}% of instances registered for all paasta resources in this "\
          "superregion".format(threshold_percentage)
    return True
Example #30
0
def downscale_spot_fleet_request(resource, filtered_slaves, current_capacity, target_capacity, pool_settings, dry_run):
    killed_slaves = 0
    while True:
        filtered_sorted_slaves = sort_slaves_to_kill(filtered_slaves)
        if len(filtered_sorted_slaves) == 0:
            log.info("ALL slaves killed so moving on to next pool!")
            break
        log.info("SFR slave kill preference: {0}".format([slave['hostname'] for slave in filtered_sorted_slaves]))
        filtered_sorted_slaves.reverse()
        slave_to_kill = filtered_sorted_slaves.pop()
        instance_capacity = slave_to_kill['instance_weight']
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            log.info("Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                     " close to our target as we can get".format(slave_to_kill['instance_id'],
                                                                 slave_to_kill['instance_weight'],
                                                                 target_capacity))
            if resource['sfr']['SpotFleetRequestState'] == 'cancelled_running' and killed_slaves == 0:
                log.info("This is a cancelled SFR so we must kill at least one slave to prevent it lingering")
            else:
                break
        try:
            gracefully_terminate_slave(resource=resource,
                                       slave_to_kill=slave_to_kill,
                                       pool_settings=pool_settings,
                                       current_capacity=current_capacity,
                                       new_capacity=new_capacity,
                                       dry_run=dry_run)
            killed_slaves += 1
        except HTTPError:
            # Something wrong draining host so try next host
            continue
        except FailSetSpotCapacity:
            break
        current_capacity = new_capacity
        mesos_state = get_mesos_master().state_summary()
        if filtered_sorted_slaves:
            filtered_slaves = get_mesos_task_count_by_slave(mesos_state, slaves_list=filtered_sorted_slaves)
        else:
            filtered_slaves = filtered_sorted_slaves
Example #31
0
def reserve_all_resources(hostnames):
    """Dynamically reserve all available resources on the specified hosts
    :param hostnames: list of hostnames to reserve resources on
    """
    mesos_state = a_sync.block(get_mesos_master().state_summary)
    components = hostnames_to_components(hostnames)
    hosts = components_to_hosts(components)
    known_slaves = [slave for slave in mesos_state['slaves'] if slave['hostname'] in hosts]
    for slave in known_slaves:
        hostname = slave['hostname']
        log.info("Reserving all resources on %s" % hostname)
        slave_id = slave['id']
        resources = []
        for resource in ['disk', 'mem', 'cpus']:
            free_resource = slave['resources'][resource] - slave['used_resources'][resource]
            for role in slave['reserved_resources']:
                free_resource -= slave['reserved_resources'][role][resource]
            resources.append(Resource(name=resource, amount=free_resource))
        try:
            reserve(slave_id=slave_id, resources=resources)
        except HTTPError:
            raise HTTPError(f"Failed reserving all of the resources on {hostname} ({slave_id}). Aborting.")
Example #32
0
def check_mesos_no_duplicate_frameworks():
    master = get_mesos_master()
    try:
        state = master.state
    except MasterNotAvailableException as e:
        paasta_print("CRITICAL: %s" % e.message)
        sys.exit(2)

    system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = get_marathon_clients(marathon_servers)
    marathon_framework_ids = get_marathon_framework_ids(marathon_clients)
    result = assert_framework_count(
        state=state,
        marathon_framework_ids=marathon_framework_ids,
    )
    if result.healthy:
        paasta_print("OK: " + result.message)
        sys.exit(0)
    else:
        paasta_print("CRITICAL: %s" % result.message)
        sys.exit(2)
Example #33
0
def scale_aws_spot_fleet_request(resource, current_capacity, target_capacity, pool_settings, dry_run):
    """Scales a spot fleet request by delta to reach target capacity
    If scaling up we just set target capacity and let AWS take care of the rest
    If scaling down we pick the slaves we'd prefer to kill, put them in maintenance
    mode and drain them (via paasta_maintenance and setup_marathon_jobs). We then kill
    them once they are running 0 tasks or once a timeout is reached

    :param resource: resource to scale
    :param current_capacity: integer current SFR capacity
    :param target_capacity: target SFR capacity
    :param pool_settings: pool settings dict with timeout settings
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    target_capacity = int(target_capacity)
    delta = target_capacity - current_capacity
    sfr_id = resource['id']
    if delta == 0:
        log.info("Already at target capacity: {0}".format(target_capacity))
        return
    elif delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        set_spot_fleet_request_capacity(sfr_id, target_capacity, dry_run, region=resource['region'])
        return
    elif delta < 0:
        mesos_state = get_mesos_master().state_summary()
        slaves_list = get_mesos_task_count_by_slave(mesos_state, pool=resource['pool'])
        filtered_slaves = filter_sfr_slaves(slaves_list, resource)
        killable_capacity = sum([slave['instance_weight'] for slave in filtered_slaves])
        amount_to_decrease = delta * -1
        if amount_to_decrease > killable_capacity:
            log.error("Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!")
            return
        downscale_spot_fleet_request(resource=resource,
                                     filtered_slaves=filtered_slaves,
                                     current_capacity=current_capacity,
                                     target_capacity=target_capacity,
                                     pool_settings=pool_settings,
                                     dry_run=dry_run)
Example #34
0
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()

    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs["use_mesos_cache"] = True

        master = get_mesos_master(**master_kwargs)

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(
            get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(mesos_master=master,
                                                  mesos_state=mesos_state)
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            paasta_print(PaastaColors.red("CRITICAL:  %s" % "\n".join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message="Marathon is not configured to run here", healthy=True)
        ]
        all_mesos_results = [
            metastatus_lib.HealthCheckResult(
                message="Mesos is not configured to run here", healthy=True)
        ]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [
            metastatus_lib.HealthCheckResult(
                message="Kubernetes is not configured to run here",
                healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    kube_summary = metastatus_lib.generate_summary_for_check(
        "Kubernetes", kube_ok)

    healthy_exit = all([mesos_ok, marathon_ok])

    paasta_print(f"Master paasta_tools version: {__version__}")
    paasta_print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)
    if args.verbose > 1 and mesos_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [[
                str(x) for x in asi
            ] for asi in get_autoscaling_info_for_all_resources(mesos_state)]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent("Per Slave Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(kube_summary, kube_ok,
                                                  kube_results, args.verbose)
    if args.verbose > 1 and kube_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, healthy_exit = utilization_table_by_grouping_from_kube(
            groupings=args.groupings,
            threshold=args.threshold,
            kube_client=kube_client)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("No autoscaling resources for Kubernetes", 2)

        if args.verbose >= 3:
            print_with_indent("Per Node Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    if not healthy_exit:
        raise FatalError(2)
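
A minimal sketch of the pass/fail aggregation that print_output relies on, assuming HealthCheckResult is the namedtuple-style object used above and approximating metastatus_lib.status_for_results with a simple list of healthy flags:

from collections import namedtuple

# Approximations of metastatus_lib.HealthCheckResult / status_for_results,
# for illustration only.
HealthCheckResult = namedtuple('HealthCheckResult', ['message', 'healthy'])

def status_for_results(results):
    return [result.healthy for result in results]

kube_results = [HealthCheckResult(message="Kubernetes is not configured to run here", healthy=True)]
mesos_results = [HealthCheckResult(message="high disk usage on cluster", healthy=False)]  # hypothetical failure

kube_ok = all(status_for_results(kube_results))    # True
mesos_ok = all(status_for_results(mesos_results))  # False
healthy_exit = all([mesos_ok, kube_ok])
print(healthy_exit)                                # False -> the caller would raise FatalError(2)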
Example #35
0
def get_mesos_state():
    state = get_mesos_master(use_mesos_cache=True).state
    return state
Example #36
0
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(mesos_metrics=metrics,
                                                                                mesos_state=mesos_state)
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [metastatus_lib.HealthCheckResult(message='Marathon is not configured to run here',
                                                             healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(message='Chronos is not configured to run here',
                                                            healthy=True)]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    print "Master paasta_tools version: {0}".format(__version__)
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(grouping_function,
                                                                                     mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization,
                                                                                                         args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(lambda slave: slave['hostname'],
                                                                                      mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization,
                                                                                                         args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
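
For reference, a minimal sketch of the utilization arithmetic the grouped tables above are built from; the resource_info_dict values are hypothetical, and the real metastatus_lib helpers wrap this logic in healthcheck/utilization pairs.

# Hypothetical per-group totals and free amounts, in the same shape the
# grouping helpers above consume ('total' and 'free' per resource).
resource_info_dict = {
    'total': {'cpus': 10.0, 'mem': 4096.0, 'disk': 100.0},
    'free': {'cpus': 4.0, 'mem': 1024.0, 'disk': 60.0},
}
threshold = 90  # percent, as passed on the command line via args.threshold

for resource, total in sorted(resource_info_dict['total'].items()):
    free = resource_info_dict['free'][resource]
    used = total - free
    utilization_perc = used / total * 100 if total else 100
    healthy = utilization_perc <= threshold
    print("{}: {:.1f}/{:.1f} ({:.0f}%), healthy={}".format(resource, used, total, utilization_perc, healthy))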
Example #37
0
def main(argv=None):
    chronos_config = None
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

    try:
        mesos_state = master.state
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
            marathon_clients=marathon_clients,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)',
                'GPU (used/total)', 'Agent count',
            ]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ) + [str(resource_info_dict['slave_count'])])
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = functools.reduce(
                lambda x, y: x + [(y)],
                get_autoscaling_info_for_all_resources(mesos_state),
                [headers],
            )

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'],
                mesos_state,
            )
            all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)', 'GPU (used/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
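
Finally, a small sketch showing that the functools.reduce used to assemble the autoscaling table in this example is just header-plus-rows concatenation; AutoscalingInfo's fields and the sample row are assumed here for illustration only.

import functools
from collections import namedtuple

# Hypothetical stand-in for AutoscalingInfo (the real namedtuple's fields may differ).
AutoscalingInfo = namedtuple('AutoscalingInfo', ['resource_id', 'pool', 'state', 'current', 'target'])

rows = [
    AutoscalingInfo('sfr-123', 'default', 'active', '10', '12'),  # hypothetical data
]
headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]

# The reduce appends each AutoscalingInfo as one table row (the extra parentheses
# around y in the original lambda are redundant).
table = functools.reduce(lambda x, y: x + [y], rows, [headers])
assert table == [headers] + rows  # equivalent to a simple concatenation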