Example #1
0
 def process_default(self, event):
     """Handle a filesystem event for the public (system) paasta config.

     Reloads the system paasta config from disk and, when it differs from
     the cached copy, finds every marathon instance whose config sha
     changed and enqueues them for a rate-limited ("staggered") bounce.

     :param event: the filesystem watcher event for the changed path
     """
     self.log.debug(event)
     # Start watching any newly created folder so later changes fire events.
     self.watch_new_folder(event)
     # filter_event returns the event only if it is one we care about.
     event = self.filter_event(event)
     if event:
         self.log.debug("Public config changed on disk, loading new config")
         try:
             new_config = load_system_paasta_config()
         except ValueError:
             # Bail out rather than act on a half-written/invalid JSON file.
             self.log.error("Couldn't load public config, the JSON is invalid!")
             return
         service_instances = []
         if new_config != self.public_config:
             self.log.info("Public config has changed, now checking if it affects any services config shas")
             self.public_config = new_config
             # All marathon instances in this cluster are candidates; only
             # those whose computed config id changed actually need a bounce.
             all_service_instances = get_services_for_cluster(cluster=self.public_config.get_cluster(),
                                                              instance_type='marathon',
                                                              soa_dir=DEFAULT_SOA_DIR)
             service_instances = get_service_instances_with_changed_id(self.marathon_client,
                                                                       all_service_instances,
                                                                       self.public_config.get_cluster())
         if service_instances:
             self.log.info("Found config change affecting {} service instances, "
                           "now doing a staggered bounce".format(len(service_instances)))
             # Spread the bounces out over time to avoid a thundering herd.
             bounce_rate = self.public_config.get_deployd_big_bounce_rate()
             service_instances = rate_limit_instances(instances=service_instances,
                                                      number_per_minute=bounce_rate,
                                                      watcher_name=self.__class__.__name__)
         for service_instance in service_instances:
             self.filewatcher.inbox_q.put(service_instance)
def main() -> None:
    """Scale every autoscaled kubernetes instance in the cluster up to its
    configured max_instances by patching the deployment's replica count."""
    system_paasta_config = load_system_paasta_config()
    kube_client = KubeClient()

    cluster = system_paasta_config.get_cluster()
    # Deduplicate: we only need each service name once.
    services = set()
    for service, _ in get_services_for_cluster(
        cluster=cluster,
        instance_type="kubernetes",
    ):
        services.add(service)

    for service in services:
        config_loader = PaastaServiceConfigLoader(
            service=service,
            load_deployments=False,
        )
        for instance_config in config_loader.instance_configs(
            cluster=cluster,
            instance_type_class=KubernetesDeploymentConfig,
        ):
            max_instances = instance_config.get_max_instances()
            # Instances without max_instances are not autoscaled; skip them.
            if max_instances is None:
                continue
            application = instance_config.format_kubernetes_app()
            application.spec.replicas = max_instances
            wrapper = get_application_wrapper(application)
            wrapper.soa_config = instance_config
            print(f"Scaling up {service}.{instance_config.instance}")
            wrapper.update(kube_client)
Example #3
0
def get_desired_marathon_configs(soa_dir):
    """Build the desired marathon app dict for every marathon instance in
    the cluster.

    :param soa_dir: the SOA configuration directory to read from
    :returns: a tuple of (formatted marathon app dicts, job configs), each
        keyed by the app id with its leading '/' stripped
    """
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(
        instance_type="marathon",
        cluster=cluster,
        soa_dir=soa_dir,
    )

    job_configs = {}
    formatted_marathon_configs = {}

    for service, instance in instances:
        try:
            job_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
            formatted_config = job_config.format_marathon_app_dict()
            app_id = formatted_config["id"].lstrip("/")
            formatted_marathon_configs[app_id] = formatted_config
            job_configs[app_id] = job_config
        # Not ideal but we rely on a lot of user input to create the app dict
        # and we really can't afford to bail if just one app definition is malformed
        except Exception as errormsg:
            _log(
                service=service,
                line=str(errormsg),
                component="deploy",
                level="debug",
                cluster=cluster,
                instance=instance,
            )
    return formatted_marathon_configs, job_configs
def main():
    """Check smartstack replication for every marathon instance in the
    current cluster."""
    args = parse_args()

    # --verbose turns on debug logging; otherwise only warnings and above.
    log_level = logging.DEBUG if args.verbose else logging.WARNING
    logging.basicConfig(level=log_level)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    service_instances = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=args.soa_dir,
    )

    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
    )
    # Fetch shared state once up front; each replication check reuses it.
    all_tasks = client.list_tasks()
    mesos_slaves = get_slaves()
    smartstack_replication_checker = SmartstackReplicationChecker(
        mesos_slaves,
        system_paasta_config,
    )
    for service, instance in service_instances:
        check_service_replication(
            service=service,
            instance=instance,
            cluster=cluster,
            all_tasks=all_tasks,
            soa_dir=args.soa_dir,
            smartstack_replication_checker=smartstack_replication_checker,
        )
Example #5
0
def get_paasta_native_jobs_for_cluster(cluster=None, soa_dir=DEFAULT_SOA_DIR):
    """A paasta_native-specific wrapper around utils.get_services_for_cluster

    :param cluster: The cluster to read the configuration for
    :param soa_dir: The SOA config directory to read from
    :returns: A list of tuples of (service, job_name)"""
    return get_services_for_cluster(
        cluster=cluster,
        instance_type='paasta_native',
        soa_dir=soa_dir,
    )
Example #6
0
def test_get_services_for_cluster():
    """get_services_for_cluster should aggregate the instance lists returned
    for each service directory found under the (mocked) soa dir."""
    cluster = 'honey_bunches_of_oats'
    soa_dir = 'completely_wholesome'
    instances = [['this_is_testing', 'all_the_things'], ['my_nerf_broke']]
    expected = ['my_nerf_broke', 'this_is_testing', 'all_the_things']
    # BUG FIX: contextlib.nested() was deprecated in Python 2.7 and removed
    # in Python 3; a single `with` statement with multiple context managers
    # is the supported equivalent.
    with mock.patch(
        'os.path.abspath',
        autospec=True,
        return_value='chex_mix',
    ) as abspath_patch, mock.patch(
        'os.listdir',
        autospec=True,
        return_value=['dir1', 'dir2'],
    ) as listdir_patch, mock.patch(
        'paasta_tools.utils.get_service_instance_list',
        side_effect=lambda a, b, c, d: instances.pop(),
    ) as get_instances_patch:
        actual = utils.get_services_for_cluster(cluster, soa_dir=soa_dir)
        assert expected == actual
        abspath_patch.assert_called_once_with(soa_dir)
        listdir_patch.assert_called_once_with('chex_mix')
        get_instances_patch.assert_any_call('dir1', cluster, None, soa_dir)
        get_instances_patch.assert_any_call('dir2', cluster, None, soa_dir)
        assert get_instances_patch.call_count == 2
Example #7
0
def cleanup_apps(soa_dir):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from"""
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    # (service, instance) pairs that still exist in the soa configs.
    valid_services = get_services_for_cluster(
        instance_type='marathon',
        soa_dir=soa_dir,
    )

    for app_id in marathon_tools.list_all_marathon_app_ids(client):
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            log.warn(
                "%s doesn't conform to paasta naming conventions? Skipping." %
                app_id)
            continue
        # Anything running that no longer has a config is fair game to kill.
        if (service, instance) not in valid_services:
            delete_app(
                app_id=app_id,
                client=client,
                soa_dir=soa_dir,
            )
Example #8
0
def get_desired_marathon_configs(soa_dir):
    """Build the desired marathon app dict for every marathon instance in
    the cluster.

    :param soa_dir: the SOA configuration directory to read from
    :returns: a dict of formatted marathon app dicts, keyed by the app id
        with its leading '/' stripped
    """
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(
        instance_type='marathon',
        cluster=cluster,
        soa_dir=soa_dir,
    )
    marathon_configs = dict()

    for service, instance in instances:
        try:
            marathon_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            ).format_marathon_app_dict()
            marathon_configs[marathon_config['id'].lstrip(
                '/')] = marathon_config
        except NoSlavesAvailableError as errormsg:
            # FIX: _log expects a string line; stringify the exception
            # explicitly (the sibling implementation in this file passes
            # str(errormsg) as well).
            _log(
                service=service,
                line=str(errormsg),
                component='deploy',
                level='event',
                cluster=cluster,
                instance=instance,
            )
        except (NoDeploymentsAvailable, NoDockerImageError):
            # No deployment/image yet means no desired config; skip quietly.
            pass
    return marathon_configs
Example #9
0
def main():
    """Run a replication check for every marathon instance in the cluster."""
    args = parse_args()
    soa_dir = args.soa_dir

    logging.basicConfig()
    # --verbose enables debug logging; otherwise warnings and above only.
    log.setLevel(logging.DEBUG if args.verbose else logging.WARNING)

    cluster = load_system_paasta_config().get_cluster()
    service_instances = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=soa_dir,
    )

    config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
    )
    for service, instance in service_instances:
        check_service_replication(
            client=client,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
        )
def emit_metrics_for_type(instance_type):
    """Emit paasta.service.{cpus,mem,disk,instances} gauges for every
    instance of the given type in the current cluster.

    :param instance_type: the paasta instance type to report on
    """
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(
        cluster=cluster,
        instance_type=instance_type,
    )

    for service, instance in instances:
        instance_config = get_instance_config(
            service=service,
            instance=instance,
            cluster=cluster,
        )
        # Tag every gauge with the service/cluster/instance it describes.
        dimensions = {
            'paasta_service': instance_config.service,
            'paasta_cluster': instance_config.cluster,
            'paasta_instance': instance_config.instance,
        }

        log.info(f"Emitting paasta.service.* with dimensions {dimensions}")
        for metric_name, metric_value in (
            ('paasta.service.cpus', instance_config.get_cpus()),
            ('paasta.service.mem', instance_config.get_mem()),
            ('paasta.service.disk', instance_config.get_disk()),
        ):
            gauge = yelp_meteorite.create_gauge(metric_name, dimensions)
            gauge.set(metric_value)
        # Only emit an instance count for types that define one, and only
        # when the service is not autoscaled (no max_instances configured).
        if hasattr(instance_config, 'get_instances'):
            if instance_config.get_max_instances() is None:
                gauge = yelp_meteorite.create_gauge('paasta.service.instances', dimensions)
                gauge.set(instance_config.get_instances())
def emit_metrics_for_type(instance_type):
    """Emit paasta.service.{cpus,mem,disk,instances} gauges for every
    instance of ``instance_type`` in the current cluster.

    :param instance_type: the paasta instance type to report on
    """
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(cluster=cluster, instance_type=instance_type)

    for service, instance in instances:
        service_instance_config = get_instance_config(
            service=service, instance=instance, cluster=cluster
        )
        # Tag every gauge with the service/cluster/instance/pool it describes.
        dimensions = {
            "paasta_service": service_instance_config.service,
            "paasta_cluster": service_instance_config.cluster,
            "paasta_instance": service_instance_config.instance,
            "paasta_pool": service_instance_config.get_pool(),
        }

        log.info(f"Emitting paasta.service.* with dimensions {dimensions}")
        gauge = yelp_meteorite.create_gauge("paasta.service.cpus", dimensions)
        gauge.set(service_instance_config.get_cpus())
        gauge = yelp_meteorite.create_gauge("paasta.service.mem", dimensions)
        gauge.set(service_instance_config.get_mem())
        gauge = yelp_meteorite.create_gauge("paasta.service.disk", dimensions)
        gauge.set(service_instance_config.get_disk())
        # Only emit an instance count for types that define one, and only
        # when the service is not autoscaled (no max_instances configured).
        if hasattr(service_instance_config, "get_instances"):
            if service_instance_config.get_max_instances() is None:
                gauge = yelp_meteorite.create_gauge(
                    "paasta.service.instances", dimensions
                )
                gauge.set(service_instance_config.get_instances())
Example #12
0
def main():
    """Print marathon service.instance job ids for the cluster, one per
    line; with --minimal, only the ones that currently need bouncing."""
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    if args.minimal:
        marathon_config = load_marathon_config()
        marathon_client = get_marathon_client(
            url=marathon_config.get_url(),
            user=marathon_config.get_username(),
            passwd=marathon_config.get_password(),
        )
        service_instances = get_service_instances_that_need_bouncing(
            marathon_client=marathon_client,
            soa_dir=soa_dir,
        )
    else:
        service_instances = [
            compose_job_id(name, instance)
            for name, instance in get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
        ]
    paasta_print('\n'.join(service_instances))
    sys.exit(0)
def cleanup_apps(soa_dir):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from"""
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())

    # (service, instance) pairs that still exist in the soa configs.
    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    for app_id in running_app_ids:
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Unparseable app ids are left alone rather than deleted.
            log.warn("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        # Any running app that no longer has a config is stale; remove it.
        if (service, instance) not in valid_services:
            delete_app(
                app_id=app_id,
                client=client,
                soa_dir=soa_dir,
            )
Example #14
0
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon service in the cluster.

    Takes the cluster-wide autoscaling lock, collects every marathon
    instance with max_instances set and a desired state of 'start', then
    feeds the healthy marathon/mesos tasks of each one (except those using
    the 'bespoke' decision policy) to autoscale_marathon_instance.

    :param soa_dir: The SOA config directory to read from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances(
                ) and service_config.get_desired_state() == 'start':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                all_marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks(
                    '')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        if config.get_autoscaling_params(
                        )['decision_policy'] != 'bespoke':
                            try:
                                job_id = format_job_id(config.service,
                                                       config.instance)
                                # Only tasks with health check results count
                                # as healthy inputs for scaling decisions.
                                marathon_tasks = {
                                    task.id: task
                                    for task in all_marathon_tasks
                                    if job_id == get_short_job_id(task.id)
                                    and task.health_check_results
                                }
                                if not marathon_tasks:
                                    raise MetricsProviderNoDataError(
                                        "Couldn't find any healthy marathon tasks"
                                    )
                                mesos_tasks = [
                                    task for task in all_mesos_tasks
                                    if task['id'] in marathon_tasks
                                ]
                                autoscale_marathon_instance(
                                    config, list(marathon_tasks.values()),
                                    mesos_tasks)
                            except Exception as e:
                                # BUG FIX: the original did `raise e` before
                                # write_to_log, making the log call dead code.
                                # Log first, then re-raise to preserve the
                                # original exception propagation.
                                write_to_log(config=config,
                                             line='Caught Exception %s' % e)
                                raise
    except LockHeldException:
        # Another autoscaler run holds the lock; nothing to do this round.
        pass
Example #15
0
def get_chronos_jobs_for_cluster(cluster=None, soa_dir=DEFAULT_SOA_DIR):
    """A chronos-specific wrapper around utils.get_services_for_cluster

    :param cluster: The cluster to read the configuration for
    :param soa_dir: The SOA config directory to read from
    :returns: A list of tuples of (service, job_name)"""
    return get_services_for_cluster(
        cluster=cluster,
        instance_type='chronos',
        soa_dir=soa_dir,
    )
def get_configs_of_services_to_scale(cluster, soa_dir=DEFAULT_SOA_DIR):
    """Return the marathon service configs of every instance in the cluster
    eligible for autoscaling: max_instances set, desired state 'start', and
    a decision policy other than 'bespoke'.

    :param cluster: the cluster to read configs for
    :param soa_dir: the SOA config directory to read from
    :returns: a list of MarathonServiceConfig objects
    """
    services = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=soa_dir,
    )
    configs = []
    for service, instance in services:
        try:
            service_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
        except NoDeploymentsAvailable:
            log.debug(
                "%s is not deployed yet, refusing to do autoscaling calculations for it"
                % compose_job_id(service, instance))
            continue

        eligible = (
            service_config.get_max_instances()
            and service_config.get_desired_state() == 'start'
            and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke'
        )
        if eligible:
            configs.append(service_config)

    return configs
def get_configs_of_services_to_scale(cluster, soa_dir=DEFAULT_SOA_DIR):
    """Return the marathon service configs of every instance in the cluster
    eligible for autoscaling: max_instances set, desired state 'start', and
    a decision policy other than 'bespoke'.

    :param cluster: the cluster to read configs for
    :param soa_dir: the SOA config directory to read from
    :returns: a list of MarathonServiceConfig objects
    """
    services = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=soa_dir,
    )
    configs = []
    for service, instance in services:
        try:
            service_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
        except NoDeploymentsAvailable:
            # No deployments means nothing is running; skip rather than fail.
            log.debug("%s is not deployed yet, refusing to do autoscaling calculations for it" %
                      compose_job_id(service, instance))
            continue

        if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
            configs.append(service_config)

    return configs
Example #18
0
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every marathon service in the cluster whose config sets
    max_instances and whose desired state is 'start'.

    Holds the cluster-wide autoscaling lock for the duration; if another
    run already holds it, does nothing.

    :param soa_dir: The SOA config directory to read from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                # Only autoscaled ("max_instances" set) and started services qualify.
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                # Empty string matches all app ids.
                mesos_tasks = get_running_tasks_from_active_frameworks('')
                for config in configs:
                    try:
                        autoscale_marathon_instance(config, marathon_tasks, mesos_tasks)
                    except Exception as e:
                        # Best effort: one failing instance must not stop the rest.
                        write_to_log(config=config, line='Caught Exception %s' % e, level='event')
    except LockHeldException:
        # Another autoscaler run holds the lock; nothing to do this round.
        pass
def main():
    """Print every marathon service.instance job id in the cluster, one per
    line, then exit 0."""
    args = parse_args()
    instances = get_services_for_cluster(cluster=args.cluster, instance_type='marathon', soa_dir=args.soa_dir)
    composed = []
    for name, instance in instances:
        composed.append(compose_job_id(name, instance))
    # BUG FIX: `print '...'` is Python 2 statement syntax and a SyntaxError
    # under Python 3; use the function form.
    print('\n'.join(composed))
    sys.exit(0)
def main():
    """Print every marathon service.instance job id in the cluster, one per
    line, then exit 0."""
    args = parse_args()
    instances = get_services_for_cluster(cluster=args.cluster, instance_type='marathon', soa_dir=args.soa_dir)
    composed = []
    for name, instance in instances:
        composed.append(compose_job_id(name, instance))
    # BUG FIX: `print '...'` is Python 2 statement syntax and a SyntaxError
    # under Python 3; use the function form.
    print('\n'.join(composed))
    sys.exit(0)
Example #21
0
 def add_all_services(self):
     """Enqueue every marathon instance in the cluster for processing.

     Used at daemon startup: the full instance list is rate limited so the
     initial bounce wave is spread out instead of hitting all at once.
     """
     instances = get_services_for_cluster(cluster=self.config.get_cluster(),
                                          instance_type='marathon',
                                          soa_dir=DEFAULT_SOA_DIR)
     # Stagger the startup bounces at the configured per-minute rate.
     instances_to_add = rate_limit_instances(instances=instances,
                                             number_per_minute=self.config.get_deployd_startup_bounce_rate(),
                                             watcher_name='daemon_start')
     for service_instance in instances_to_add:
         self.inbox_q.put(service_instance)
Example #22
0
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold"""
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    # (service, instance) pairs that still exist in the soa configs.
    valid_services = get_services_for_cluster(instance_type='marathon',
                                              soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    running_apps = []
    for app_id in running_app_ids:
        try:
            # Re-bind app_id to the (service, instance, git_sha, config_sha)
            # tuple; unparseable ids are skipped rather than killed.
            app_id = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            log.warn(
                "%s doesn't conform to paasta naming conventions? Skipping." %
                app_id)
            continue
        running_apps.append(app_id)
    # Running apps with no matching soa config are candidates for deletion.
    apps_to_kill = [(service, instance, git_sha, config_sha)
                    for service, instance, git_sha, config_sha in running_apps
                    if (service, instance) not in valid_services]

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    if running_apps:
        # Safety valve: refuse to kill more than kill_threshold of the
        # cluster in one run unless the caller passes force=True.
        above_kill_threshold = float(len(apps_to_kill)) / float(
            len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold, )
            raise DontKillEverythingError
    for running_app in apps_to_kill:
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
Example #23
0
def validate_chronos(service_path):
    """Check that any chronos configurations are valid.

    Validates each chronos instance of the service in every cluster it is
    configured for, including that parent-job references point at real jobs
    and that no job depends on itself.

    :param service_path: path to the service's soa-config directory
    :returns: True if every instance validated cleanly, False otherwise
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = 'chronos'
    chronos_spacer = paasta_tools.chronos_tools.INTERNAL_SPACER

    returncode = True

    # Temporary job names are reserved; a service using the identifier would
    # collide with them.
    if service.startswith(TMP_JOB_IDENTIFIER):
        paasta_print((
            "Services using scheduled tasks cannot be named %s, as it clashes with the "
            "identifier used for temporary jobs" % TMP_JOB_IDENTIFIER))
        return False
    for cluster in list_clusters(service, soa_dir, instance_type):
        services_in_cluster = get_services_for_cluster(cluster=cluster,
                                                       instance_type='chronos',
                                                       soa_dir=soa_dir)
        # Fully-qualified job names that may legally appear as parents.
        valid_services = {
            f"{name}{chronos_spacer}{instance}"
            for name, instance in services_in_cluster
        }
        for instance in list_all_instances_for_service(
                service=service,
                clusters=[cluster],
                instance_type=instance_type,
                soa_dir=soa_dir,
        ):
            cjc = load_chronos_job_config(service, instance, cluster, False,
                                          soa_dir)
            parents = cjc.get_parents() or []
            checks_passed, check_msgs = cjc.validate()

            for parent in parents:
                # Malformed parent refs are reported elsewhere; skip them here.
                if not check_parent_format(parent):
                    continue
                if f"{service}{chronos_spacer}{instance}" == parent:
                    checks_passed = False
                    check_msgs.append("Job %s cannot depend on itself" %
                                      parent)
                elif parent not in valid_services:
                    checks_passed = False
                    check_msgs.append("Parent job %s could not be found" %
                                      parent)

            # Remove duplicate check_msgs
            unique_check_msgs = list(set(check_msgs))

            if not checks_passed:
                paasta_print(
                    invalid_chronos_instance(cluster, instance,
                                             "\n  ".join(unique_check_msgs)))
                returncode = False
            else:
                paasta_print(valid_chronos_instance(cluster, instance))
    return returncode
Example #24
0
def main():
    """Print every kubernetes service.instance job id in the cluster, one
    per line, then exit 0."""
    args = parse_args()
    instances = get_services_for_cluster(
        cluster=args.cluster, instance_type="kubernetes", soa_dir=args.soa_dir
    )
    service_instances = [
        compose_job_id(name, instance) for name, instance in instances
    ]
    paasta_print("\n".join(service_instances))
    sys.exit(0)
Example #25
0
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: MarathonClients = None,
    system_paasta_config: SystemPaastaConfig = None,
) -> Marathon_Dashboard:
    """Build the per-cluster marathon dashboard payload: for every marathon
    instance, the shard URL of the marathon server it lives on.

    :param cluster: the cluster to build the dashboard for
    :param soa_dir: the SOA config directory to read from
    :param marathon_clients: optional pre-built clients; created if None
    :param system_paasta_config: optional pre-loaded config; loaded if None
    :returns: a dict mapping the cluster name to a list of
        {'service', 'instance', 'shard_url'} items
    """
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        # No soa configs for this cluster yet; emit an empty dashboard.
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(
        system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(
            marathon_servers=marathon_servers, cached=False)
    for service_instance in instances:
        service: str = service_instance[0]
        instance: str = service_instance[1]
        service_config: MarathonServiceConfig = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=cluster,
            load_deployments=False,
            soa_dir=soa_dir,
        )
        client: MarathonClient = marathon_clients.get_current_client_for_service(
            job_config=service_config)
        dashboard_links: Dict = system_paasta_config.get_dashboard_links()
        shard_url: str = client.servers[0]
        # Prefer the human-facing 'Marathon RO' dashboard link (if configured)
        # over the raw server URL: a list maps shard index -> link, a string
        # is a space-separated list whose first entry we take.
        if 'Marathon RO' in dashboard_links[cluster]:
            marathon_links = dashboard_links[cluster]['Marathon RO']
            if isinstance(marathon_links, list):
                for shard_number, shard in enumerate(marathon_servers.current):
                    if shard.url[0] == shard_url:
                        shard_url = marathon_links[shard_number]
            elif isinstance(marathon_links, str):
                shard_url = marathon_links.split(' ')[0]
        service_info: Marathon_Dashboard_Item = {
            'service': service,
            'instance': instance,
            'shard_url': shard_url,
        }
        dashboard[cluster].append(service_info)
    return dashboard
Example #26
0
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon service instance in this cluster.

    Eligible instances have ``max_instances`` set, a desired state of
    ``'start'``, and a non-bespoke autoscaling decision policy. Does nothing
    when another autoscaler already holds the lock.

    :param soa_dir: SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            eligible_configs = []
            for svc, inst in get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            ):
                cfg = load_marathon_service_config(
                    service=svc,
                    instance=inst,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                wants_autoscaling = (
                    cfg.get_max_instances() and
                    cfg.get_desired_state() == 'start' and
                    cfg.get_autoscaling_params()['decision_policy'] != 'bespoke'
                )
                if wants_autoscaling:
                    eligible_configs.append(cfg)

            if eligible_configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                # An empty string matches every app id.
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')
                with ZookeeperPool():
                    for cfg in eligible_configs:
                        try:
                            job_id = format_job_id(cfg.service, cfg.instance)
                            # Tasks with no healthcheck defined count as healthy;
                            # tasks with a defined healthcheck but no results
                            # count as unhealthy.
                            healthy_tasks = {
                                task.id: task
                                for task in all_marathon_tasks
                                if job_id == get_short_job_id(task.id) and (
                                    is_task_healthy(task) or
                                    not marathon_client.get_app(task.app_id).health_checks
                                )
                            }
                            if not healthy_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [
                                t for t in all_mesos_tasks if t['id'] in healthy_tasks
                            ]
                            autoscale_marathon_instance(
                                cfg, list(healthy_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=cfg, line='Caught Exception %s' % e)
    except LockHeldException:
        pass
Exemple #27
0
 def add_all_services(self):
     """Enqueue every marathon service instance in the cluster for a bounce.

     Used at daemon start; the enqueueing is rate-limited so startup does
     not bounce everything at once.
     """
     all_instances = get_services_for_cluster(
         cluster=self.config.get_cluster(),
         instance_type='marathon',
         soa_dir=DEFAULT_SOA_DIR,
     )
     throttled = rate_limit_instances(
         instances=all_instances,
         cluster=self.config.get_cluster(),
         number_per_minute=self.config.get_deployd_startup_bounce_rate(),
         watcher_name='daemon_start',
         priority=99,
     )
     for si in throttled:
         self.instances_that_need_to_be_bounced_in_the_future.put(si)
Exemple #28
0
def main():
    """Print one app name per line for every kubernetes instance in the cluster.

    With ``--sanitise`` the kubernetes app name is printed; otherwise the
    composed paasta job id is used. Always exits with status 0.
    """
    args = parse_args()
    pairs = get_services_for_cluster(cluster=args.cluster,
                                     instance_type="kubernetes",
                                     soa_dir=args.soa_dir)
    if args.sanitise:
        names = [
            kubernetes_tools.get_kubernetes_app_name(svc, inst)
            for svc, inst in pairs
        ]
    else:
        names = [compose_job_id(svc, inst) for svc, inst in pairs]
    print("\n".join(names))
    sys.exit(0)
Exemple #29
0
 def process_default(self, event: pyinotify.Event) -> None:
     """Handle an inotify event on the public (system) paasta config.

     Reloads the system config; if it differs from the cached copy, every
     marathon service instance whose config sha needs updating is enqueued
     for a staggered bounce.

     :param event: the pyinotify event that fired on the watched path
     """
     self.log.debug(event)
     # Newly created directories need their own inotify watch.
     self.watch_new_folder(event)
     # filter_event returns a falsy value for events we do not care about.
     event = self.filter_event(event)
     if event:
         self.log.debug(
             "Public config changed on disk, loading new config.")
         try:
             new_config = load_system_paasta_config()
         except ValueError:
             self.log.error(
                 "Couldn't load public config, the JSON is invalid!")
             return
         # (service, instance, config, sha) tuples that need a bounce.
         service_instance_configs: List[Tuple[str, str,
                                              MarathonServiceConfig,
                                              str]] = []
         if new_config != self.public_config:
             self.log.info(
                 "Public config has changed, now checking if it affects any services config shas."
             )
             self.public_config = new_config
             all_service_instances = get_services_for_cluster(
                 cluster=self.public_config.get_cluster(),
                 instance_type="marathon",
                 soa_dir=DEFAULT_SOA_DIR,
             )
             service_instance_configs = get_service_instances_needing_update(
                 self.marathon_clients,
                 all_service_instances,
                 self.public_config.get_cluster(),
             )
         if service_instance_configs:
             self.log.info(
                 f"{len(service_instance_configs)} service instances affected. Doing a staggered bounce."
             )
             for service, instance, config, _ in service_instance_configs:
                 # bounce_by pushes each deadline into the future so the
                 # bounces are spread out rather than simultaneous.
                 self.filewatcher.instances_to_bounce.put(
                     ServiceInstance(
                         service=service,
                         instance=instance,
                         watcher=type(self).__name__,
                         bounce_by=time.time() + self.public_config.
                         get_deployd_big_bounce_deadline(),
                         wait_until=time.time(),
                         enqueue_time=time.time(),
                         bounce_start_time=time.time(),
                     ))
Exemple #30
0
def validate_chronos(service_path):
    """Check that any chronos configurations are valid.

    :param service_path: path to the service's soa-configs directory
    :returns: True when every chronos instance in every cluster validates,
        False otherwise
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = 'chronos'
    chronos_spacer = paasta_tools.chronos_tools.INTERNAL_SPACER

    returncode = True
    for cluster in list_clusters(service, soa_dir, instance_type):
        services_in_cluster = get_services_for_cluster(cluster=cluster,
                                                       instance_type='chronos',
                                                       soa_dir=soa_dir)
        valid_services = set([
            "%s%s%s" % (name, chronos_spacer, instance)
            for name, instance in services_in_cluster
        ])
        for instance in list_all_instances_for_service(
                service=service,
                clusters=[cluster],
                instance_type=instance_type,
                soa_dir=soa_dir):
            cjc = load_chronos_job_config(service, instance, cluster, False,
                                          soa_dir)
            parents = cjc.get_parents() or []
            checks_passed, check_msgs = cjc.validate()

            # A job may not depend on itself, and each parent must exist in
            # this cluster.
            for parent in parents:
                if not check_parent_format(parent):
                    continue
                if "%s%s%s" % (service, chronos_spacer, instance) == parent:
                    checks_passed = False
                    check_msgs.append("Job %s cannot depend on itself" %
                                      parent)
                elif parent not in valid_services:
                    checks_passed = False
                    check_msgs.append("Parent job %s could not be found" %
                                      parent)

            # Remove duplicate check_msgs
            unique_check_msgs = list(set(check_msgs))

            if not checks_passed:
                # `print expr` is Python 2-only syntax; the function-call form
                # with a single argument behaves identically on Python 2 and 3.
                print(invalid_chronos_instance(cluster, instance,
                                               "\n  ".join(unique_check_msgs)))
                returncode = False
            else:
                print(valid_chronos_instance(cluster, instance))
    return returncode
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the fraction of apps to kill
        exceeds kill_threshold and force is False"""
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    running_apps = []
    for app_id in running_app_ids:
        try:
            app_id = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Logger.warn() is a deprecated alias of warning(); use the
            # canonical method name.
            log.warning("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        running_apps.append(app_id)
    # Kill anything running that no longer has a valid (service, instance).
    apps_to_kill = [(service, instance, git_sha, config_sha)
                    for service, instance, git_sha, config_sha in running_apps
                    if (service, instance) not in valid_services]

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    if running_apps:
        # Safety valve: refuse to mass-delete unless explicitly forced.
        above_kill_threshold = float(len(apps_to_kill)) / float(len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical("Paasta was about to kill more than %s of the running services, this "
                         "is probably a BAD mistake!, run again with --force if you "
                         "really need to destroy everything" % kill_threshold)
            raise DontKillEverythingError
    for running_app in apps_to_kill:
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
Exemple #32
0
def main(sys_argv):
    """Send a sensu event per (service, instance) describing recent OOM kills.

    :param sys_argv: full argv list; argv[0] is ignored
    """
    args = parse_args(sys_argv[1:])
    cluster = load_system_paasta_config().get_cluster()
    victims = latest_oom_events(cluster, args.superregion)
    for service, instance in get_services_for_cluster(cluster, soa_dir=args.soa_dir):
        try:
            config = get_instance_config(
                service=service,
                instance=instance,
                cluster=cluster,
                load_deployments=False,
                soa_dir=args.soa_dir,
            )
            recent_events = victims.get((service, instance), [])
            send_sensu_event(config, recent_events, args)
        except NotImplementedError:
            # Raised when the instance_type is not supported by
            # get_instance_config; skip such instances.
            pass
Exemple #33
0
def test_get_services_for_cluster():
    """get_services_for_cluster should aggregate instances from every service dir."""
    cluster = "honey_bunches_of_oats"
    soa_dir = "completely_wholesome"
    instances = [["this_is_testing", "all_the_things"], ["my_nerf_broke"]]
    expected = ["my_nerf_broke", "this_is_testing", "all_the_things"]
    # contextlib.nested was removed in Python 3; a single multi-manager
    # `with` statement is equivalent (and also works on Python 2.7).
    with mock.patch(
        "os.path.abspath", autospec=True, return_value="chex_mix",
    ) as abspath_patch, mock.patch(
        "os.listdir", autospec=True, return_value=["dir1", "dir2"],
    ) as listdir_patch, mock.patch(
        "paasta_tools.utils.get_service_instance_list",
        side_effect=lambda a, b, c, d: instances.pop(),
    ) as get_instances_patch:
        actual = utils.get_services_for_cluster(cluster, soa_dir=soa_dir)
        assert expected == actual
        abspath_patch.assert_called_once_with(soa_dir)
        listdir_patch.assert_called_once_with("chex_mix")
        get_instances_patch.assert_any_call("dir1", cluster, None, soa_dir)
        get_instances_patch.assert_any_call("dir2", cluster, None, soa_dir)
        assert get_instances_patch.call_count == 2
Exemple #34
0
def cleanup_unused_apps(soa_dir: str,
                        kill_threshold: float = 0.5,
                        force: bool = False) -> None:
    """Clean up old or invalid jobs/apps from kubernetes.

    Compares the applications currently running in kubernetes against the
    valid (service, instance) pairs from yelpsoa_configs and deletes any
    application not in the valid set.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the fraction of apps to delete
        exceeds kill_threshold and force is False
    """
    log.info("Creating KubeClient")
    kube_client = KubeClient()

    log.info("Loading running Kubernetes apps")
    running_apps = list_namespaced_applications(kube_client, "paasta",
                                                APPLICATION_TYPES)

    log.info("Retrieving valid apps from yelpsoa_configs")
    valid_services = set(
        get_services_for_cluster(instance_type="kubernetes", soa_dir=soa_dir))

    log.info("Determining apps to be killed")
    apps_to_kill = [
        app for app in running_apps
        if (app.kube_deployment.service,
            app.kube_deployment.instance) not in valid_services
    ]

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    if apps_to_kill:
        # Safety valve: refuse to mass-delete unless explicitly forced.
        kill_fraction = float(len(apps_to_kill)) / float(len(running_apps))
        if kill_fraction > float(kill_threshold) and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold)
            raise DontKillEverythingError

    for app in apps_to_kill:
        with alert_state_change(app, soa_dir):
            app.deep_delete(kube_client)
def create_prometheus_adapter_config(paasta_cluster: str,
                                     soa_dir: Path) -> PrometheusAdapterConfig:
    """
    Given a paasta cluster and a soaconfigs directory, create the necessary Prometheus adapter
    config to autoscale services.
    Currently supports the following metrics providers:
        * uwsgi
    """
    rules: List[PrometheusAdapterRule] = []
    # get_services_for_cluster() yields (service, instance) tuples, which would
    # cause duplicate work per service since PaastaServiceConfigLoader cannot
    # look up a single instance by name. Deduplicate to service names and let
    # the loader enumerate each service's instances itself.
    unique_services = {
        name
        for name, _ in get_services_for_cluster(
            cluster=paasta_cluster,
            instance_type="kubernetes",
            soa_dir=str(soa_dir),
        )
    }
    for name in unique_services:
        loader = PaastaServiceConfigLoader(service=name, soa_dir=str(soa_dir))
        for instance_config in loader.instance_configs(
                cluster=paasta_cluster,
                instance_type_class=KubernetesDeploymentConfig,
        ):
            instance_rules = get_rules_for_service_instance(
                service_name=name,
                instance_name=instance_config.instance,
                autoscaling_config=instance_config.get_autoscaling_params(),
                paasta_cluster=paasta_cluster,
            )
            rules.extend(instance_rules)

    # Sort the rules so two generated configmaps can be compared textually,
    # avoiding order-independent comparisons later: iteration order above is
    # not deterministic.
    return {
        "rules": sorted(rules, key=lambda rule: rule["name"]["as"]),
    }
Exemple #36
0
 def process_default(self, event):
     """Handle an inotify event on the public (system) paasta config.

     Reloads the system config and, when it has changed, rate-limits a
     bounce of every marathon service instance whose config sha needs
     updating.
     """
     self.log.debug(event)
     self.watch_new_folder(event)
     event = self.filter_event(event)
     if not event:
         return
     self.log.debug(
         "Public config changed on disk, loading new config.")
     try:
         new_config = load_system_paasta_config()
     except ValueError:
         self.log.error(
             "Couldn't load public config, the JSON is invalid!")
         return
     stale_instances: List[Tuple[str, str]] = []
     if new_config != self.public_config:
         self.log.info(
             "Public config has changed, now checking if it affects any services config shas."
         )
         self.public_config = new_config
         cluster = self.public_config.get_cluster()
         stale_instances = get_service_instances_needing_update(
             self.marathon_clients,
             get_services_for_cluster(
                 cluster=cluster,
                 instance_type='marathon',
                 soa_dir=DEFAULT_SOA_DIR,
             ),
             cluster,
         )
     if stale_instances:
         self.log.info(
             f"{len(stale_instances)} service instances affected. Doing a staggered bounce."
         )
         throttled = rate_limit_instances(
             instances=stale_instances,
             cluster=self.public_config.get_cluster(),
             number_per_minute=self.public_config.get_deployd_big_bounce_rate(),
             watcher_name=type(self).__name__,
             priority=99,
         )
         for si in throttled:
             self.filewatcher.instances_that_need_to_be_bounced_in_the_future.put(si)
 def add_all_services(self) -> None:
     """Enqueue a startup bounce for every marathon service instance."""
     all_instances = get_services_for_cluster(
         cluster=self.config.get_cluster(),
         instance_type="marathon",
         soa_dir=DEFAULT_SOA_DIR,
     )
     for service, instance in all_instances:
         now = time.time()
         self.instances_to_bounce.put(
             ServiceInstance(
                 service=service,
                 instance=instance,
                 watcher="daemon_start",
                 # Give each bounce a deadline relative to enqueue time.
                 bounce_by=now + self.config.get_deployd_startup_bounce_deadline(),
                 wait_until=now,
                 failures=0,
                 bounce_start_time=now,
                 enqueue_time=now,
             ))
Exemple #38
0
def main():
    """Print the marathon service instances in this cluster, one per line.

    With ``--minimal``, only instances that actually need bouncing are
    printed. Always exits with status 0.
    """
    args = parse_args()
    if args.minimal:
        system_paasta_config = load_system_paasta_config()
        marathon_servers = get_marathon_servers(system_paasta_config)
        clients = get_marathon_clients(marathon_servers)
        service_instances = get_service_instances_that_need_bouncing(
            marathon_clients=clients, soa_dir=args.soa_dir)
    else:
        service_instances = [
            compose_job_id(name, instance)
            for name, instance in get_services_for_cluster(
                cluster=args.cluster,
                instance_type="marathon",
                soa_dir=args.soa_dir,
            )
        ]
    print("\n".join(service_instances))
    sys.exit(0)
Exemple #39
0
def validate_chronos(service_path):
    """Check that any chronos configurations are valid.

    :param service_path: path to the service's soa-configs directory
    :returns: True when every chronos instance in every cluster validates,
        False otherwise
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = 'chronos'
    chronos_spacer = paasta_tools.chronos_tools.INTERNAL_SPACER

    returncode = True

    if service.startswith(TMP_JOB_IDENTIFIER):
        print("Services using scheduled tasks cannot be named %s, as it clashes with the"
              " identifier used for temporary jobs" % TMP_JOB_IDENTIFIER)
        return False
    for cluster in list_clusters(service, soa_dir, instance_type):
        services_in_cluster = get_services_for_cluster(cluster=cluster, instance_type='chronos', soa_dir=soa_dir)
        valid_services = set(["%s%s%s" % (name, chronos_spacer, instance) for name, instance in services_in_cluster])
        for instance in list_all_instances_for_service(
                service=service, clusters=[cluster], instance_type=instance_type,
                soa_dir=soa_dir):
            cjc = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            parents = cjc.get_parents() or []
            checks_passed, check_msgs = cjc.validate()

            # A job may not depend on itself, and each parent must exist in
            # this cluster.
            for parent in parents:
                if not check_parent_format(parent):
                    continue
                if "%s%s%s" % (service, chronos_spacer, instance) == parent:
                    checks_passed = False
                    check_msgs.append("Job %s cannot depend on itself" % parent)
                elif parent not in valid_services:
                    checks_passed = False
                    check_msgs.append("Parent job %s could not be found" % parent)

            # Remove duplicate check_msgs
            unique_check_msgs = list(set(check_msgs))

            if not checks_passed:
                # `print expr` is Python 2-only syntax; the function-call form
                # with a single argument behaves identically on Python 2 and 3.
                print(invalid_chronos_instance(cluster, instance, "\n  ".join(unique_check_msgs)))
                returncode = False
            else:
                print(valid_chronos_instance(cluster, instance))
    return returncode
def main():
    """Print the marathon service instances in this cluster, one per line.

    With --minimal, only instances that actually need bouncing are printed.
    Always exits with status 0.
    """
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    if args.minimal:
        marathon_config = load_marathon_config()
        marathon_client = get_marathon_client(
            url=marathon_config.get_url(),
            user=marathon_config.get_username(),
            passwd=marathon_config.get_password(),
        )
        service_instances = get_service_instances_that_need_bouncing(
            marathon_client=marathon_client, soa_dir=soa_dir)
    else:
        instances = get_services_for_cluster(cluster=cluster,
                                             instance_type='marathon',
                                             soa_dir=soa_dir)
        service_instances = []
        for name, instance in instances:
            service_instances.append(compose_job_id(name, instance))
    # `print expr` is Python 2-only syntax; the function-call form with a
    # single argument behaves identically on Python 2 and 3.
    print('\n'.join(service_instances))
    sys.exit(0)
def get_desired_marathon_configs(soa_dir):
    """Build the formatted marathon app dict for every deployed instance.

    :param soa_dir: SOA configuration directory to read from
    :returns: dict mapping marathon app id (without the leading slash) to the
        formatted marathon app dict; instances that raise NoDockerImageError
        (i.e. never deployed) are skipped
    """
    cluster = load_system_paasta_config().get_cluster()
    desired_configs = {}
    for service, instance in get_services_for_cluster(
        instance_type='marathon',
        cluster=cluster,
        soa_dir=soa_dir,
    ):
        try:
            app_dict = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            ).format_marathon_app_dict()
            desired_configs[app_dict['id'].lstrip('/')] = app_dict
        except NoDockerImageError:
            # This service hasn't been deployed yet
            pass
    return desired_configs
Exemple #42
0
def test_get_services_for_cluster():
    """get_services_for_cluster should aggregate instances from every service dir."""
    cluster = 'honey_bunches_of_oats'
    soa_dir = 'completely_wholesome'
    instances = [['this_is_testing', 'all_the_things'], ['my_nerf_broke']]
    expected = ['my_nerf_broke', 'this_is_testing', 'all_the_things']
    # contextlib.nested was removed in Python 3; a single multi-manager
    # `with` statement is equivalent (and also works on Python 2.7).
    with mock.patch(
        'os.path.abspath', autospec=True, return_value='chex_mix',
    ) as abspath_patch, mock.patch(
        'os.listdir', autospec=True, return_value=['dir1', 'dir2'],
    ) as listdir_patch, mock.patch(
        'paasta_tools.utils.get_service_instance_list',
        side_effect=lambda a, b, c, d: instances.pop(),
    ) as get_instances_patch:
        actual = utils.get_services_for_cluster(cluster, soa_dir=soa_dir)
        assert expected == actual
        abspath_patch.assert_called_once_with(soa_dir)
        listdir_patch.assert_called_once_with('chex_mix')
        get_instances_patch.assert_any_call('dir1', cluster, None, soa_dir)
        get_instances_patch.assert_any_call('dir2', cluster, None, soa_dir)
        assert get_instances_patch.call_count == 2
Exemple #43
0
def validate_chronos(service_path):
    """Check that any chronos configurations are valid.

    :param service_path: path to the service's soa-configs directory
    :returns: True when every chronos instance in every cluster validates,
        False otherwise
    """
    soa_dir, service = path_to_soa_dir_service(service_path)
    instance_type = "chronos"
    chronos_spacer = paasta_tools.chronos_tools.INTERNAL_SPACER

    returncode = True
    for cluster in list_clusters(service, soa_dir, instance_type):
        services_in_cluster = get_services_for_cluster(cluster=cluster, instance_type="chronos", soa_dir=soa_dir)
        valid_services = set(["%s%s%s" % (name, chronos_spacer, instance) for name, instance in services_in_cluster])
        for instance in list_all_instances_for_service(
            service=service, clusters=[cluster], instance_type=instance_type, soa_dir=soa_dir
        ):
            cjc = load_chronos_job_config(service, instance, cluster, False, soa_dir)
            parents = cjc.get_parents() or []
            checks_passed, check_msgs = cjc.validate()

            # A job may not depend on itself, and each parent must exist in
            # this cluster.
            for parent in parents:
                if not check_parent_format(parent):
                    continue
                if "%s%s%s" % (service, chronos_spacer, instance) == parent:
                    checks_passed = False
                    check_msgs.append("Job %s cannot depend on itself" % parent)
                elif parent not in valid_services:
                    checks_passed = False
                    check_msgs.append("Parent job %s could not be found" % parent)

            # Remove duplicate check_msgs
            unique_check_msgs = list(set(check_msgs))

            if not checks_passed:
                # `print expr` is Python 2-only syntax; the function-call form
                # with a single argument behaves identically on Python 2 and 3.
                print(invalid_chronos_instance(cluster, instance, "\n  ".join(unique_check_msgs)))
                returncode = False
            else:
                print(valid_chronos_instance(cluster, instance))
    return returncode