def send_event(name, instance, soa_dir, status, output):
    """Report the result of a ``setup_marathon_job`` run to sensu.

    The monitoring settings come from the marathon service config of the
    current cluster; ``check_every`` and ``alert_after`` are then forced to
    the values this check needs before the event is handed over to
    ``monitoring_tools.send_event``.

    :param name: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    current_cluster = load_system_paasta_config().get_cluster()
    service_config = marathon_tools.load_marathon_service_config(
        name,
        instance,
        current_cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    )
    overrides = service_config.get_monitoring()
    overrides.update({
        # Sensu needs to know how often this check fires; it has to match
        # the frequency of our cron job, which is 10s.
        'check_every': '10s',
        # setup_marathon_job failures are mostly transient and tend to get
        # fixed eventually, so delay alerting to cut down on noise.
        'alert_after': '10m',
    })
    job_id = compose_job_id(name, instance)
    check_name = 'setup_marathon_job.%s' % job_id
    monitoring_tools.send_event(name, check_name, overrides, status, output, soa_dir)
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send a replication-check event to sensu and log the same output.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param cluster: The cluster whose marathon service config is loaded
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    service_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=namespace,
        cluster=cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    )
    overrides = service_config.get_monitoring()
    # Only supply an alert_after when the service did not configure one.
    overrides.setdefault('alert_after', '2m')
    overrides['check_every'] = '1m'
    overrides['runbook'] = monitoring_tools.get_runbook(overrides, service, soa_dir=soa_dir)

    job_id = compose_job_id(service, namespace)
    check_name = 'check_marathon_services_replication.%s' % job_id
    monitoring_tools.send_event(service, check_name, overrides, status, output, soa_dir)

    log_line = 'Replication: %s' % output
    _log(
        service=service,
        line=log_line,
        component='monitoring',
        level='debug',
        cluster=cluster,
        instance=namespace,
    )
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send a replication-check event to sensu via pysensu_yelp and log it.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param cluster: The cluster whose marathon service config is loaded
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    # Load the service config from the caller's soa_dir so the monitoring
    # settings come from the same directory as the runbook lookup and the
    # event below, rather than from the loader's default directory.
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service, namespace, cluster, soa_dir=soa_dir,
    ).get_monitoring()
    # Respect an alert_after configured by the service; default it otherwise.
    if "alert_after" not in monitoring_overrides:
        monitoring_overrides["alert_after"] = "2m"
    monitoring_overrides["check_every"] = "1m"
    monitoring_overrides["runbook"] = monitoring_tools.get_runbook(monitoring_overrides, service, soa_dir=soa_dir)

    check_name = "check_marathon_services_replication.%s" % compose_job_id(service, namespace)
    monitoring_tools.send_event(service, check_name, monitoring_overrides, status, output, soa_dir)
    _log(
        service=service,
        line="Replication: %s" % output,
        component="monitoring",
        level="debug",
        cluster=cluster,
        instance=namespace,
    )
Example #4
0
def notify_service_owners(
    services: Mapping[str, Sequence[EvictedPod]],
    soa_dir: str,
    dry_run: bool,
) -> None:
    """Send one CRITICAL "pod-eviction" event per service listing its evicted pods.

    :param services: evicted pods grouped by service name
    :param soa_dir: passed through to ``send_event``
    :param dry_run: when true, only log what would have been sent
    """
    overrides = {
        "page": False,
        "alert_after": "0m",
        "realert_every": 1,
        "tip": (
            "Pods can be Evicted if they go over the allowed quota for a given resource. "
            "Check the Eviction message to figure out which resource quota was breached"
        ),
    }
    header = "The following pods have been evicted and will be removed from the cluster:\n"
    for service, evicted_pods in services.items():
        check_name = f"pod-eviction.{service}"
        pod_lines = [f"- {pod.podname}: {pod.eviction_msg}\n" for pod in evicted_pods]
        check_output = header + "".join(pod_lines)
        if dry_run:
            log.info(f"Would have notified owners for service {service}")
            continue
        log.info(f"Notifying owners for service {service}")
        send_event(
            service,
            check_name,
            overrides,
            Status.CRITICAL,
            check_output,
            soa_dir,
        )
Example #5
0
def send_sensu_bounce_keepalive(service, instance, cluster, soa_dir):
    """Send an OK Sensu event carrying a ``ttl`` to signal a steady state.

    This event is **not** fired while a bounce is in progress. If a bounce
    goes on for too long, the ``ttl`` expires and Sensu emits a new event
    saying that this check did not check in within its time-to-live.

    :param service: The service name the event is about
    :param instance: The instance of the service the event is about
    :param cluster: The cluster whose marathon service config is loaded
    :param soa_dir: The service directory to read monitoring information from
    """
    service_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    )
    overrides = service_config.get_monitoring()
    overrides.update({
        # Sensu currently emits events for expired ttl checks every 30s
        'check_every': '30s',
        'alert_after': '2m',
        'runbook': 'http://y/paasta-troubleshooting',
        'tip': (
            "Check out `paasta logs`. If the bounce hasn't made progress, "
            "it may mean that the new version isn't healthy."
        ),
        # Dogfooding this alert until it is known not to spam people
        'team': 'noop',
        'notification_email': '*****@*****.**',
    })

    check_name = 'paasta_bounce_progress.%s' % compose_job_id(service, instance)
    monitoring_tools.send_event(
        service=service,
        check_name=check_name,
        overrides=overrides,
        status=pysensu_yelp.Status.OK,
        output="The bounce is in a steady state",
        soa_dir=soa_dir,
        ttl='1h',
    )
Example #6
0
def send_sensu_bounce_keepalive(service, instance, cluster, soa_dir):
    """Send a Sensu event with a special ``ttl`` to signal that all is fine.

    This event is **not** fired when the bounce is in progress. If the bounce
    goes on for too long, the ``ttl`` will expire and Sensu will emit a new
    event saying that this one didn't check in within the expected
    time-to-live.

    :param service: The service name the event is about
    :param instance: The instance of the service the event is about
    :param cluster: The cluster whose marathon service config is loaded
    :param soa_dir: The service directory to read monitoring information from
    """
    ttl = '1h'
    # Forward soa_dir so the monitoring config is read from the same directory
    # the event is sent with, not from the loader's default directory.
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    # Sensu currently emits events for expired ttl checks every 30s
    monitoring_overrides['check_every'] = '30s'
    monitoring_overrides['alert_after'] = '2m'
    monitoring_overrides['runbook'] = 'http://y/paasta-troubleshooting'
    monitoring_overrides['tip'] = ("Check out `paasta logs`. If the bounce hasn't made progress, "
                                   "it may mean that the new version isn't healthy.")
    # Dogfooding this alert until it is known not to spam people
    monitoring_overrides['team'] = 'noop'
    monitoring_overrides['notification_email'] = '*****@*****.**'

    monitoring_tools.send_event(
        service=service,
        check_name='paasta_bounce_progress.%s' % compose_job_id(service, instance),
        overrides=monitoring_overrides,
        status=pysensu_yelp.Status.OK,
        output="The bounce is in a steady state",
        soa_dir=soa_dir,
        ttl=ttl,
    )
Example #7
0
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send a replication-check event to sensu via pysensu_yelp and log it.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param cluster: The cluster whose marathon service config is loaded
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    # Pass soa_dir through: without it the monitoring settings would be read
    # from the loader's default directory while the runbook lookup and the
    # event below use the caller's soa_dir.
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service,
        namespace,
        cluster,
        soa_dir=soa_dir,
    ).get_monitoring()
    # Respect an alert_after configured by the service; default it otherwise.
    if 'alert_after' not in monitoring_overrides:
        monitoring_overrides['alert_after'] = '2m'
    monitoring_overrides['check_every'] = '1m'
    monitoring_overrides['runbook'] = monitoring_tools.get_runbook(monitoring_overrides, service, soa_dir=soa_dir)

    check_name = 'check_marathon_services_replication.%s' % compose_job_id(service, namespace)
    monitoring_tools.send_event(service, check_name, monitoring_overrides, status, output, soa_dir)
    _log(
        service=service,
        line='Replication: %s' % output,
        component='monitoring',
        level='debug',
        cluster=cluster,
        instance=namespace,
    )
Example #8
0
def perform_security_check(args):
    """Run the configured security-check command and report the result to sensu.

    Prints a hint on how to fix failures. If you are at Yelp, see
    https://confluence.yelpcorp.com/display/PAASTA/PaaSTA+security-check+explained
    to learn more.

    :param args: service - the name of the service; commit - upstream git commit.
    :return: 0 if the security-check passed (or no command is configured),
        non-zero if it failed.
    """
    command = load_system_paasta_config().get_security_check_command()
    if not command:
        paasta_print("Nothing to be executed during the security-check step")
        return 0

    ret_code, output = _run(command, timeout=300, stream=True)
    if ret_code != 0:
        paasta_print(
            "The security-check failed. Please visit y/security-check-runbook to learn how to fix it ("
            "including whitelisting safe versions of packages).")
        sensu_status = pysensu_yelp.Status.CRITICAL
    else:
        sensu_status = pysensu_yelp.Status.OK

    # The event never pages; it asks for a ticket instead.
    overrides = {'page': False, 'ticket': True}
    send_event(
        service=args.service,
        check_name='%s.security_check' % args.service,
        overrides=overrides,
        status=sensu_status,
        output=output,
        soa_dir=DEFAULT_SOA_DIR,
    )

    return ret_code
def send_event(service, instance, soa_dir, status, output):
    """Report the result of a ``setup_chronos_job`` run to sensu.

    The monitoring settings are read from the chronos job config of the
    current cluster and adjusted for this check before the event is sent.

    :param service: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    current_cluster = load_system_paasta_config().get_cluster()
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=current_cluster,
        soa_dir=soa_dir,
    )
    overrides = job_config.get_monitoring()
    overrides.update({
        # Sensu needs to know how often this check fires; it has to match
        # the frequency of our cron job, which is 10s.
        'check_every': '10s',
        # deploy_chronos_jobs failures are mostly transient and tend to get
        # fixed eventually, so delay alerting to cut down on noise.
        'alert_after': '10m',
    })
    job_id = compose_job_id(service, instance)
    monitoring_tools.send_event(
        service=service,
        check_name='setup_chronos_job.%s' % job_id,
        overrides=overrides,
        status=status,
        output=output,
        soa_dir=soa_dir,
    )
Example #10
0
def send_event(service, instance, monitoring_overrides, soa_dir, status_code, message):
    """Send the ``check_chronos_jobs`` event for a service instance to sensu.

    :param service: The service name the event is about
    :param instance: The instance of the service the event is about
    :param monitoring_overrides: overrides forwarded unchanged to the event
    :param soa_dir: The service directory forwarded to the event
    :param status_code: The status to emit for this event
    :param message: The output to emit for this event
    """
    sensu_check_name = compose_check_name_for_service_instance(
        'check_chronos_jobs',
        service,
        instance,
    )
    monitoring_tools.send_event(
        service=service,
        check_name=sensu_check_name,
        overrides=monitoring_overrides,
        status=status_code,
        output=message,
        soa_dir=soa_dir,
    )
Example #11
0
def send_event(service, instance, monitoring_overrides, soa_dir, status_code, message):
    """Forward a ``check_chronos_jobs`` result to ``monitoring_tools.send_event``.

    The check name is derived from the service and instance; every other
    argument is passed through under the keyword the monitoring helper expects.
    """
    event_kwargs = dict(
        service=service,
        check_name=compose_check_name_for_service_instance('check_chronos_jobs', service, instance),
        overrides=monitoring_overrides,
        status=status_code,
        output=message,
        soa_dir=soa_dir,
    )
    monitoring_tools.send_event(**event_kwargs)
Example #12
0
    def test_send_event_sensu_host_is_None(self):
        """No event may reach pysensu_yelp when the sensu host is None."""
        self.fake_cluster = 'fake_cluster'
        service = 'fake_service'
        check_name = 'fake_check_name'
        overrides = {}
        status = '42'
        output = 'The http port is not open'
        soa_dir = '/fake/soa/dir'
        sensu_port = 12345

        with mock.patch("paasta_tools.monitoring_tools.get_team", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_tip", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_notification_email", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_irc_channels", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_ticket", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_project", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_page", autospec=True), \
                mock.patch("pysensu_yelp.send_event", autospec=True) as sensu_send_event, \
                mock.patch(
                    'paasta_tools.monitoring_tools.load_system_paasta_config',
                    autospec=True,
                ) as load_system_config:
            # The system config reports a port but no host.
            system_config = load_system_config.return_value
            system_config.get_sensu_host = mock.Mock(return_value=None)
            system_config.get_sensu_port = mock.Mock(return_value=sensu_port)

            monitoring_tools.send_event(service, check_name, overrides, status, output, soa_dir)

            assert sensu_send_event.call_count == 0
Example #13
0
def delete_app(app_id, client, soa_dir):
    """Delete a marathon app safely and log to notify the user that it happened.

    The app is deleted while holding the bounce lock for its short job id.
    After a successful delete, OK events are sent for the replication check
    and the setup_marathon_job check of the instance, and a 'deploy' event
    line is logged.

    :param app_id: The full marathon app id to delete
    :param client: The marathon client handed to bounce_lib.delete_marathon_app
    :param soa_dir: The service directory forwarded to send_event
    :raises Exception: anything other than IOError is logged line by line at
        debug level and re-raised
    """
    # Logger.warn is a deprecated alias; warning() with lazy %-args is the
    # supported form and produces the same message.
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
        # Resolve both checks tied to this instance now that it is gone.
        for check_prefix in ('check_marathon_services_replication', 'setup_marathon_job'):
            send_event(
                service=service,
                check_name='%s.%s' % (check_prefix, short_app_id),
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                overrides={},
                output="This instance was removed and is no longer running",
            )
        log_line = "Deleted stale marathon job that looks lost: %s" % app_id
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
            line=log_line,
        )
    except IOError:
        # Per the message below, IOError is taken to mean a bounce is underway.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                # Reuse the cluster resolved above instead of reloading the
                # system config for every traceback line.
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
Example #14
0
def delete_app(app_id, client, soa_dir):
    """Delete a marathon app safely and log to notify the user that it happened.

    The delete runs under the bounce lock of the app's short job id. On
    success, OK events are sent for the instance's replication check and its
    setup_marathon_job check, followed by a 'deploy' event log line.

    :param app_id: The full marathon app id to delete
    :param client: The marathon client handed to bounce_lib.delete_marathon_app
    :param soa_dir: The service directory forwarded to send_event
    :raises Exception: anything other than IOError is logged line by line at
        debug level and re-raised
    """
    # Logger.warn is a deprecated alias of Logger.warning; the lazy %-args
    # form yields the same message text.
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    removed_output = "This instance was removed and is no longer running"
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
        # Resolve both checks tied to this instance now that it is gone.
        for check_prefix in ('check_marathon_services_replication', 'setup_marathon_job'):
            send_event(
                service=service,
                check_name='%s.%s' % (check_prefix, short_app_id),
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                overrides={},
                output=removed_output,
            )
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
            line="Deleted stale marathon job that looks lost: %s" % app_id,
        )
    except IOError:
        # Per the message below, IOError is taken to mean a bounce is underway.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                # The cluster was already resolved above; no need to reload
                # the system config once per traceback line.
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
Example #15
0
def send_sensu_event(instance, oom_events, args):
    """Send the "oom-killer" sensu event for one service instance.

    :param instance: InstanceConfig
    :param oom_events: a list of OOMEvents
    :param args: object whose ``realert_every`` attribute is used for the event
    :return: whatever ``monitoring_tools.send_event`` returns
    """
    check_name = compose_check_name_for_service_instance(
        "oom-killer",
        instance.service,
        instance.instance,
    )
    overrides = instance.get_monitoring()
    # Services may switch the check off through "check_oom_events".
    check_enabled = overrides.get("check_oom_events", True)
    status = compose_sensu_status(
        instance=instance,
        oom_events=oom_events,
        is_check_enabled=check_enabled,
    )
    event_overrides = {
        "page": False,
        "ticket": False,
        "alert_after": "0m",
        "realert_every": args.realert_every,
        "runbook": "y/check-oom-events",
        "tip": "Try bumping the memory limit past %dMB" % instance.get_mem(),
    }
    overrides.update(event_overrides)
    return monitoring_tools.send_event(
        service=instance.service,
        check_name=check_name,
        overrides=overrides,
        status=status[0],
        output=status[1],
        soa_dir=instance.soa_dir,
    )
Example #16
0
def update_check_status(service, output, status):
    """Send the ``long_running_spark_jobs`` event for a service.

    :param service: The service name the event is about
    :param output: The output to emit for this event
    :param status: The status to emit for this event
    """
    check_name = f"long_running_spark_jobs.{service}"
    # Never page; alert immediately and open a ticket instead.
    check_overrides = {
        "page": False,
        "alert_after": 0,
        "tip": "Ask the user to check the job UI and terminate the job if appropriate.",
        "runbook": "http://y/spark-debug",
        "ticket": True,
    }
    send_event(
        service=service,
        check_name=check_name,
        overrides=check_overrides,
        status=status,
        output=output,
        soa_dir=DEFAULT_SOA_DIR,
    )
Example #17
0
def update_check_status(service, output, status):
    """Report the state of the long-running spark jobs check for a service.

    :param service: The service name the event is about
    :param output: The output to emit for this event
    :param status: The status to emit for this event
    """
    event = dict(
        service=service,
        check_name=f'long_running_spark_jobs.{service}',
        overrides={
            'page': False,
            'alert_after': 0,
            'tip': 'Ask the user to check the job UI and terminate the job if appropriate.',
            'runbook': 'http://y/spark-debug',
            'ticket': True,
        },
        status=status,
        output=output,
        soa_dir=DEFAULT_SOA_DIR,
    )
    send_event(**event)
Example #18
0
def send_sensu_event(instance, oom_events, args):
    """Send the "oom-killer" sensu event for one service instance.

    Several overrides below are test-only values, marked with TODOs.

    :param instance: InstanceConfig
    :param oom_events: a list of OOMEvents
    :param args: object whose ``realert_every`` attribute is used for the event
    :return: whatever ``monitoring_tools.send_event`` returns
    """
    check_name = compose_check_name_for_service_instance('oom-killer', instance.service, instance.instance)
    overrides = instance.get_monitoring()
    check_enabled = overrides.get('check_oom_events', True)
    status = compose_sensu_status(
        instance=instance,
        oom_events=oom_events,
        is_check_enabled=check_enabled,
    )
    event_overrides = {
        'team': 'noop',  # TODO: remove after testing
        'page': False,
        'ticket': False,
        'alert_after': '0m',
        'realert_every': args.realert_every,
        'notification_email': False,  # TODO: remove after testing
        'irc_channels': ['#oom-test'],  # TODO: remove after testing
        'runbook': 'http://y/none',  # TODO: needs a link
        'tip': 'Try bumping the memory limit past %dMB' % instance.get_mem(),
    }
    overrides.update(event_overrides)
    return monitoring_tools.send_event(
        service=instance.service,
        check_name=check_name,
        overrides=overrides,
        status=status[0],
        output=status[1],
        soa_dir=instance.soa_dir,
    )
Example #19
0
def send_sensu_event(instance, oom_events, args):
    """Compose and send the "oom-killer" sensu event for a service instance.

    :param instance: InstanceConfig
    :param oom_events: a list of OOMEvents
    :param args: object whose ``realert_every`` attribute is used for the event
    :return: whatever ``monitoring_tools.send_event`` returns
    """
    sensu_check_name = compose_check_name_for_service_instance(
        'oom-killer', instance.service, instance.instance,
    )
    monitoring = instance.get_monitoring()
    status = compose_sensu_status(
        instance=instance,
        oom_events=oom_events,
        # The check runs unless the service opts out via 'check_oom_events'.
        is_check_enabled=monitoring.get('check_oom_events', True),
    )
    forced_settings = {
        'page': False,
        'ticket': False,
        'alert_after': '0m',
        'realert_every': args.realert_every,
        'runbook': 'y/check-oom-events',
        'tip': 'Try bumping the memory limit past %dMB' % instance.get_mem(),
    }
    monitoring.update(forced_settings)
    return monitoring_tools.send_event(
        service=instance.service,
        check_name=sensu_check_name,
        overrides=monitoring,
        status=status[0],
        output=status[1],
        soa_dir=instance.soa_dir,
    )
Example #20
0
    def test_send_event_sensu_host_is_None(self):
        """send_event must not call pysensu_yelp when the sensu host is None."""
        fake_service = 'fake_service'
        fake_monitoring_overrides = {}
        fake_check_name = 'fake_check_name'
        fake_status = '42'
        fake_output = 'The http port is not open'
        fake_soa_dir = '/fake/soa/dir'
        self.fake_cluster = 'fake_cluster'
        fake_sensu_port = 12345

        # contextlib.nested does not exist in Python 3. A single ``with``
        # statement with several managers enters them in the same order and
        # works on both Python 2.7 and 3. Only the patches the test inspects
        # are bound to names.
        with mock.patch("paasta_tools.monitoring_tools.get_team", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_tip", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_notification_email", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_irc_channels", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_ticket", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_project", autospec=True), \
                mock.patch("paasta_tools.monitoring_tools.get_page", autospec=True), \
                mock.patch("pysensu_yelp.send_event", autospec=True) as pysensu_yelp_send_event_patch, \
                mock.patch(
                    'paasta_tools.monitoring_tools.load_system_paasta_config',
                    autospec=True,
                ) as load_system_paasta_config_patch:
            # The system config reports a port but no host.
            load_system_paasta_config_patch.return_value.get_sensu_host = mock.Mock(return_value=None)
            load_system_paasta_config_patch.return_value.get_sensu_port = mock.Mock(return_value=fake_sensu_port)

            monitoring_tools.send_event(
                fake_service,
                fake_check_name,
                fake_monitoring_overrides,
                fake_status,
                fake_output,
                fake_soa_dir,
            )

            assert pysensu_yelp_send_event_patch.call_count == 0
def send_event(name, instance, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param name: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    cluster = load_system_paasta_config().get_cluster()
    # Forward soa_dir so the monitoring settings are read from the directory
    # the caller asked for, not from the loader's default directory.
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        name,
        instance,
        cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    # In order to let sensu know how often to expect this check to fire,
    # we need to set the ``check_every`` to the frequency of our cron job, which
    # is 10s.
    monitoring_overrides["check_every"] = "10s"
    # Most setup_marathon_job failures are transient and represent issues
    # that will probably be fixed eventually, so we set an alert_after
    # to suppress extra noise
    monitoring_overrides["alert_after"] = "10m"
    check_name = "setup_marathon_job.%s" % compose_job_id(name, instance)
    monitoring_tools.send_event(name, check_name, monitoring_overrides, status, output, soa_dir)
def send_event(instance_config, status, output):
    """Send a replication-check event for an instance to sensu and log it.

    :param instance_config: an instance of MarathonServiceConfig
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    overrides = instance_config.get_monitoring()
    # Only supply an alert_after when the service did not configure one.
    overrides.setdefault('alert_after', '2m')
    overrides['check_every'] = '1m'
    overrides['runbook'] = monitoring_tools.get_runbook(
        overrides,
        instance_config.service,
        soa_dir=instance_config.soa_dir,
    )

    check_name = 'check_marathon_services_replication.%s' % instance_config.job_id
    monitoring_tools.send_event(
        service=instance_config.service,
        check_name=check_name,
        overrides=overrides,
        status=status,
        output=output,
        soa_dir=instance_config.soa_dir,
        cluster=instance_config.cluster,
    )

    log_line = 'Replication: %s' % output
    _log(
        service=instance_config.service,
        line=log_line,
        component='monitoring',
        level='debug',
        cluster=instance_config.cluster,
        instance=instance_config.instance,
    )
Example #23
0
def send_sensu_event(instance, oom_events, args):
    """Send the "oom-killer" sensu event for one service instance.

    Nothing is sent when ``compose_sensu_status`` returns a falsy status.

    :param instance: InstanceConfig
    :param oom_events: a list of OOMEvents
    :param args: object providing ``alert_threshold``, ``check_interval``,
        ``realert_every`` and ``dry_run``
    :return: whatever ``monitoring_tools.send_event`` returns, or None when
        no event is sent
    """
    check_name = compose_check_name_for_service_instance(
        "oom-killer",
        instance.service,
        instance.instance,
    )
    overrides = instance.get_monitoring()
    check_enabled = overrides.get("check_oom_events", True)
    status = compose_sensu_status(
        instance=instance,
        oom_events=oom_events,
        is_check_enabled=check_enabled,
        alert_threshold=args.alert_threshold,
        check_interval=args.check_interval,
    )
    if not status:
        return

    mem = instance.get_mem()
    try:
        mem_description = f"{int(mem)}MB"
    except ValueError:
        # Fall back to the raw value when it cannot be converted to an int.
        mem_description = mem

    event_overrides = {
        "page": False,
        "alert_after": "0m",
        "realert_every": args.realert_every,
        "runbook": "y/check-oom-events",
        "tip": (
            "Follow the runbook to investigate and rightsize memory usage "
            f"(curr: {mem_description})"
        ),
    }
    overrides.update(event_overrides)
    return monitoring_tools.send_event(
        service=instance.service,
        check_name=check_name,
        overrides=overrides,
        status=status[0],
        output=status[1],
        soa_dir=instance.soa_dir,
        dry_run=args.dry_run,
    )
Example #24
0
def send_event(chronos_job_config, status_code, output):
    """Build the monitoring overrides for a Chronos job and send a Sensu event.

    :param chronos_job_config: an instance of ChronosJobConfig
    :param status_code: Sensu status code
    :param output: An event message
    :returns: the result of ``monitoring_tools.send_event``
    """
    overrides = compose_monitoring_overrides_for_service(chronos_job_config)
    service = chronos_job_config.service
    check_name = check_chronos_job_name(service, chronos_job_config.instance)
    # The message is passed through add_realert_status together with the
    # configured 'realert_every' override before being sent.
    event_output = add_realert_status(output, overrides.get('realert_every'))

    return monitoring_tools.send_event(
        service=service,
        check_name=check_name,
        overrides=overrides,
        status=status_code,
        output=event_output,
        soa_dir=chronos_job_config.soa_dir,
    )
Example #25
0
    def test_send_event(self):
        """Check that monitoring_tools.send_event forwards settings to pysensu_yelp.

        Each per-service lookup helper is patched to return a fixed value and
        the system config is stubbed with a fake cluster and Sensu endpoint.
        The test then verifies the arguments of the lookups and of the single
        ``pysensu_yelp.send_event`` call.
        """
        fake_service = 'fake_service'
        fake_monitoring_overrides = {}
        fake_check_name = 'fake_check_name'
        fake_status = '42'
        fake_output = 'The http port is not open'
        fake_team = 'fake_team'
        fake_tip = 'fake_tip'
        fake_notification_email = 'fake@notify'
        fake_irc = '#fake'
        fake_soa_dir = '/fake/soa/dir'
        self.fake_cluster = 'fake_cluster'
        fake_sensu_host = 'fake_sensu_host'
        fake_sensu_port = 12345
        expected_runbook = 'http://y/paasta-troubleshooting'
        expected_check_name = fake_check_name
        expected_kwargs = {
            'tip': fake_tip,
            'notification_email': fake_notification_email,
            'irc_channels': fake_irc,
            'project': None,
            'ticket': False,
            'page': True,
            'alert_after': '5m',
            'check_every': '1m',
            'realert_every': -1,
            'source': 'paasta-fake_cluster',
            'ttl': None,
        }
        # The patches are chained in one ``with`` statement instead of
        # ``contextlib.nested``, which was deprecated in Python 2.7 and does
        # not exist in Python 3.
        with mock.patch(
            "paasta_tools.monitoring_tools.get_team",
            return_value=fake_team,
            autospec=True,
        ) as get_team_patch, mock.patch(
            "paasta_tools.monitoring_tools.get_tip",
            return_value=fake_tip,
            autospec=True,
        ) as get_tip_patch, mock.patch(
            "paasta_tools.monitoring_tools.get_notification_email",
            return_value=fake_notification_email,
            autospec=True,
        ) as get_notification_email_patch, mock.patch(
            "paasta_tools.monitoring_tools.get_irc_channels",
            return_value=fake_irc,
            autospec=True,
        ) as get_irc_patch, mock.patch(
            "paasta_tools.monitoring_tools.get_ticket",
            return_value=False,
            autospec=True,
        ), mock.patch(
            "paasta_tools.monitoring_tools.get_project",
            return_value=None,
            autospec=True,
        ), mock.patch(
            "paasta_tools.monitoring_tools.get_page",
            return_value=True,
            autospec=True,
        ) as get_page_patch, mock.patch(
            "pysensu_yelp.send_event",
            autospec=True,
        ) as pysensu_yelp_send_event_patch, mock.patch(
            'paasta_tools.monitoring_tools.load_system_paasta_config',
            autospec=True,
        ) as load_system_paasta_config_patch:
            load_system_paasta_config_patch.return_value.get_cluster = mock.Mock(return_value=self.fake_cluster)
            load_system_paasta_config_patch.return_value.get_sensu_host = mock.Mock(return_value=fake_sensu_host)
            load_system_paasta_config_patch.return_value.get_sensu_port = mock.Mock(return_value=fake_sensu_port)

            monitoring_tools.send_event(
                fake_service,
                fake_check_name,
                fake_monitoring_overrides,
                fake_status,
                fake_output,
                fake_soa_dir
            )

            get_team_patch.assert_called_once_with(
                fake_monitoring_overrides,
                fake_service,
                fake_soa_dir,
            )
            get_tip_patch.assert_called_once_with(
                fake_monitoring_overrides,
                fake_service,
                fake_soa_dir
            )
            get_notification_email_patch.assert_called_once_with(
                fake_monitoring_overrides,
                fake_service,
                fake_soa_dir
            )
            get_irc_patch.assert_called_once_with(
                fake_monitoring_overrides,
                fake_service,
                fake_soa_dir
            )
            get_page_patch.assert_called_once_with(
                fake_monitoring_overrides,
                fake_service,
                fake_soa_dir
            )
            # Check name, runbook, status, output and team are expected
            # positionally; everything else as keyword arguments.
            pysensu_yelp_send_event_patch.assert_called_once_with(
                expected_check_name,
                expected_runbook,
                fake_status,
                fake_output,
                fake_team,
                sensu_host=fake_sensu_host,
                sensu_port=fake_sensu_port,
                **expected_kwargs
            )
            load_system_paasta_config_patch.return_value.get_cluster.assert_called_once_with()
Example #26
0
    def test_send_event(self):
        """Check that monitoring_tools.send_event forwards settings to pysensu_yelp.

        The per-service lookup helpers and the system config loader are
        patched with fixed return values; the test asserts how the lookups
        were called and that ``pysensu_yelp.send_event`` was called exactly
        once with the expected arguments.
        """
        fake_service = 'fake_service'
        fake_monitoring_overrides = {}
        fake_check_name = 'fake_check_name'
        fake_status = '42'
        fake_output = 'The http port is not open'
        fake_team = 'fake_team'
        fake_tip = 'fake_tip'
        fake_notification_email = 'fake@notify'
        fake_irc = '#fake'
        fake_soa_dir = '/fake/soa/dir'
        self.fake_cluster = 'fake_cluster'
        fake_sensu_host = 'fake_sensu_host'
        fake_sensu_port = 12345
        expected_runbook = 'http://y/paasta-troubleshooting'
        expected_check_name = fake_check_name
        expected_kwargs = {
            'tip': fake_tip,
            'notification_email': fake_notification_email,
            'irc_channels': fake_irc,
            'project': None,
            'ticket': False,
            'page': True,
            'alert_after': '5m',
            'check_every': '1m',
            'realert_every': -1,
            'source': 'paasta-fake_cluster',
            'ttl': None,
        }
        # A single multi-item ``with`` replaces ``contextlib.nested``, which
        # was deprecated in Python 2.7 and removed in Python 3.
        with mock.patch(
            "paasta_tools.monitoring_tools.get_team",
            return_value=fake_team,
            autospec=True,
        ) as get_team_patch, mock.patch(
            "paasta_tools.monitoring_tools.get_tip",
            return_value=fake_tip,
            autospec=True,
        ) as get_tip_patch, mock.patch(
            "paasta_tools.monitoring_tools.get_notification_email",
            return_value=fake_notification_email,
            autospec=True,
        ) as get_notification_email_patch, mock.patch(
            "paasta_tools.monitoring_tools.get_irc_channels",
            return_value=fake_irc,
            autospec=True,
        ) as get_irc_patch, mock.patch(
            "paasta_tools.monitoring_tools.get_ticket",
            return_value=False,
            autospec=True,
        ), mock.patch(
            "paasta_tools.monitoring_tools.get_project",
            return_value=None,
            autospec=True,
        ), mock.patch(
            "paasta_tools.monitoring_tools.get_page",
            return_value=True,
            autospec=True,
        ) as get_page_patch, mock.patch(
            "pysensu_yelp.send_event",
            autospec=True,
        ) as pysensu_yelp_send_event_patch, mock.patch(
            'paasta_tools.monitoring_tools.load_system_paasta_config',
            autospec=True,
        ) as load_system_paasta_config_patch:
            load_system_paasta_config_patch.return_value.get_cluster = mock.Mock(
                return_value=self.fake_cluster)
            load_system_paasta_config_patch.return_value.get_sensu_host = mock.Mock(
                return_value=fake_sensu_host)
            load_system_paasta_config_patch.return_value.get_sensu_port = mock.Mock(
                return_value=fake_sensu_port)

            monitoring_tools.send_event(fake_service, fake_check_name,
                                        fake_monitoring_overrides, fake_status,
                                        fake_output, fake_soa_dir)

            get_team_patch.assert_called_once_with(
                fake_monitoring_overrides,
                fake_service,
                fake_soa_dir,
            )
            get_tip_patch.assert_called_once_with(fake_monitoring_overrides,
                                                  fake_service, fake_soa_dir)
            get_notification_email_patch.assert_called_once_with(
                fake_monitoring_overrides, fake_service, fake_soa_dir)
            get_irc_patch.assert_called_once_with(fake_monitoring_overrides,
                                                  fake_service, fake_soa_dir)
            get_page_patch.assert_called_once_with(fake_monitoring_overrides,
                                                   fake_service, fake_soa_dir)
            # Check name, runbook, status, output and team are expected
            # positionally; everything else as keyword arguments.
            pysensu_yelp_send_event_patch.assert_called_once_with(
                expected_check_name,
                expected_runbook,
                fake_status,
                fake_output,
                fake_team,
                sensu_host=fake_sensu_host,
                sensu_port=fake_sensu_port,
                **expected_kwargs)
            load_system_paasta_config_patch.return_value.get_cluster.assert_called_once_with(
            )
    def test_send_event(self):
        """Check the keyword arguments monitoring_tools.send_event hands to pysensu_yelp.

        All per-service lookup helpers are patched with fixed return values and
        the system config is stubbed with a fake cluster and Sensu endpoint.
        The test asserts how the lookups were called and that
        ``pysensu_yelp.send_event`` received exactly the expected keywords.
        """
        overrides = {}
        service = "fake_service"
        soa_dir = "/fake/soa/dir"
        check_name = "fake_check_name"
        status = "42"
        output = "The http port is not open"
        team = "fake_team"
        tip = "fake_tip"
        email = "fake@notify"
        irc = "#fake"
        slack = "#fake_slack"
        sensu_host = "fake_sensu_host"
        sensu_port = 12345
        self.fake_cluster = "fake_cluster"

        expected_kwargs = {
            # what is being reported
            "name": check_name,
            "status": status,
            "output": output,
            "source": "paasta-fake_cluster",
            # who gets told, and how
            "team": team,
            "page": True,
            "ticket": False,
            "project": None,
            "priority": None,
            "notification_email": email,
            "irc_channels": irc,
            "slack_channels": slack,
            # guidance attached to the alert
            "runbook": "http://y/paasta-troubleshooting",
            "tip": tip,
            "tags": [],
            "component": None,
            "description": None,
            # timing
            "check_every": "1m",
            "alert_after": "5m",
            "realert_every": -1,
            "ttl": None,
            # where the event is delivered
            "sensu_host": sensu_host,
            "sensu_port": sensu_port,
        }

        with mock.patch(
            "paasta_tools.monitoring_tools.get_team", return_value=team, autospec=True
        ) as team_lookup, mock.patch(
            "paasta_tools.monitoring_tools.get_tip", return_value=tip, autospec=True
        ) as tip_lookup, mock.patch(
            "paasta_tools.monitoring_tools.get_notification_email",
            return_value=email,
            autospec=True,
        ) as email_lookup, mock.patch(
            "paasta_tools.monitoring_tools.get_irc_channels",
            return_value=irc,
            autospec=True,
        ) as irc_lookup, mock.patch(
            "paasta_tools.monitoring_tools.get_slack_channels",
            return_value=slack,
            autospec=True,
        ) as slack_lookup, mock.patch(
            "paasta_tools.monitoring_tools.get_ticket", return_value=False, autospec=True
        ), mock.patch(
            "paasta_tools.monitoring_tools.get_project", return_value=None, autospec=True
        ), mock.patch(
            "paasta_tools.monitoring_tools.get_page", return_value=True, autospec=True
        ) as page_lookup, mock.patch(
            "paasta_tools.monitoring_tools.get_priority", return_value=None, autospec=True
        ), mock.patch(
            "paasta_tools.monitoring_tools.get_tags", return_value=[], autospec=True
        ), mock.patch(
            "paasta_tools.monitoring_tools.get_component", return_value=None, autospec=True
        ), mock.patch(
            "paasta_tools.monitoring_tools.get_description",
            return_value=None,
            autospec=True,
        ), mock.patch(
            "pysensu_yelp.send_event", autospec=True
        ) as sensu_send, mock.patch(
            "paasta_tools.monitoring_tools.load_system_paasta_config", autospec=True
        ) as load_config:
            # Stub the three system-config accessors with fixed values.
            system_config = load_config.return_value
            system_config.get_cluster = mock.Mock(return_value=self.fake_cluster)
            system_config.get_sensu_host = mock.Mock(return_value=sensu_host)
            system_config.get_sensu_port = mock.Mock(return_value=sensu_port)

            monitoring_tools.send_event(
                service, check_name, overrides, status, output, soa_dir
            )

            # Each of these lookups must have been made exactly once, with the
            # same (overrides, service, soa_dir) triple.
            for lookup in (
                team_lookup,
                tip_lookup,
                email_lookup,
                irc_lookup,
                slack_lookup,
                page_lookup,
            ):
                lookup.assert_called_once_with(overrides, service, soa_dir)

            sensu_send.assert_called_once_with(**expected_kwargs)
            system_config.get_cluster.assert_called_once_with()
Example #28
0
def main():
    """Remove Chronos jobs that are deployed but are no longer expected.

    The set to delete is every running job minus the jobs expected for this
    cluster and minus the temporary jobs that have not expired. Tasks and then
    jobs are cleaned up for that set. For each job removed successfully, an OK
    Sensu event is sent for its check. A summary is printed and the process
    exits with status 1 if any task or job cleanup failed.
    """
    args = parse_args()
    soa_dir = args.soa_dir

    client = chronos_tools.get_chronos_client(chronos_tools.load_chronos_config())
    cluster = utils.load_system_paasta_config().get_cluster()

    running_jobs = set(deployed_job_names(client))
    expected_service_jobs = set(
        chronos_tools.compose_job_id(*job)
        for job in chronos_tools.get_chronos_jobs_for_cluster(soa_dir=soa_dir)
    )

    all_tmp_jobs = set(filter_tmp_jobs(filter_paasta_jobs(running_jobs)))
    expired_tmp_jobs = set(
        filter_expired_tmp_jobs(client, all_tmp_jobs, cluster=cluster, soa_dir=soa_dir)
    )
    valid_tmp_jobs = all_tmp_jobs - expired_tmp_jobs

    to_delete = running_jobs - expected_service_jobs - valid_tmp_jobs

    # A response whose last element is an Exception counts as a failure.
    task_responses = cleanup_tasks(client, to_delete)
    task_successes, task_failures = [], []
    for outcome in task_responses:
        failed = isinstance(outcome[-1], Exception)
        (task_failures if failed else task_successes).append(outcome)

    job_responses = cleanup_jobs(client, to_delete)
    job_successes, job_failures = [], []
    for outcome in job_responses:
        if isinstance(outcome[-1], Exception):
            job_failures.append(outcome)
            continue

        job_successes.append(outcome)
        try:
            # The first element of a response is the job id.
            service, instance = chronos_tools.decompose_job_id(outcome[0])
            monitoring_tools.send_event(
                check_name=check_chronos_job_name(service, instance),
                service=service,
                overrides={},
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                output="This instance was removed and is no longer supposed to be scheduled.",
            )
        except InvalidJobNameError:
            # If we deleted some bogus job with a bogus jobid that could not be parsed,
            # Just move on, no need to send any kind of paasta event.
            pass

    if not to_delete:
        paasta_print('No Chronos Jobs to remove')
        return

    # Print one section per non-empty result list, in a fixed order.
    report_sections = (
        ("Successfully Removed Tasks (if any were running) for:", task_successes),
        ("Failed to Delete Tasks for:", task_failures),
        ("Successfully Removed Jobs:", job_successes),
        ("Failed to Delete Jobs:", job_failures),
    )
    for heading, outcomes in report_sections:
        if outcomes:
            paasta_print(format_list_output(heading, [job[0] for job in outcomes]))

    # Any failure makes the whole run exit non-zero.
    if job_failures or task_failures:
        sys.exit(1)