def send_event(name, instance, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param name: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    cluster = load_system_paasta_config().get_cluster()
    overrides = marathon_tools.load_marathon_service_config(
        name,
        instance,
        cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    # The cron job driving this check fires every 10s; tell sensu to expect
    # the same cadence.
    overrides['check_every'] = '10s'
    # Most setup_marathon_job failures are transient and will probably be
    # fixed eventually, so wait a while before alerting to suppress noise.
    overrides['alert_after'] = '10m'
    monitoring_tools.send_event(
        name,
        'setup_marathon_job.%s' % compose_job_id(name, instance),
        overrides,
        status,
        output,
        soa_dir,
    )
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event"""
    # This function assumes the input is a string like "mumble.main"
    overrides = marathon_tools.load_marathon_service_config(
        service=service,
        instance=namespace,
        cluster=cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    # Only default alert_after; the service's own setting wins if present.
    overrides.setdefault('alert_after', '2m')
    overrides['check_every'] = '1m'
    overrides['runbook'] = monitoring_tools.get_runbook(overrides, service, soa_dir=soa_dir)
    check_name = 'check_marathon_services_replication.%s' % compose_job_id(service, namespace)
    monitoring_tools.send_event(service, check_name, overrides, status, output, soa_dir)
    # Mirror the event into the paasta log stream for debugging.
    _log(
        service=service,
        line='Replication: %s' % output,
        component='monitoring',
        level='debug',
        cluster=cluster,
        instance=namespace,
    )
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event"""
    # This function assumes the input is a string like "mumble.main"
    # BUG FIX: previously soa_dir was accepted but not forwarded here, so
    # monitoring overrides were read from the default soa dir instead of the
    # one the caller asked for.  Deployments are not needed to read
    # monitoring config, so skip loading them (matches the sibling variant).
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service,
        namespace,
        cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    if "alert_after" not in monitoring_overrides:
        monitoring_overrides["alert_after"] = "2m"
    monitoring_overrides["check_every"] = "1m"
    monitoring_overrides["runbook"] = monitoring_tools.get_runbook(monitoring_overrides, service, soa_dir=soa_dir)
    check_name = "check_marathon_services_replication.%s" % compose_job_id(service, namespace)
    monitoring_tools.send_event(service, check_name, monitoring_overrides, status, output, soa_dir)
    # Mirror the event into the paasta log stream for debugging.
    _log(
        service=service,
        line="Replication: %s" % output,
        component="monitoring",
        level="debug",
        cluster=cluster,
        instance=namespace,
    )
def notify_service_owners(
    services: Mapping[str, Sequence[EvictedPod]],
    soa_dir: str,
    dry_run: bool,
) -> None:
    """Fire one pod-eviction sensu event per service, listing its evicted pods.

    :param services: evicted pods keyed by service name
    :param soa_dir: the service configuration directory
    :param dry_run: when True, only log what would have been sent
    """
    check_overrides = {
        "page": False,
        "alert_after": "0m",
        "realert_every": 1,
        "tip": "Pods can be Evicted if they go over the allowed quota for a given resource. Check the Eviction message to figure out which resource quota was breached",
    }
    for service, evicted_pods in services.items():
        check_name = f"pod-eviction.{service}"
        check_output = "The following pods have been evicted and will be removed from the cluster:\n"
        for pod in evicted_pods:
            check_output += f"- {pod.podname}: {pod.eviction_msg}\n"
        if dry_run:
            log.info(f"Would have notified owners for service {service}")
            continue
        log.info(f"Notifying owners for service {service}")
        send_event(
            service,
            check_name,
            check_overrides,
            Status.CRITICAL,
            check_output,
            soa_dir,
        )
def send_sensu_bounce_keepalive(service, instance, cluster, soa_dir):
    """Send a Sensu event with a special ``ttl``, to let Sensu know that the
    everything is fine. This event is **not** fired when the bounce is in
    progress. If the bounce goes on for too long, this the ``ttl`` will
    expire and Sensu will emit a new event saying that this one didn't check
    in within the expected time-to-live."""
    overrides = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    # Sensu currently emits events for expired ttl checks every 30s
    overrides['check_every'] = '30s'
    overrides['alert_after'] = '2m'
    overrides['runbook'] = 'http://y/paasta-troubleshooting'
    overrides['tip'] = (
        "Check out `paasta logs`. If the bounce hasn't made progress, "
        "it may mean that the new version isn't healthy."
    )
    # Dogfooding this alert till I'm comfortable it doesn't spam people
    overrides['team'] = 'noop'
    overrides['notification_email'] = '*****@*****.**'
    monitoring_tools.send_event(
        service=service,
        check_name='paasta_bounce_progress.%s' % compose_job_id(service, instance),
        overrides=overrides,
        status=pysensu_yelp.Status.OK,
        output="The bounce is in a steady state",
        soa_dir=soa_dir,
        ttl='1h',
    )
def send_sensu_bounce_keepalive(service, instance, cluster, soa_dir):
    """Send a Sensu event with a special ``ttl``, to let Sensu know that the
    everything is fine. This event is **not** fired when the bounce is in
    progress. If the bounce goes on for too long, this the ``ttl`` will
    expire and Sensu will emit a new event saying that this one didn't check
    in within the expected time-to-live."""
    ttl = '1h'
    # BUG FIX: soa_dir was accepted but never forwarded here, so monitoring
    # config was read from the default soa dir instead of the caller's.
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    # Sensu currently emits events for expired ttl checks every 30s
    monitoring_overrides['check_every'] = '30s'
    monitoring_overrides['alert_after'] = '2m'
    monitoring_overrides['runbook'] = 'http://y/paasta-troubleshooting'
    monitoring_overrides['tip'] = ("Check out `paasta logs`. If the bounce hasn't made progress, "
                                   "it may mean that the new version isn't healthy.")
    # Dogfooding this alert till I'm comfortable it doesn't spam people
    monitoring_overrides['team'] = 'noop'
    monitoring_overrides['notification_email'] = '*****@*****.**'
    monitoring_tools.send_event(
        service=service,
        check_name='paasta_bounce_progress.%s' % compose_job_id(service, instance),
        overrides=monitoring_overrides,
        status=pysensu_yelp.Status.OK,
        output="The bounce is in a steady state",
        soa_dir=soa_dir,
        ttl=ttl,
    )
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event"""
    # This function assumes the input is a string like "mumble.main"
    # BUG FIX: forward soa_dir so monitoring config is read from the
    # requested soa dir (it was previously read from the default), and skip
    # loading deployments since they are not needed for monitoring config.
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service,
        namespace,
        cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    if 'alert_after' not in monitoring_overrides:
        monitoring_overrides['alert_after'] = '2m'
    monitoring_overrides['check_every'] = '1m'
    monitoring_overrides['runbook'] = monitoring_tools.get_runbook(monitoring_overrides, service, soa_dir=soa_dir)
    check_name = 'check_marathon_services_replication.%s' % compose_job_id(service, namespace)
    monitoring_tools.send_event(service, check_name, monitoring_overrides, status, output, soa_dir)
    # Mirror the event into the paasta log stream for debugging.
    _log(
        service=service,
        line='Replication: %s' % output,
        component='monitoring',
        level='debug',
        cluster=cluster,
        instance=namespace,
    )
def perform_security_check(args):
    """It runs a few security tests, checks the return code and prints output
    that should help in fixing failures. If you are at Yelp, please visit
    https://confluence.yelpcorp.com/display/PAASTA/PaaSTA+security-check+explained
    to learn more.

    :param args: service - the name of the service; commit - upstream git commit.
    :return: 0 if the security-check passed, non-zero if it failed.
    """
    command = load_system_paasta_config().get_security_check_command()
    # No command configured means there is nothing to run for this step.
    if not command:
        paasta_print("Nothing to be executed during the security-check step")
        return 0
    ret_code, output = _run(command, timeout=300, stream=True)
    if ret_code != 0:
        paasta_print(
            "The security-check failed. Please visit y/security-check-runbook to learn how to fix it ("
            "including whitelisting safe versions of packages).")
    # Report the result to sensu either way; only a non-zero exit is CRITICAL.
    if ret_code == 0:
        sensu_status = pysensu_yelp.Status.OK
    else:
        sensu_status = pysensu_yelp.Status.CRITICAL
    send_event(
        service=args.service,
        check_name='%s.security_check' % args.service,
        overrides={'page': False, 'ticket': True},
        status=sensu_status,
        output=output,
        soa_dir=DEFAULT_SOA_DIR,
    )
    return ret_code
def send_event(service, instance, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    cluster = load_system_paasta_config().get_cluster()
    overrides = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    ).get_monitoring()
    # The cron job driving this check fires every 10s; tell sensu to expect
    # the same cadence.
    overrides['check_every'] = '10s'
    # Most deploy_chronos_jobs failures are transient and will probably be
    # fixed eventually, so wait a while before alerting to suppress noise.
    overrides['alert_after'] = '10m'
    monitoring_tools.send_event(
        service=service,
        check_name='setup_chronos_job.%s' % compose_job_id(service, instance),
        overrides=overrides,
        status=status,
        output=output,
        soa_dir=soa_dir,
    )
def send_event(service, instance, monitoring_overrides, soa_dir, status_code, message):
    """Emit a check_chronos_jobs sensu event for one service instance.

    :param service: the service name the event is about
    :param instance: the instance of the service the event is about
    :param monitoring_overrides: monitoring settings to pass through to sensu
    :param soa_dir: the service configuration directory
    :param status_code: the sensu status to emit
    :param message: the event output text
    """
    monitoring_tools.send_event(
        service=service,
        check_name=compose_check_name_for_service_instance('check_chronos_jobs', service, instance),
        overrides=monitoring_overrides,
        status=status_code,
        output=message,
        soa_dir=soa_dir,
    )
def test_send_event_sensu_host_is_None(self):
    # When the configured sensu host is None, send_event must not emit
    # anything at all via pysensu_yelp.
    fake_service = 'fake_service'
    fake_monitoring_overrides = {}
    fake_check_name = 'fake_check_name'
    fake_status = '42'
    fake_output = 'The http port is not open'
    fake_soa_dir = '/fake/soa/dir'
    self.fake_cluster = 'fake_cluster'
    fake_sensu_port = 12345
    # Patch out every lookup helper so send_event does not read real config.
    with mock.patch(
        "paasta_tools.monitoring_tools.get_team", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_tip", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_notification_email", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_irc_channels", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_ticket", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_project", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_page", autospec=True,
    ), mock.patch(
        "pysensu_yelp.send_event", autospec=True,
    ) as pysensu_yelp_send_event_patch, mock.patch(
        'paasta_tools.monitoring_tools.load_system_paasta_config', autospec=True,
    ) as load_system_paasta_config_patch:
        # Configure: no sensu host, but a valid port.
        load_system_paasta_config_patch.return_value.get_sensu_host = mock.Mock(
            return_value=None)
        load_system_paasta_config_patch.return_value.get_sensu_port = mock.Mock(
            return_value=fake_sensu_port)
        monitoring_tools.send_event(
            fake_service,
            fake_check_name,
            fake_monitoring_overrides,
            fake_status,
            fake_output,
            fake_soa_dir,
        )
        # No host configured -> nothing should have been sent.
        assert pysensu_yelp_send_event_patch.call_count == 0
def delete_app(app_id, client, soa_dir):
    """Deletes a marathon app safely and logs to notify the user that it
    happened.

    :param app_id: the marathon app id to delete
    :param client: a marathon client
    :param soa_dir: the service configuration directory
    """
    # ``log.warn`` is a deprecated alias of ``log.warning`` (removed in
    # recent Python versions), so use the canonical name.
    log.warning("%s appears to be old; attempting to delete" % app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
            # Resolve any outstanding alerts for the removed instance.
            send_event(
                service=service,
                check_name='check_marathon_services_replication.%s' % short_app_id,
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                overrides={},
                output="This instance was removed and is no longer running",
            )
            send_event(
                service=service,
                check_name='setup_marathon_job.%s' % short_app_id,
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                overrides={},
                output="This instance was removed and is no longer running",
            )
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=cluster,
                instance=instance,
                line=log_line,
            )
    except IOError:
        # bounce_lock_zookeeper raises IOError when the lock is held; a
        # bounce is in progress, so leave this app alone.
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                # Reuse the cluster we already loaded above; the second
                # load_system_paasta_config() call was redundant.
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def send_sensu_event(instance, oom_events, args):
    """Report OOM-kill activity for one instance to sensu.

    :param instance: InstanceConfig
    :param oom_events: a list of OOMEvents
    """
    overrides = instance.get_monitoring()
    status = compose_sensu_status(
        instance=instance,
        oom_events=oom_events,
        is_check_enabled=overrides.get("check_oom_events", True),
    )
    # OOM kills never page or ticket; they only surface with the runbook tip.
    overrides.update({
        "page": False,
        "ticket": False,
        "alert_after": "0m",
        "realert_every": args.realert_every,
        "runbook": "y/check-oom-events",
        "tip": "Try bumping the memory limit past %dMB" % instance.get_mem(),
    })
    check_name = compose_check_name_for_service_instance(
        "oom-killer", instance.service, instance.instance)
    return monitoring_tools.send_event(
        service=instance.service,
        check_name=check_name,
        overrides=overrides,
        status=status[0],
        output=status[1],
        soa_dir=instance.soa_dir,
    )
def update_check_status(service, output, status):
    """Push the long_running_spark_jobs check for ``service`` to sensu.

    :param service: the service the long-running-job check is about
    :param output: the event output text
    :param status: the sensu status to emit
    """
    send_event(
        service=service,
        check_name=f"long_running_spark_jobs.{service}",
        overrides={
            "page": False,
            "alert_after": 0,
            "tip": "Ask the user to check the job UI and terminate the job if appropriate.",
            "runbook": "http://y/spark-debug",
            "ticket": True,
        },
        status=status,
        output=output,
        soa_dir=DEFAULT_SOA_DIR,
    )
def update_check_status(service, output, status):
    """Push the long_running_spark_jobs check for ``service`` to sensu.

    :param service: the service the long-running-job check is about
    :param output: the event output text
    :param status: the sensu status to emit
    """
    send_event(
        service=service,
        check_name=f'long_running_spark_jobs.{service}',
        overrides={
            'page': False,
            'alert_after': 0,
            'tip': 'Ask the user to check the job UI and terminate the job if appropriate.',
            'runbook': 'http://y/spark-debug',
            'ticket': True,
        },
        status=status,
        output=output,
        soa_dir=DEFAULT_SOA_DIR,
    )
def send_sensu_event(instance, oom_events, args):
    """Report OOM-kill activity for one instance to sensu (dogfood config).

    :param instance: InstanceConfig
    :param oom_events: a list of OOMEvents
    """
    overrides = instance.get_monitoring()
    status = compose_sensu_status(
        instance=instance,
        oom_events=oom_events,
        is_check_enabled=overrides.get('check_oom_events', True),
    )
    overrides['team'] = 'noop'  # TODO: remove after testing
    overrides['page'] = False
    overrides['ticket'] = False
    overrides['alert_after'] = '0m'
    overrides['realert_every'] = args.realert_every
    overrides['notification_email'] = False  # TODO: remove after testing
    overrides['irc_channels'] = ['#oom-test']  # TODO: remove after testing
    overrides['runbook'] = 'http://y/none'  # TODO: needs a link
    overrides['tip'] = 'Try bumping the memory limit past %dMB' % instance.get_mem()
    check_name = compose_check_name_for_service_instance(
        'oom-killer',
        instance.service,
        instance.instance,
    )
    return monitoring_tools.send_event(
        service=instance.service,
        check_name=check_name,
        overrides=overrides,
        status=status[0],
        output=status[1],
        soa_dir=instance.soa_dir,
    )
def send_sensu_event(instance, oom_events, args):
    """Report OOM-kill activity for one instance to sensu.

    :param instance: InstanceConfig
    :param oom_events: a list of OOMEvents
    """
    overrides = instance.get_monitoring()
    status = compose_sensu_status(
        instance=instance,
        oom_events=oom_events,
        is_check_enabled=overrides.get('check_oom_events', True),
    )
    # OOM kills never page or ticket; they only surface with the runbook tip.
    overrides['page'] = False
    overrides['ticket'] = False
    overrides['alert_after'] = '0m'
    overrides['realert_every'] = args.realert_every
    overrides['runbook'] = 'y/check-oom-events'
    overrides['tip'] = 'Try bumping the memory limit past %dMB' % instance.get_mem()
    check_name = compose_check_name_for_service_instance(
        'oom-killer',
        instance.service,
        instance.instance,
    )
    return monitoring_tools.send_event(
        service=instance.service,
        check_name=check_name,
        overrides=overrides,
        status=status[0],
        output=status[1],
        soa_dir=instance.soa_dir,
    )
def test_send_event_sensu_host_is_None(self):
    """send_event must emit nothing when no sensu host is configured.

    Rewritten with chained ``mock.patch`` context managers:
    ``contextlib.nested`` was deprecated in Python 2.7 and removed in
    Python 3, and this file already uses the chained form elsewhere.
    """
    fake_service = 'fake_service'
    fake_monitoring_overrides = {}
    fake_check_name = 'fake_check_name'
    fake_status = '42'
    fake_output = 'The http port is not open'
    fake_soa_dir = '/fake/soa/dir'
    self.fake_cluster = 'fake_cluster'
    fake_sensu_port = 12345
    # Patch out every lookup helper so send_event does not read real config.
    with mock.patch(
        "paasta_tools.monitoring_tools.get_team", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_tip", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_notification_email", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_irc_channels", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_ticket", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_project", autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_page", autospec=True,
    ), mock.patch(
        "pysensu_yelp.send_event", autospec=True,
    ) as pysensu_yelp_send_event_patch, mock.patch(
        'paasta_tools.monitoring_tools.load_system_paasta_config', autospec=True,
    ) as load_system_paasta_config_patch:
        # Configure: no sensu host, but a valid port.
        load_system_paasta_config_patch.return_value.get_sensu_host = mock.Mock(return_value=None)
        load_system_paasta_config_patch.return_value.get_sensu_port = mock.Mock(return_value=fake_sensu_port)
        monitoring_tools.send_event(
            fake_service,
            fake_check_name,
            fake_monitoring_overrides,
            fake_status,
            fake_output,
            fake_soa_dir,
        )
        # No host configured -> nothing should have been sent.
        assert pysensu_yelp_send_event_patch.call_count == 0
def send_event(name, instance, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param name: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    cluster = load_system_paasta_config().get_cluster()
    # BUG FIX: soa_dir was accepted but not forwarded to
    # load_marathon_service_config, so monitoring overrides came from the
    # default soa dir rather than the one the caller passed in.
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        name, instance, cluster, soa_dir=soa_dir, load_deployments=False
    ).get_monitoring()
    # In order to let sensu know how often to expect this check to fire,
    # we need to set the ``check_every`` to the frequency of our cron job,
    # which is 10s.
    monitoring_overrides["check_every"] = "10s"
    # Most setup_marathon_job failures are transient and represent issues
    # that will probably be fixed eventually, so we set an alert_after
    # to suppress extra noise
    monitoring_overrides["alert_after"] = "10m"
    check_name = "setup_marathon_job.%s" % compose_job_id(name, instance)
    monitoring_tools.send_event(name, check_name, monitoring_overrides, status, output, soa_dir)
def send_event(instance_config, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param instance_config: an instance of MarathonServiceConfig
    :param status: The status to emit for this event
    :param output: The output to emit for this event"""
    # This function assumes the input is a string like "mumble.main"
    overrides = instance_config.get_monitoring()
    # Only default alert_after; the service's own setting wins if present.
    overrides.setdefault('alert_after', '2m')
    overrides['check_every'] = '1m'
    overrides['runbook'] = monitoring_tools.get_runbook(
        overrides,
        instance_config.service,
        soa_dir=instance_config.soa_dir,
    )
    monitoring_tools.send_event(
        service=instance_config.service,
        check_name='check_marathon_services_replication.%s' % instance_config.job_id,
        overrides=overrides,
        status=status,
        output=output,
        soa_dir=instance_config.soa_dir,
        cluster=instance_config.cluster,
    )
    # Mirror the event into the paasta log stream for debugging.
    _log(
        service=instance_config.service,
        line='Replication: %s' % output,
        component='monitoring',
        level='debug',
        cluster=instance_config.cluster,
        instance=instance_config.instance,
    )
def send_sensu_event(instance, oom_events, args):
    """Report OOM-kill activity for one instance to sensu.

    :param instance: InstanceConfig
    :param oom_events: a list of OOMEvents
    """
    overrides = instance.get_monitoring()
    status = compose_sensu_status(
        instance=instance,
        oom_events=oom_events,
        is_check_enabled=overrides.get("check_oom_events", True),
        alert_threshold=args.alert_threshold,
        check_interval=args.check_interval,
    )
    if not status:
        # Nothing to report for this instance.
        return
    # Render the memory limit as "NMB" when numeric; otherwise show the raw
    # value unchanged.
    memory_limit = instance.get_mem()
    try:
        memory_limit_str = f"{int(memory_limit)}MB"
    except ValueError:
        memory_limit_str = memory_limit
    overrides["page"] = False
    overrides["alert_after"] = "0m"
    overrides["realert_every"] = args.realert_every
    overrides["runbook"] = "y/check-oom-events"
    overrides["tip"] = (
        "Follow the runbook to investigate and rightsize memory usage "
        f"(curr: {memory_limit_str})"
    )
    check_name = compose_check_name_for_service_instance(
        "oom-killer", instance.service, instance.instance)
    return monitoring_tools.send_event(
        service=instance.service,
        check_name=check_name,
        overrides=overrides,
        status=status[0],
        output=status[1],
        soa_dir=instance.soa_dir,
        dry_run=args.dry_run,
    )
def send_event(chronos_job_config, status_code, output):
    """Compose monitoring overrides and send the event to sensu.

    :param chronos_job_config: an instance of ChronosJobConfig
    :param status_code: Sensu status code
    :param output: An event message
    """
    overrides = compose_monitoring_overrides_for_service(chronos_job_config)
    check_name = check_chronos_job_name(
        chronos_job_config.service,
        chronos_job_config.instance,
    )
    # Annotate the message with the realert cadence before sending.
    annotated_output = add_realert_status(output, overrides.get('realert_every'))
    return monitoring_tools.send_event(
        service=chronos_job_config.service,
        check_name=check_name,
        overrides=overrides,
        status=status_code,
        output=annotated_output,
        soa_dir=chronos_job_config.soa_dir,
    )
def test_send_event(self):
    # Happy path: send_event should resolve each monitoring setting via its
    # helper and forward the lot to pysensu_yelp.send_event exactly once.
    fake_service = 'fake_service'
    fake_monitoring_overrides = {}
    fake_check_name = 'fake_check_name'
    fake_status = '42'
    fake_output = 'The http port is not open'
    fake_team = 'fake_team'
    fake_tip = 'fake_tip'
    fake_notification_email = 'fake@notify'
    fake_irc = '#fake'
    fake_soa_dir = '/fake/soa/dir'
    self.fake_cluster = 'fake_cluster'
    fake_sensu_host = 'fake_sensu_host'
    fake_sensu_port = 12345
    expected_runbook = 'http://y/paasta-troubleshooting'
    expected_check_name = fake_check_name
    # The keyword arguments we expect pysensu_yelp.send_event to receive.
    expected_kwargs = {
        'tip': fake_tip,
        'notification_email': fake_notification_email,
        'irc_channels': fake_irc,
        'project': None,
        'ticket': False,
        'page': True,
        'alert_after': '5m',
        'check_every': '1m',
        'realert_every': -1,
        'source': 'paasta-fake_cluster',
        'ttl': None,
    }
    # NOTE(review): contextlib.nested is Python-2 only (removed in py3);
    # this variant predates the chained mock.patch style.
    with contextlib.nested(
        mock.patch(
            "paasta_tools.monitoring_tools.get_team",
            return_value=fake_team,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_tip",
            return_value=fake_tip,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_notification_email",
            return_value=fake_notification_email,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_irc_channels",
            return_value=fake_irc,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_ticket",
            return_value=False,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_project",
            return_value=None,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_page",
            return_value=True,
            autospec=True,
        ),
        mock.patch("pysensu_yelp.send_event", autospec=True),
        mock.patch('paasta_tools.monitoring_tools.load_system_paasta_config', autospec=True),
    ) as (
        get_team_patch,
        get_tip_patch,
        get_notification_email_patch,
        get_irc_patch,
        get_ticket_patch,
        get_project_patch,
        get_page_patch,
        pysensu_yelp_send_event_patch,
        load_system_paasta_config_patch,
    ):
        load_system_paasta_config_patch.return_value.get_cluster = mock.Mock(return_value=self.fake_cluster)
        load_system_paasta_config_patch.return_value.get_sensu_host = mock.Mock(return_value=fake_sensu_host)
        load_system_paasta_config_patch.return_value.get_sensu_port = mock.Mock(return_value=fake_sensu_port)
        monitoring_tools.send_event(
            fake_service,
            fake_check_name,
            fake_monitoring_overrides,
            fake_status,
            fake_output,
            fake_soa_dir
        )
        # Every helper must have been consulted exactly once with the
        # overrides/service/soa_dir triple.
        get_team_patch.assert_called_once_with(
            fake_monitoring_overrides,
            fake_service,
            fake_soa_dir,
        )
        get_tip_patch.assert_called_once_with(fake_monitoring_overrides, fake_service, fake_soa_dir)
        get_notification_email_patch.assert_called_once_with(fake_monitoring_overrides, fake_service, fake_soa_dir)
        get_irc_patch.assert_called_once_with(fake_monitoring_overrides, fake_service, fake_soa_dir)
        get_page_patch.assert_called_once_with(fake_monitoring_overrides, fake_service, fake_soa_dir)
        pysensu_yelp_send_event_patch.assert_called_once_with(
            expected_check_name,
            expected_runbook,
            fake_status,
            fake_output,
            fake_team,
            sensu_host=fake_sensu_host,
            sensu_port=fake_sensu_port,
            **expected_kwargs
        )
        load_system_paasta_config_patch.return_value.get_cluster.assert_called_once_with()
def test_send_event(self):
    # Happy path: send_event should resolve each monitoring setting via its
    # helper and forward the lot to pysensu_yelp.send_event exactly once.
    fake_service = 'fake_service'
    fake_monitoring_overrides = {}
    fake_check_name = 'fake_check_name'
    fake_status = '42'
    fake_output = 'The http port is not open'
    fake_team = 'fake_team'
    fake_tip = 'fake_tip'
    fake_notification_email = 'fake@notify'
    fake_irc = '#fake'
    fake_soa_dir = '/fake/soa/dir'
    self.fake_cluster = 'fake_cluster'
    fake_sensu_host = 'fake_sensu_host'
    fake_sensu_port = 12345
    expected_runbook = 'http://y/paasta-troubleshooting'
    expected_check_name = fake_check_name
    # The keyword arguments we expect pysensu_yelp.send_event to receive.
    expected_kwargs = {
        'tip': fake_tip,
        'notification_email': fake_notification_email,
        'irc_channels': fake_irc,
        'project': None,
        'ticket': False,
        'page': True,
        'alert_after': '5m',
        'check_every': '1m',
        'realert_every': -1,
        'source': 'paasta-fake_cluster',
        'ttl': None,
    }
    # NOTE(review): contextlib.nested is Python-2 only (removed in py3);
    # this variant predates the chained mock.patch style.
    with contextlib.nested(
        mock.patch(
            "paasta_tools.monitoring_tools.get_team",
            return_value=fake_team,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_tip",
            return_value=fake_tip,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_notification_email",
            return_value=fake_notification_email,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_irc_channels",
            return_value=fake_irc,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_ticket",
            return_value=False,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_project",
            return_value=None,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.monitoring_tools.get_page",
            return_value=True,
            autospec=True,
        ),
        mock.patch("pysensu_yelp.send_event", autospec=True),
        mock.patch(
            'paasta_tools.monitoring_tools.load_system_paasta_config',
            autospec=True),
    ) as (
        get_team_patch,
        get_tip_patch,
        get_notification_email_patch,
        get_irc_patch,
        get_ticket_patch,
        get_project_patch,
        get_page_patch,
        pysensu_yelp_send_event_patch,
        load_system_paasta_config_patch,
    ):
        load_system_paasta_config_patch.return_value.get_cluster = mock.Mock(
            return_value=self.fake_cluster)
        load_system_paasta_config_patch.return_value.get_sensu_host = mock.Mock(
            return_value=fake_sensu_host)
        load_system_paasta_config_patch.return_value.get_sensu_port = mock.Mock(
            return_value=fake_sensu_port)
        monitoring_tools.send_event(fake_service, fake_check_name,
                                    fake_monitoring_overrides, fake_status,
                                    fake_output, fake_soa_dir)
        # Every helper must have been consulted exactly once with the
        # overrides/service/soa_dir triple.
        get_team_patch.assert_called_once_with(
            fake_monitoring_overrides,
            fake_service,
            fake_soa_dir,
        )
        get_tip_patch.assert_called_once_with(fake_monitoring_overrides,
                                              fake_service, fake_soa_dir)
        get_notification_email_patch.assert_called_once_with(
            fake_monitoring_overrides, fake_service, fake_soa_dir)
        get_irc_patch.assert_called_once_with(fake_monitoring_overrides,
                                              fake_service, fake_soa_dir)
        get_page_patch.assert_called_once_with(fake_monitoring_overrides,
                                               fake_service, fake_soa_dir)
        pysensu_yelp_send_event_patch.assert_called_once_with(
            expected_check_name,
            expected_runbook,
            fake_status,
            fake_output,
            fake_team,
            sensu_host=fake_sensu_host,
            sensu_port=fake_sensu_port,
            **expected_kwargs)
        load_system_paasta_config_patch.return_value.get_cluster.assert_called_once_with(
        )
def test_send_event(self):
    # Happy path: send_event should resolve every monitoring setting via its
    # helper and forward them, as keyword arguments, to
    # pysensu_yelp.send_event exactly once.
    fake_service = "fake_service"
    fake_monitoring_overrides = {}
    fake_check_name = "fake_check_name"
    fake_status = "42"
    fake_output = "The http port is not open"
    fake_team = "fake_team"
    fake_tip = "fake_tip"
    fake_notification_email = "fake@notify"
    fake_irc = "#fake"
    fake_slack = "#fake_slack"
    fake_soa_dir = "/fake/soa/dir"
    self.fake_cluster = "fake_cluster"
    fake_sensu_host = "fake_sensu_host"
    fake_sensu_port = 12345
    expected_runbook = "http://y/paasta-troubleshooting"
    expected_check_name = fake_check_name
    # The full keyword-argument set we expect pysensu_yelp.send_event to get.
    expected_kwargs = {
        "name": expected_check_name,
        "runbook": expected_runbook,
        "status": fake_status,
        "output": fake_output,
        "team": fake_team,
        "page": True,
        "tip": fake_tip,
        "notification_email": fake_notification_email,
        "check_every": "1m",
        "realert_every": -1,
        "alert_after": "5m",
        "irc_channels": fake_irc,
        "slack_channels": fake_slack,
        "ticket": False,
        "project": None,
        "priority": None,
        "source": "paasta-fake_cluster",
        "tags": [],
        "ttl": None,
        "sensu_host": fake_sensu_host,
        "sensu_port": fake_sensu_port,
        "component": None,
        "description": None,
    }
    # Patch out every lookup helper so send_event does not read real config.
    with mock.patch(
        "paasta_tools.monitoring_tools.get_team",
        return_value=fake_team,
        autospec=True,
    ) as get_team_patch, mock.patch(
        "paasta_tools.monitoring_tools.get_tip",
        return_value=fake_tip,
        autospec=True,
    ) as get_tip_patch, mock.patch(
        "paasta_tools.monitoring_tools.get_notification_email",
        return_value=fake_notification_email,
        autospec=True,
    ) as get_notification_email_patch, mock.patch(
        "paasta_tools.monitoring_tools.get_irc_channels",
        return_value=fake_irc,
        autospec=True,
    ) as get_irc_patch, mock.patch(
        "paasta_tools.monitoring_tools.get_slack_channels",
        return_value=fake_slack,
        autospec=True,
    ) as get_slack_patch, mock.patch(
        "paasta_tools.monitoring_tools.get_ticket",
        return_value=False,
        autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_project",
        return_value=None,
        autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_page", return_value=True, autospec=True
    ) as get_page_patch, mock.patch(
        "paasta_tools.monitoring_tools.get_priority",
        return_value=None,
        autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_tags", return_value=[], autospec=True
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_component",
        return_value=None,
        autospec=True,
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_description",
        return_value=None,
        autospec=True,
    ), mock.patch(
        "pysensu_yelp.send_event", autospec=True
    ) as pysensu_yelp_send_event_patch, mock.patch(
        "paasta_tools.monitoring_tools.load_system_paasta_config", autospec=True
    ) as load_system_paasta_config_patch:
        load_system_paasta_config_patch.return_value.get_cluster = mock.Mock(
            return_value=self.fake_cluster
        )
        load_system_paasta_config_patch.return_value.get_sensu_host = mock.Mock(
            return_value=fake_sensu_host
        )
        load_system_paasta_config_patch.return_value.get_sensu_port = mock.Mock(
            return_value=fake_sensu_port
        )
        monitoring_tools.send_event(
            fake_service,
            fake_check_name,
            fake_monitoring_overrides,
            fake_status,
            fake_output,
            fake_soa_dir,
        )
        # Every helper must have been consulted exactly once with the
        # overrides/service/soa_dir triple.
        get_team_patch.assert_called_once_with(
            fake_monitoring_overrides, fake_service, fake_soa_dir
        )
        get_tip_patch.assert_called_once_with(
            fake_monitoring_overrides, fake_service, fake_soa_dir
        )
        get_notification_email_patch.assert_called_once_with(
            fake_monitoring_overrides, fake_service, fake_soa_dir
        )
        get_irc_patch.assert_called_once_with(
            fake_monitoring_overrides, fake_service, fake_soa_dir
        )
        get_slack_patch.assert_called_once_with(
            fake_monitoring_overrides, fake_service, fake_soa_dir
        )
        get_page_patch.assert_called_once_with(
            fake_monitoring_overrides, fake_service, fake_soa_dir
        )
        pysensu_yelp_send_event_patch.assert_called_once_with(**expected_kwargs)
        load_system_paasta_config_patch.return_value.get_cluster.assert_called_once_with()
def main():
    # Entry point: delete chronos jobs (and their running tasks) that are no
    # longer configured, then report the results.
    args = parse_args()
    soa_dir = args.soa_dir
    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)
    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    # Jobs currently deployed vs. jobs that should exist per soa configs.
    running_jobs = set(deployed_job_names(client))
    expected_service_jobs = {chronos_tools.compose_job_id(*job)
                             for job in chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)}
    # Temporary (rerun) jobs are kept while still valid; expired ones go too.
    all_tmp_jobs = set(filter_tmp_jobs(filter_paasta_jobs(running_jobs)))
    expired_tmp_jobs = set(filter_expired_tmp_jobs(client, all_tmp_jobs, cluster=cluster, soa_dir=soa_dir))
    valid_tmp_jobs = all_tmp_jobs - expired_tmp_jobs
    to_delete = running_jobs - expected_service_jobs - valid_tmp_jobs
    # First kill any running tasks of the doomed jobs, partitioning results
    # into successes and failures (a failure carries an Exception last).
    task_responses = cleanup_tasks(client, to_delete)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)
    # Then delete the jobs themselves.
    job_responses = cleanup_jobs(client, to_delete)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)
            # NOTE(review): indentation reconstructed from a collapsed
            # source — the OK event appears to be sent only for jobs that
            # were successfully removed; confirm against upstream.
            try:
                (service, instance) = chronos_tools.decompose_job_id(response[0])
                monitoring_tools.send_event(
                    check_name=check_chronos_job_name(service, instance),
                    service=service,
                    overrides={},
                    soa_dir=soa_dir,
                    status=pysensu_yelp.Status.OK,
                    output="This instance was removed and is no longer supposed to be scheduled.",
                )
            except InvalidJobNameError:
                # If we deleted some bogus job with a bogus jobid that could not be parsed,
                # Just move on, no need to send any kind of paasta event.
                pass
    if len(to_delete) == 0:
        paasta_print('No Chronos Jobs to remove')
    else:
        if len(task_successes) > 0:
            paasta_print(format_list_output(
                "Successfully Removed Tasks (if any were running) for:",
                [job[0] for job in task_successes],
            ))
        # if there are any failures, print and exit appropriately
        if len(task_failures) > 0:
            paasta_print(format_list_output("Failed to Delete Tasks for:",
                                            [job[0] for job in task_failures]))
        if len(job_successes) > 0:
            paasta_print(format_list_output("Successfully Removed Jobs:",
                                            [job[0] for job in job_successes]))
        # if there are any failures, print and exit appropriately
        if len(job_failures) > 0:
            paasta_print(format_list_output("Failed to Delete Jobs:",
                                            [job[0] for job in job_failures]))
        if len(job_failures) > 0 or len(task_failures) > 0:
            sys.exit(1)