def send_event(service, namespace, cluster, soa_dir, status, output):
    """Emit a replication-check event to sensu and record it in the service log.

    Monitoring settings are taken from the marathon config of the given
    service/instance, loaded from ``soa_dir`` with ``load_deployments=False``.
    ``check_every`` and ``runbook`` are always overwritten; ``alert_after``
    only receives a default when the config does not already set it.

    :param service: The service name the event is about
    :param namespace: The namespace (instance) of the service the event is about
    :param cluster: The cluster whose marathon config is loaded; also recorded
                    on the log line
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    service_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=namespace,
        cluster=cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    )
    overrides = service_config.get_monitoring()
    if 'alert_after' not in overrides:
        overrides['alert_after'] = '2m'
    overrides['check_every'] = '1m'
    overrides['runbook'] = monitoring_tools.get_runbook(overrides, service, soa_dir=soa_dir)

    job_id = compose_job_id(service, namespace)
    monitoring_tools.send_event(
        service,
        'check_marathon_services_replication.%s' % job_id,
        overrides,
        status,
        output,
        soa_dir,
    )
    _log(
        service=service,
        line='Replication: %s' % output,
        component='monitoring',
        level='debug',
        cluster=cluster,
        instance=namespace,
    )
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param cluster: The cluster whose marathon config supplies the monitoring
                    settings; also recorded on the log line
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    # Read the monitoring settings from the caller's soa_dir, the same
    # directory used below for the runbook lookup and the event itself,
    # instead of silently falling back to the loader's default directory.
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service, namespace, cluster, soa_dir=soa_dir,
    ).get_monitoring()
    # Only default alert_after; check_every and runbook are always overwritten.
    if 'alert_after' not in monitoring_overrides:
        monitoring_overrides['alert_after'] = '2m'
    monitoring_overrides['check_every'] = '1m'
    monitoring_overrides['runbook'] = monitoring_tools.get_runbook(monitoring_overrides, service, soa_dir=soa_dir)

    check_name = 'check_marathon_services_replication.%s' % compose_job_id(service, namespace)
    monitoring_tools.send_event(service, check_name, monitoring_overrides, status, output, soa_dir)
    _log(
        service=service,
        line='Replication: %s' % output,
        component='monitoring',
        level='debug',
        cluster=cluster,
        instance=namespace,
    )
def get_desired_marathon_configs(soa_dir):
    """Build the formatted marathon app dict for every marathon instance in this cluster.

    :param soa_dir: The service directory to read instance configuration from
    :returns: A dict mapping each app id (leading '/' stripped) to its
              formatted marathon app dict. Instances whose config cannot be
              formatted because of the exceptions handled below are left out.
    """
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(
        instance_type='marathon',
        cluster=cluster,
        soa_dir=soa_dir,
    )
    marathon_configs = dict()
    for service, instance in instances:
        try:
            marathon_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            ).format_marathon_app_dict()
            marathon_configs[marathon_config['id'].lstrip('/')] = marathon_config
        except NoSlavesAvailableError as errormsg:
            _log(
                service=service,
                # Log the message text rather than the exception object itself.
                line=str(errormsg),
                component='deploy',
                level='event',
                cluster=cluster,
                instance=instance,
            )
        except (NoDeploymentsAvailable, NoDockerImageError):
            # Deliberate best-effort: these instances are skipped without logging.
            pass
    return marathon_configs
def delete_app(app_id, client):
    """Deletes a marathon app safely and logs to notify the user that it happened

    :param app_id: Full marathon app id of the app to delete
    :param client: Marathon client passed through to bounce_lib.delete_marathon_app
    :raises Exception: anything other than IOError raised during the cleanup
                       is logged line by line at debug level and re-raised
    """
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    try:
        with bounce_lib.bounce_lock_zookeeper(marathon_tools.compose_job_id(service, instance)):
            bounce_lib.delete_marathon_app(app_id, client)
        log_line = "Deleted stale marathon job that looks lost: %s" % app_id
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=load_system_paasta_config().get_cluster(),
            instance=instance,
            line=log_line,
        )
    except IOError:
        # IOError is treated as "a bounce is in progress": skip this app.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        # Look the cluster up once instead of once per traceback line.
        cluster = load_system_paasta_config().get_cluster()
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def alert_state_change(application: Application, soa_dir: str) -> Generator:
    """Wrap a cleanup step for ``application`` and log how it went.

    Control is handed to the caller at the ``yield``. If the caller's code
    finishes without raising, an 'event' line is logged; if it raises, the
    traceback is logged line by line at 'debug' level and the exception is
    re-raised. ``soa_dir`` is accepted but not used in this function.
    """
    deployment = application.kube_deployment
    service = deployment.service
    instance = deployment.instance
    cluster = load_system_paasta_config().get_cluster()
    # Fields shared by every log line emitted here.
    common = dict(service=service, component="deploy", cluster=cluster, instance=instance)
    try:
        yield
        deleted_msg = (
            "Deleted stale Kubernetes apps that looks lost: %s"
            % application.item.metadata.name
        )
        _log(level="event", line=deleted_msg, **common)
    except Exception:
        header = "Exception raised during cleanup of service %s:" % application
        trace_lines = traceback.format_exc().rstrip().split("\n")
        for entry in [header] + trace_lines:
            _log(level="debug", line=entry, **common)
        raise
def bounce_chronos_job(service, instance, cluster, jobs_to_disable, jobs_to_delete, job_to_create, client):
    """Disable, delete and create chronos jobs as requested, logging what is done.

    :returns: the tuple ``(0, "All chronos bouncing tasks finished.")``
    """
    has_work = any([jobs_to_disable, jobs_to_delete, job_to_create])
    if not has_work:
        log.debug("Not doing any chronos bounce action for %s" % chronos_tools.compose_job_id(service, instance))
    else:
        summary = "Chronos bouncing. Jobs to disable: %s, jobs to delete: %s, job_to_create: %s" % (
            jobs_to_disable, jobs_to_delete, job_to_create)
        _log(
            service=service,
            instance=instance,
            component='deploy',
            cluster=cluster,
            level='debug',
            line=summary,
        )

    for stale_job in jobs_to_disable:
        chronos_tools.disable_job(client=client, job=stale_job)
    for doomed_job in jobs_to_delete:
        chronos_tools.delete_job(client=client, job=doomed_job)

    if job_to_create:
        chronos_tools.create_job(client=client, job=job_to_create)
        _log(
            service=service,
            instance=instance,
            component='deploy',
            cluster=cluster,
            level='event',
            line='Created new Chronos job: %s' % job_to_create['name'],
        )
    return (0, "All chronos bouncing tasks finished.")
def paasta_itest(args):
    """Build and test a docker image

    Runs ``make itest`` with DOCKER_TAG set in the environment, logs the
    outcome, and exits the process with the resulting return code.
    """
    prefix = "services-"
    service = args.service
    if service and service.startswith(prefix):
        service = service[len(prefix):]
    validate_service_name(service)

    tag = build_docker_tag(service, args.commit)
    itest_env = os.environ.copy()
    itest_env["DOCKER_TAG"] = tag

    _log(service=service, line="starting itest for %s." % args.commit, component="build", level="event")
    returncode, output = _run(
        "make itest",
        env=itest_env,
        timeout=3600,
        log=True,
        component="build",
        service=service,
        loglevel="debug",
    )

    messages = []
    if returncode == 0:
        messages.append("itest passed for %s." % args.commit)
        # A passing itest that produced no image still counts as a failure.
        if not check_docker_image(service, args.commit):
            messages.append("ERROR: itest has not created %s" % tag)
            returncode = 1
    else:
        messages.append("ERROR: itest failed for %s." % args.commit)
        build_url = get_jenkins_build_output_url()
        if build_url:
            messages.append("See output: %s" % build_url)

    for message in messages:
        _log(service=service, line=message, component="build", level="event")
    sys.exit(returncode)
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param cluster: The cluster whose marathon config supplies the monitoring
                    settings; also recorded on the log line
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    # Read the monitoring settings from the caller's soa_dir, the same
    # directory used below for the runbook lookup and the event itself,
    # instead of silently falling back to the loader's default directory.
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service, namespace, cluster, soa_dir=soa_dir,
    ).get_monitoring()
    # Only default alert_after; check_every and runbook are always overwritten.
    if "alert_after" not in monitoring_overrides:
        monitoring_overrides["alert_after"] = "2m"
    monitoring_overrides["check_every"] = "1m"
    monitoring_overrides["runbook"] = monitoring_tools.get_runbook(monitoring_overrides, service, soa_dir=soa_dir)

    check_name = "check_marathon_services_replication.%s" % compose_job_id(service, namespace)
    monitoring_tools.send_event(service, check_name, monitoring_overrides, status, output, soa_dir)
    _log(
        service=service,
        line="Replication: %s" % output,
        component="monitoring",
        level="debug",
        cluster=cluster,
        instance=namespace,
    )
def mark_for_deployment(git_url, cluster, instance, service, commit):
    """Mark a docker image for deployment

    Runs the command produced by ``build_command`` (30 second timeout), logs
    every line returned by ``get_loglines`` and returns the command's
    return code.
    """
    command = build_command(git_url, commit, cluster=cluster, instance=instance)
    returncode, output = _run(command, timeout=30)
    messages = get_loglines(
        returncode=returncode,
        cmd=command,
        output=output,
        commit=commit,
        cluster=cluster,
        instance=instance,
    )
    for message in messages:
        _log(
            service=service,
            line=message,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
        )
    return returncode
def bounce_chronos_job(service, instance, cluster, jobs_to_disable, jobs_to_delete, job_to_create, client):
    """Apply the requested chronos job changes and log each step.

    Jobs in ``jobs_to_disable`` are disabled, jobs in ``jobs_to_delete`` are
    deleted, and ``job_to_create`` (when truthy) is created.

    :returns: the tuple ``(0, "All chronos bouncing tasks finished.")``
    """
    if any([jobs_to_disable, jobs_to_delete, job_to_create]):
        plan = "Chronos bouncing. Jobs to disable: %s, jobs to delete: %s, job_to_create: %s" % (
            jobs_to_disable, jobs_to_delete, job_to_create)
        _log(service=service, instance=instance, component='deploy', cluster=cluster, level='debug', line=plan)
    else:
        job_id = chronos_tools.compose_job_id(service, instance)
        log.debug("Not doing any chronos bounce action for %s" % job_id)

    for to_disable in jobs_to_disable:
        chronos_tools.disable_job(client=client, job=to_disable)

    for to_delete in jobs_to_delete:
        chronos_tools.delete_job(client=client, job=to_delete)

    if job_to_create:
        chronos_tools.create_job(client=client, job=job_to_create)
        created = 'Created new Chronos job: %s' % job_to_create['name']
        _log(service=service, instance=instance, component='deploy', cluster=cluster, level='event', line=created)

    return (0, "All chronos bouncing tasks finished.")
def delete_app(app_id, client):
    """Deletes a marathon app safely and logs to notify the user that it happened

    :param app_id: Full marathon app id of the app to delete
    :param client: Marathon client passed through to bounce_lib.delete_marathon_app
    :raises Exception: anything other than IOError raised during the cleanup
                       is logged line by line at debug level and re-raised
    """
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    try:
        with bounce_lib.bounce_lock_zookeeper(marathon_tools.compose_job_id(service, instance)):
            bounce_lib.delete_marathon_app(app_id, client)
        log_line = "Deleted stale marathon job that looks lost: %s" % app_id
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=load_system_paasta_config().get_cluster(),
            instance=instance,
            line=log_line,
        )
    except IOError:
        # IOError is treated as "a bounce is in progress": skip this app.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        # Look the cluster up once instead of once per traceback line.
        cluster = load_system_paasta_config().get_cluster()
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def start_chronos_job(service, instance, job_id, client, cluster, job_config, complete_job_config, emergency=False):
    """
    Calls the 'manual start' Chronos endpoint
    (https://mesos.github.io/chronos/docs/api.html#manually-starting-a-job),
    running the job now regardless of its 'schedule'. The job's "schedule" is
    unmodified. If the job is disabled, only a message is printed and nothing
    is started.
    """
    name = PaastaColors.cyan(job_id)

    # A job whose complete config has 'disabled' set is never started here.
    if complete_job_config['disabled']:
        warning = PaastaColors.red("You cannot emergency start a disabled job. Run `paasta start` first.")
        paasta_print(warning)
        return

    if emergency:
        log_reason = PaastaColors.red("EmergencyStart")
    else:
        log_reason = "Brutal bounce"
    _log(
        service=service,
        line="%s: Starting manual run of %s in Chronos" % (log_reason, name),
        component="deploy",
        level="event",
        cluster=cluster,
        instance=instance,
    )
    client.update(complete_job_config)
    client.run(job_id)
def mark_for_deployment(git_url, deploy_group, service, commit):
    """Mark a docker image for deployment

    Force-pushes the deploy tag for ``deploy_group`` at ``commit`` and logs
    the outcome.

    :returns: 0 when the remote ref was created, 1 when the push raised
    """
    deploy_tag = get_paasta_tag_from_deploy_group(identifier=deploy_group, desired_state='deploy')
    ref_mutator = remote_git.make_force_push_mutate_refs_func(
        targets=[format_tag(deploy_tag)],
        sha=commit,
    )
    try:
        remote_git.create_remote_refs(git_url=git_url, ref_mutator=ref_mutator, force=True)
    except Exception as e:
        exit_status = 1
        messages = ["Failed to mark %s for deployment in deploy group %s!" % (commit, deploy_group)]
        # One log entry per line of the error text.
        messages.extend(str(e).split('\n'))
    else:
        exit_status = 0
        messages = ["Marked %s for deployment in deploy group %s" % (commit, deploy_group)]

    for message in messages:
        _log(service=service, line=message, component='deploy', level='event')
    return exit_status
def bounce_chronos_job(service, instance, cluster, job_to_update, client):
    """Update the given chronos job, if any, and log before and after.

    :returns: the tuple ``(0, "All chronos bouncing tasks finished.")``
    """
    outcome = (0, "All chronos bouncing tasks finished.")
    if not job_to_update:
        return outcome

    # Fields shared by both log lines.
    common = dict(service=service, instance=instance, component='deploy', cluster=cluster)
    _log(level='debug', line='Job to update: %s' % job_to_update, **common)
    chronos_tools.update_job(client=client, job=job_to_update)
    _log(level='event', line='Updated Chronos job: %s' % job_to_update['name'], **common)
    return outcome
def on_enter_rolled_back(self):
    """Report a finished rollback to slack and the deploy log, then start the auto-abandon timer."""
    slack_status = f"Finished rolling back to `{self.old_git_sha[:8]}` in {self.deploy_group}"
    self.update_slack_status(slack_status)

    _log(
        service=self.service,
        component="deploy",
        line=f"Rollback to {self.old_git_sha[:8]} for {self.deploy_group} complete",
        level="event",
    )
    self.start_timer(self.auto_abandon_delay, "auto_abandon", "abandon")
def mark_for_deployment(git_url, deploy_group, service, commit):
    """Mark a docker image for deployment

    Force-pushes the deploy branch for ``deploy_group`` at ``commit`` and
    logs the outcome.

    :returns: 0 when the remote ref was created, 1 when the push raised
    """
    deploy_branch = get_paasta_branch_from_deploy_group(identifier=deploy_group)
    ref_mutator = remote_git.make_force_push_mutate_refs_func(
        target_branches=[deploy_branch],
        sha=commit,
    )
    try:
        remote_git.create_remote_refs(git_url=git_url, ref_mutator=ref_mutator, force=True)
    except Exception as e:
        exit_status = 1
        messages = ["Failed to mark %s in for deployment in deploy group %s!" % (commit, deploy_group)]
        # One log entry per line of the error text.
        messages.extend(str(e).split('\n'))
    else:
        exit_status = 0
        messages = ["Marked %s in for deployment in deploy group %s" % (commit, deploy_group)]

    for message in messages:
        _log(service=service, line=message, component='deploy', level='event')
    return exit_status
def get_desired_marathon_configs(soa_dir):
    """Collect the marathon configs for every marathon instance in this cluster.

    :param soa_dir: The service directory to read instance configuration from
    :returns: a pair ``(formatted_marathon_configs, job_configs)``; both dicts
              are keyed by app id with the leading '/' stripped. Instances
              whose config fails to load or format are logged and left out.
    """
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(instance_type="marathon", cluster=cluster, soa_dir=soa_dir)

    formatted_marathon_configs = {}
    job_configs = {}
    for service, instance in instances:
        try:
            job_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
            app_dict = job_config.format_marathon_app_dict()
            app_id = app_dict["id"].lstrip("/")
            formatted_marathon_configs[app_id] = app_dict
            job_configs[app_id] = job_config
        # Not ideal but we rely on a lot of user input to create the app dict
        # and we really can't afford to bail if just one app definition is malformed
        except Exception as errormsg:
            _log(
                service=service,
                line=str(errormsg),
                component="deploy",
                level="debug",
                cluster=cluster,
                instance=instance,
            )
    return formatted_marathon_configs, job_configs
def wait_for_deployment(service, deploy_group, git_sha, soa_dir, timeout):
    """Poll until every instance of the deploy group is dequeued or the timeout passes.

    :param service: service name being deployed
    :param deploy_group: deploy group whose instances are waited on
    :param git_sha: sha stored on each ClusterData entry
    :param soa_dir: service directory used to build the cluster/instance map
    :param timeout: seconds to wait before giving up
    :returns: 0 once every cluster's instance queue is empty
    :raises NoInstancesFound: the cluster/instance map is empty
    :raises KeyboardInterrupt: ``green_light`` is no longer set after a query round
    :raises TimeoutError: the deadline passed with instances still queued
    """
    cluster_map = get_cluster_instance_map_for_service(
        soa_dir=soa_dir, service=service, deploy_group=deploy_group)
    if not cluster_map:
        _log(service=service, component='deploy',
             line=("Couldn't find any instances for service {0} in deploy "
                   "group {1}".format(service, deploy_group)),
             level='event')
        raise NoInstancesFound
    paasta_print("Waiting for deployment of {0} for '{1}' complete...".format(
        git_sha, deploy_group))

    # One ClusterData per cluster, each with a queue holding that cluster's
    # instances; total_instances is the size of all queues combined.
    total_instances = 0
    clusters_data = []
    for cluster in cluster_map:
        clusters_data.append(
            ClusterData(cluster=cluster, service=service, git_sha=git_sha, instances_queue=Queue()))
        for i in cluster_map[cluster]['instances']:
            clusters_data[-1].instances_queue.put(i)
        total_instances += len(cluster_map[cluster]['instances'])

    deadline = time.time() + timeout
    green_light = Event()
    green_light.set()
    with progressbar.ProgressBar(maxval=total_instances) as bar:
        while time.time() < deadline:
            # NOTE(review): _query_clusters presumably removes deployed
            # instances from the queues and clears green_light when the user
            # interrupts - confirm against its definition.
            _query_clusters(clusters_data, green_light)
            if not green_light.is_set():
                raise KeyboardInterrupt
            # Progress = instances no longer waiting in any queue.
            bar.update(total_instances - sum((c.instances_queue.qsize() for c in clusters_data)))
            if all((cluster.instances_queue.empty() for cluster in clusters_data)):
                sys.stdout.flush()
                return 0
            else:
                time.sleep(min(10, timeout))
            sys.stdout.flush()

    # Reached only when the deadline passed without every queue draining.
    _log(service=service, component='deploy',
         line=(
             "\n\nTimed out after {0} seconds, waiting for {2} in {1} to be "
             "deployed by PaaSTA. \n\n"
             "This probably means the deploy hasn't suceeded. The new service "
             "might not be healthy or one or more clusters could be having "
             "issues.\n\n"
             "To debug: try running:\n\n paasta status -s {2} -vv\n"
             " paasta logs -s {2}\n\nto determine the cause.\n\n"
             "If the service is known to be slow to start you may wish to "
             "increase the timeout on this step.".format(
                 timeout, deploy_group, service)),
         level='event')
    raise TimeoutError
def log_event(service_config, desired_state):
    """Record a state-change request in both the deploy log and the audit log.

    :param service_config: config object providing get_service(),
                           get_instance() and get_cluster()
    :param desired_state: the requested state; also used as the audit action
    """
    user = utils.get_username()
    host = socket.getfqdn()
    instance = service_config.get_instance()
    service = service_config.get_service()
    cluster = service_config.get_cluster()

    message = "Issued request to change state of {} (an instance of {}) to '{}' by {}@{}".format(
        instance, service, desired_state, user, host,
    )
    utils._log(
        service=service,
        level="event",
        cluster=cluster,
        instance=instance,
        component="deploy",
        line=message,
    )
    utils._log_audit(
        action=desired_state,
        service=service,
        cluster=cluster,
        instance=instance,
    )
def mark_for_deployment(git_url, cluster, instance, service, commit):
    """Mark a docker image for deployment

    Force-pushes the paasta branch for ``cluster``/``instance`` at ``commit``
    and logs the outcome.

    :returns: 0 when the remote ref was created, 1 when the push raised
    """
    deploy_branch = get_paasta_branch(cluster=cluster, instance=instance)
    ref_mutator = remote_git.make_force_push_mutate_refs_func(
        target_branches=[deploy_branch],
        sha=commit,
    )
    try:
        remote_git.create_remote_refs(git_url=git_url, ref_mutator=ref_mutator, force=True)
    except Exception as e:
        exit_status = 1
        messages = ["Failed to mark %s in for deployment on %s in the %s cluster!" % (commit, instance, cluster)]
        # One log entry per line of the error text.
        messages.extend(str(e).split('\n'))
    else:
        exit_status = 0
        messages = ["Marked %s in for deployment on %s in the %s cluster" % (commit, instance, cluster)]

    for message in messages:
        _log(
            service=service,
            line=message,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
        )
    return exit_status
def send_replication_event(instance_config, status, output):
    """Send a replication event to sensu and record it in the service log.

    ``check_every`` and ``runbook`` are always overwritten in the monitoring
    settings; ``alert_after`` only gets a default when it is not set.

    :param instance_config: an instance of LongRunningServiceConfig
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    overrides = instance_config.get_monitoring()
    service = instance_config.service
    soa_dir = instance_config.soa_dir
    cluster = instance_config.cluster

    if "alert_after" not in overrides:
        overrides["alert_after"] = "2m"
    overrides["check_every"] = "1m"
    overrides["runbook"] = get_runbook(overrides, service, soa_dir=soa_dir)

    send_event(
        service=service,
        check_name="check_paasta_services_replication.%s" % instance_config.job_id,
        overrides=overrides,
        status=status,
        output=output,
        soa_dir=soa_dir,
        cluster=cluster,
    )
    _log(
        service=service,
        line="Replication: %s" % output,
        component="monitoring",
        level="debug",
        cluster=cluster,
        instance=instance_config.instance,
    )
def paasta_push_to_registry(args):
    """Upload a docker image to a registry

    :returns: the return code of the push command
    """
    prefix = 'services-'
    service = args.service
    if service and service.startswith(prefix):
        service = service[len(prefix):]
    validate_service_name(service)

    push_cmd = build_command(service, args.commit)
    returncode, output = _run(
        push_cmd,
        timeout=3600,
        log=True,
        component='build',
        service=service,
        loglevel='debug',
    )

    if returncode == 0:
        messages = ['Successfully pushed image for %s to registry' % args.commit]
    else:
        messages = ['ERROR: Failed to promote image for %s.' % args.commit]
        build_url = get_jenkins_build_output_url()
        if build_url:
            messages.append('See output: %s' % build_url)

    for message in messages:
        _log(service=service, line=message, component='build', level='event')
    return returncode
def start_chronos_job(service, instance, job_id, client, cluster, job_config, complete_job_config, emergency=False):
    """
    Calls the 'manual start' Chronos endpoint
    (https://mesos.github.io/chronos/docs/api.html#manually-starting-a-job),
    running the job now regardless of its 'schedule'. The job's "schedule" is
    unmodified. If a job is disabled, this function does not do anything
    beyond printing a message.

    :param job_id: id of the job to run; shown (coloured) in the log line
    :param client: chronos client used to update and run the job
    :param complete_job_config: full job definition; its 'disabled' key
                                decides whether the job is started
    :param emergency: selects the "EmergencyStart" reason in the log line
                      instead of "Brutal bounce"
    """
    name = PaastaColors.cyan(job_id)

    # The job should be run immediately as long as the job is not disabled via the 'disabled' key in soa-configs or has
    # been previously stopped.
    if complete_job_config['disabled']:
        # Single-argument call form: valid on Python 3 and prints the same
        # text as the former print statement on Python 2.
        print(PaastaColors.red("You cannot emergency start a disabled job. Run `paasta start` first."))
    else:
        log_reason = PaastaColors.red("EmergencyStart") if emergency else "Brutal bounce"
        _log(
            service=service,
            line="%s: Starting manual run of %s in Chronos" % (log_reason, name),
            component="deploy",
            level="event",
            cluster=cluster,
            instance=instance
        )
        client.update(complete_job_config)
        client.run(job_id)
def paasta_push_to_registry(args):
    """Upload a docker image to a registry

    :returns: the return code of the push command
    """
    prefix = 'services-'
    service = args.service
    if service and service.startswith(prefix):
        service = service[len(prefix):]
    validate_service_name(service, args.soa_dir)

    push_cmd = build_command(service, args.commit)
    returncode, output = _run(
        push_cmd,
        timeout=3600,
        log=True,
        stream=True,
        component='build',
        service=service,
        loglevel='debug',
    )

    if returncode == 0:
        messages = ['Successfully pushed image for %s to registry' % args.commit]
    else:
        messages = ['ERROR: Failed to promote image for %s.' % args.commit]
        build_url = get_jenkins_build_output_url()
        if build_url:
            messages.append('See output: %s' % build_url)

    for message in messages:
        _log(service=service, line=message, component='build', level='event')
    return returncode
def mark_for_deployment(git_url, deploy_group, service, commit):
    """Mark a docker image for deployment

    Force-pushes the deploy tag for ``deploy_group`` at ``commit``, retrying
    a failed push up to three times with a growing pause between attempts.

    :param git_url: remote repository to push the tag to
    :param deploy_group: deploy group the tag is derived from
    :param service: service name used for log and audit entries
    :param commit: sha the tag should point at
    :returns: 0 on success, 1 if every attempt failed
    """
    tag = get_paasta_tag_from_deploy_group(
        identifier=deploy_group, desired_state="deploy"
    )
    remote_tag = format_tag(tag)
    ref_mutator = remote_git.make_force_push_mutate_refs_func(
        targets=[remote_tag], sha=commit
    )

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            remote_git.create_remote_refs(
                git_url=git_url, ref_mutator=ref_mutator, force=True
            )
        except Exception as e:
            loglines = [
                "Failed to mark {} for deployment in deploy group {}! (attempt {}/{})".format(
                    commit, deploy_group, attempt, max_attempts
                )
            ]
            # Surface why the push failed instead of dropping the error text.
            loglines.extend(str(e).split("\n"))
            for logline in loglines:
                _log(service=service, line=logline, component="deploy", level="event")
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the error return.
            if attempt < max_attempts:
                time.sleep(5 * attempt)
        else:
            logline = f"Marked {commit} for deployment in deploy group {deploy_group}"
            _log(service=service, line=logline, component="deploy", level="event")

            audit_action_details = {"deploy_group": deploy_group, "commit": commit}
            _log_audit(
                action="mark-for-deployment",
                action_details=audit_action_details,
                service=service,
            )
            return 0
    return 1
def paasta_wait_for_deployment(args):
    """Wrapping wait_for_deployment

    Resolves the service name, git url and full sha from ``args``, validates
    them, then waits for the deployment.

    :returns: 0 when the deployment completed, 1 when validation failed or
              the wait was interrupted, timed out or hit an unknown cluster
    :raises ValueError: a short sha does not match exactly one remote sha
    """
    log.setLevel(level=logging.DEBUG if args.verbose else logging.INFO)

    service = args.service
    if service and service.startswith('services-'):
        service = service.split('services-', 1)[1]

    if args.git_url is None:
        args.git_url = get_git_url(service=service, soa_dir=args.soa_dir)

    # Expand a short sha to the single full sha it matches on the remote.
    try:
        validate_full_git_sha(args.commit)
    except ArgumentTypeError:
        remote_refs = remote_git.list_remote_refs(args.git_url)
        matches = short_to_full_git_sha(short=args.commit, refs=remote_refs)
        if len(matches) != 1:
            raise ValueError(
                "%s matched %d git shas (with refs pointing at them). Must match exactly 1." %
                (args.commit, len(matches)),
            )
        args.commit = matches[0]

    try:
        validate_service_name(service, soa_dir=args.soa_dir)
        validate_deploy_group(args.deploy_group, service, args.soa_dir)
        validate_git_sha(args.commit, args.git_url, args.deploy_group, service)
    except (GitShaError, DeployGroupError, NoSuchService) as e:
        paasta_print(PaastaColors.red('{}'.format(e)))
        return 1

    try:
        wait_for_deployment(
            service=service,
            deploy_group=args.deploy_group,
            git_sha=args.commit,
            soa_dir=args.soa_dir,
            timeout=args.timeout,
        )
        done_msg = "Deployment of {} for {} complete".format(args.commit, args.deploy_group)
        _log(service=service, component='deploy', line=done_msg, level='event')
    except (KeyboardInterrupt, TimeoutError, NoSuchCluster):
        report_waiting_aborted(service, args.deploy_group)
        return 1

    return 0
def log(self, line, level=DEFAULT_LOGLEVEL):
    """Write ``line`` to the 'deploy' log for this object's service and instance."""
    fields = dict(
        service=self.service_name,
        instance=self.instance_name,
        component='deploy',
        line=line,
        level=level,
    )
    _log(**fields)
def delete_app(app_id, client, soa_dir):
    """Deletes a marathon app safely and logs to notify the user that it happened

    After the delete, an OK event is sent for each check tied to the removed
    instance.

    :param app_id: Full marathon app id of the app to delete
    :param client: Marathon client passed through to bounce_lib.delete_marathon_app
    :param soa_dir: service directory passed to send_event
    :raises Exception: anything other than IOError raised during the cleanup
                       is logged line by line at debug level and re-raised
    """
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
        # Send OK for every check attached to this instance, in this order.
        for check_prefix in (
            'check_marathon_services_replication',
            'setup_marathon_job',
            'paasta_bounce_progress',
        ):
            send_event(
                service=service,
                check_name='%s.%s' % (check_prefix, short_app_id),
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                overrides={},
                output="This instance was removed and is no longer running",
            )
        log_line = "Deleted stale marathon job that looks lost: %s" % app_id
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
            line=log_line,
        )
    except IOError:
        # IOError is treated as "a bounce is in progress": skip this app.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                # Reuse the cluster read above instead of reloading the
                # system config for every traceback line.
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def on_enter_deployed(self):
    """Log that the deployment completed, then send the good-deploy slack notification."""
    _log(
        service=self.service,
        component='deploy',
        line=f"Deployment of {self.commit} for {self.deploy_group} complete",
        level='event',
    )
    notifier = self.slack_notifier
    notifier.notify_after_good_deploy()
def write_to_log(config, line, level='event'):
    """Write ``line``, prefixed with the formatted job id, to the 'deploy' log.

    :param config: object providing ``service``, ``instance`` and ``cluster``
    :param line: message text
    :param level: log level passed to ``_log``
    """
    job_id = format_job_id(config.service, config.instance)
    prefixed = "%s: %s" % (job_id, line)
    _log(
        service=config.service,
        line=prefixed,
        component='deploy',
        level=level,
        cluster=config.cluster,
        instance=config.instance,
    )
def trigger_deploys(service):
    """Connects to the deploymentsd watcher on sysgit, which is an extremely simple
    service that listens for a service string and then generates a service deployment

    :param service: service name; sent as one UTF-8 line to sysgit.yelpcorp.com:5049
    """
    logline = f"Notifying sysgit to generate a deployment for {service}"
    _log(service=service, line=logline, component="deploy", level="event")
    # The with-block closes the socket even when connect() or sendall() raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.connect(("sysgit.yelpcorp.com", 5049))
        # sendall() keeps writing until the whole request is out; send() may
        # transmit only part of it.
        client.sendall(f"{service}\n".encode("utf-8"))
def write_to_log(config, line, level='event'):
    """Write ``line`` to the 'deploy' log for the service instance described by ``config``.

    :param config: object providing ``service``, ``instance`` and ``cluster``
    :param line: message text
    :param level: log level passed to ``_log``
    """
    fields = dict(
        service=config.service,
        line=line,
        component='deploy',
        level=level,
        cluster=config.cluster,
        instance=config.instance,
    )
    _log(**fields)
def write_to_log(config, line, level="event"):
    """Write ``line`` to the "deploy" log for the service instance described by ``config``.

    :param config: object providing ``service``, ``instance`` and ``cluster``
    :param line: message text
    :param level: log level passed to ``_log``
    """
    fields = dict(
        service=config.service,
        line=line,
        component="deploy",
        level=level,
        cluster=config.cluster,
        instance=config.instance,
    )
    _log(**fields)
def send_replication_event(instance_config, status, output, description, dry_run=False):
    """Send a replication event to sensu and record it in the service log.

    ``check_every``, ``runbook``, ``tip`` and ``description`` are always
    written into the monitoring settings; ``alert_after`` only gets a default
    when it is not set.

    :param instance_config: an instance of LongRunningServiceConfig
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    :param description: stored under the "description" monitoring key
    :param dry_run: Print the event instead of emitting it
    """

    def default_runbook(_):
        return DEFAULT_REPLICATION_RUNBOOK

    def default_tip(_):
        return (
            f"Check the instance with: `paasta status -s {instance_config.service} "
            f"-i {instance_config.instance} -c {instance_config.cluster} -vv`"
        )

    overrides = instance_config.get_monitoring()
    if "alert_after" not in overrides:
        overrides["alert_after"] = "2m"
    overrides["check_every"] = "1m"
    overrides["runbook"] = __get_monitoring_config_value(
        "runbook",
        overrides,
        instance_config.service,
        soa_dir=instance_config.soa_dir,
        monitoring_defaults=default_runbook,
    )
    overrides["tip"] = __get_monitoring_config_value(
        "tip",
        overrides,
        instance_config.service,
        soa_dir=instance_config.soa_dir,
        monitoring_defaults=default_tip,
    )
    overrides["description"] = description

    send_event(
        service=instance_config.service,
        check_name="check_paasta_services_replication.%s" % instance_config.job_id,
        overrides=overrides,
        status=status,
        output=output,
        soa_dir=instance_config.soa_dir,
        cluster=instance_config.cluster,
        dry_run=dry_run,
    )
    _log(
        service=instance_config.service,
        line="Replication: %s" % output,
        component="monitoring",
        level="debug",
        cluster=instance_config.cluster,
        instance=instance_config.instance,
    )
def wait_for_deployment(service, deploy_group, git_sha, soa_dir, timeout):
    """Poll every cluster of the deploy group until all instances report as deployed.

    :param service: service name being deployed
    :param deploy_group: deploy group whose instances are waited on
    :param git_sha: sha passed to ``instances_deployed``
    :param soa_dir: service directory used to build the cluster/instance map
    :param timeout: seconds handed to the ``Timeout`` context manager
    :returns: True once every cluster's deployed count equals its instance count
    :raises NoInstancesFound: the cluster/instance map is empty
    :raises TimeoutError: re-raised after logging the timeout message
    """
    cluster_map = get_cluster_instance_map_for_service(soa_dir, service, deploy_group)
    if not cluster_map:
        line = "Couldn't find any instances for service {0} in deploy group {1}".format(
            service, deploy_group)
        _log(service=service, component='deploy', line=line, level='event')
        raise NoInstancesFound
    paasta_print("Waiting for deployment of {0} for '{1}' complete...".format(
        git_sha, deploy_group))
    # Every cluster entry starts with zero instances counted as deployed.
    for cluster in cluster_map.values():
        cluster['deployed'] = 0
    try:
        # NOTE(review): Timeout presumably raises TimeoutError once `timeout`
        # seconds have elapsed inside the block - confirm against its definition.
        with Timeout(seconds=timeout):
            total_instances = sum(
                [len(v["instances"]) for v in cluster_map.values()])
            with progressbar.ProgressBar(maxval=total_instances) as bar:
                while True:
                    for cluster, instances in cluster_map.items():
                        # Only re-query clusters that are not fully deployed yet.
                        if cluster_map[cluster]['deployed'] != len(
                                cluster_map[cluster]['instances']):
                            cluster_map[cluster][
                                'deployed'] = instances_deployed(
                                    cluster=cluster,
                                    service=service,
                                    instances=instances['instances'],
                                    git_sha=git_sha)
                            if cluster_map[cluster]['deployed'] == len(
                                    cluster_map[cluster]['instances']):
                                instance_csv = ", ".join(
                                    cluster_map[cluster]['instances'])
                                paasta_print(
                                    "Deploy to %s complete! (instances: %s)" %
                                    (cluster, instance_csv))
                        bar.update(
                            sum([v["deployed"] for v in cluster_map.values()]))
                    if all([
                            cluster['deployed'] == len(cluster["instances"])
                            for cluster in cluster_map.values()
                    ]):
                        break
                    else:
                        time.sleep(10)
    except TimeoutError:
        line = "\n\nTimed out after {0} seconds, waiting for {2} in {1} to be deployed by PaaSTA. \n\n"\
               "This probably means the deploy hasn't suceeded. The new service might not be healthy or one "\
               "or more clusters could be having issues.\n\n"\
               "To debug: try running:\n\n"\
               " paasta status -s {2} -vv\n"\
               " paasta logs -s {2}\n\n"\
               "to determine the cause.\n\n"\
               "If the service is known to be slow to start you may wish to increase "\
               "the timeout on this step.".format(timeout, deploy_group, service)
        _log(service=service, component='deploy', line=line, level='event')
        raise
    return True
def scale_marathon_job(service, instance, app_id, delta, client, cluster):
    """Scale a marathon app up or down by ``delta`` instances and log the action.

    :param service: service name, used for the log entry
    :param instance: instance name, used for the log entry
    :param app_id: marathon app id to scale
    :param delta: number of instances to add (positive) or remove (negative);
                  anything ``int()`` accepts
    :param client: marathon client providing ``scale_app``
    :param cluster: cluster recorded on the log line
    """
    # Normalise once so the logged direction, the logged magnitude and the
    # value sent to marathon all derive from the same integer.
    delta = int(delta)
    name = PaastaColors.cyan(compose_job_id(service, instance))
    direction = 'down' if delta < 0 else 'up'
    _log(
        service=service,
        line="EmergencyScale: Scaling %s %s by %d instances" % (name, direction, abs(delta)),
        component='deploy',
        level='event',
        cluster=cluster,
        instance=instance,
    )
    client.scale_app(app_id, delta=delta, force=True)
def wait_for_deployment(service, deploy_group, git_sha, soa_dir, timeout):
    """Wait for ``git_sha`` to be deployed on every instance of a deploy group.

    Every 10 seconds the per-cluster ``deployed`` counter is refreshed through
    ``instances_deployed`` and the progress bar is updated; stdout is flushed
    after each polling round.

    :param service: The service name being deployed
    :param deploy_group: The deploy group whose instances are waited on
    :param git_sha: The git sha that is expected to be deployed
    :param soa_dir: The service directory used to build the cluster/instance map
    :param timeout: Number of seconds handed to ``Timeout``
    :returns: True once all clusters are fully deployed
    :raises NoInstancesFound: when the cluster/instance map is empty
    :raises TimeoutError: logged, then re-raised unchanged
    """
    cluster_map = get_cluster_instance_map_for_service(soa_dir, service, deploy_group)
    if not cluster_map:
        line = "Couldn't find any instances for service {0} in deploy group {1}".format(
            service, deploy_group)
        _log(service=service, component='deploy', line=line, level='event')
        raise NoInstancesFound

    banner = "Waiting for deployment of {0} for '{1}' complete...".format(git_sha, deploy_group)
    paasta_print(banner)

    for progress in cluster_map.values():
        progress['deployed'] = 0

    def _everything_deployed():
        # True when each cluster's counter matches its instance count.
        return all([p['deployed'] == len(p["instances"]) for p in cluster_map.values()])

    try:
        # NOTE(review): Timeout presumably raises TimeoutError on expiry -- confirm.
        with Timeout(seconds=timeout):
            total_instances = sum([len(p["instances"]) for p in cluster_map.values()])
            with progressbar.ProgressBar(maxval=total_instances) as bar:
                while True:
                    for cluster, progress in cluster_map.items():
                        expected = len(progress['instances'])
                        if progress['deployed'] != expected:
                            progress['deployed'] = instances_deployed(
                                cluster=cluster,
                                service=service,
                                instances=progress['instances'],
                                git_sha=git_sha,
                            )
                            if progress['deployed'] == len(progress['instances']):
                                instance_csv = ", ".join(progress['instances'])
                                paasta_print(
                                    "Deploy to %s complete! (instances: %s)" % (cluster, instance_csv))
                        bar.update(sum([p["deployed"] for p in cluster_map.values()]))
                    if _everything_deployed():
                        # break skips the flush at the end of the loop body.
                        sys.stdout.flush()
                        break
                    time.sleep(10)
                    sys.stdout.flush()
    except TimeoutError:
        line = (
            "\n\nTimed out after {0} seconds, waiting for {2} in {1} to be deployed by PaaSTA. \n\n"
            "This probably means the deploy hasn't suceeded. The new service might not be healthy or one "
            "or more clusters could be having issues.\n\n"
            "To debug: try running:\n\n"
            " paasta status -s {2} -vv\n"
            " paasta logs -s {2}\n\n"
            "to determine the cause.\n\n"
            "If the service is known to be slow to start you may wish to increase "
            "the timeout on this step."
        ).format(timeout, deploy_group, service)
        _log(service=service, component='deploy', line=line, level='event')
        raise
    return True
def restart_marathon_job(service, instance, app_id, client, cluster):
    """Log an EmergencyRestart event and scale the app down to zero instances.

    Only the scale-down is issued here; per the log message the instances are
    expected to be scaled back up by something else.

    :param service: The service name the job belongs to
    :param instance: The instance name the job belongs to
    :param app_id: The app id handed to ``client.scale_app``
    :param client: Object providing ``scale_app``
    :param cluster: The cluster recorded on the log line
    """
    job_name = PaastaColors.cyan(compose_job_id(service, instance))
    message = (
        "EmergencyRestart: Scaling %s down to 0 instances, then letting them scale back up"
        % job_name
    )
    _log(
        service=service,
        line=message,
        component='deploy',
        level='event',
        cluster=cluster,
        instance=instance,
    )
    client.scale_app(app_id, instances=0, force=True)
def log_to_paasta(log_line):
    """Add the event to the standard PaaSTA logging backend.

    :param log_line: Object exposing ``process_name``, ``hostname``,
        ``container_id``, ``service``, ``instance`` and ``cluster``
    """
    # Mention the process name only when one is available.
    if log_line.process_name:
        victim = 'a %s process' % log_line.process_name
    else:
        victim = 'a process'
    message = 'oom-killer killed %s on %s (container_id: %s).' % (
        victim,
        log_line.hostname,
        log_line.container_id,
    )
    _log(
        service=log_line.service,
        instance=log_line.instance,
        component='oom',
        cluster=log_line.cluster,
        level=DEFAULT_LOGLEVEL,
        line=message,
    )
def start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster):
    """Log an EmergencyStart event and scale the app to its normal instance count.

    :param service: The service name the job belongs to
    :param instance: The instance name the job belongs to
    :param app_id: The app id handed to ``client.scale_app``
    :param normal_instance_count: Number of instances to scale up to
    :param client: Object providing ``scale_app``
    :param cluster: The cluster recorded on the log line
    """
    job_name = PaastaColors.cyan(compose_job_id(service, instance))
    message = "EmergencyStart: scaling %s up to %d instances" % (job_name, normal_instance_count)
    _log(
        service=service,
        line=message,
        component='deploy',
        level='event',
        cluster=cluster,
        instance=instance,
    )
    client.scale_app(app_id, instances=normal_instance_count, force=True)
def stop_marathon_job(service, instance, app_id, client, cluster):
    """Log an EmergencyStop event and scale the app down to zero instances.

    :param service: The service name the job belongs to
    :param instance: The instance name the job belongs to
    :param app_id: The app id handed to ``client.scale_app``
    :param client: Object providing ``scale_app``
    :param cluster: The cluster recorded on the log line
    """
    job_name = PaastaColors.cyan(compose_job_id(service, instance))
    message = "EmergencyStop: Scaling %s down to 0 instances" % job_name
    _log(
        service=service,
        line=message,
        component='deploy',
        level='event',
        cluster=cluster,
        instance=instance,
    )
    # TODO do we want to capture the return val of any client calls?
    client.scale_app(app_id, instances=0, force=True)
def scale_marathon_job(service, instance, app_id, delta, client, cluster):
    """Emit an EmergencyScale log line and apply ``delta`` to the app's size.

    :param service: The service name the job belongs to
    :param instance: The instance name the job belongs to
    :param app_id: The app id handed to ``client.scale_app``
    :param delta: Instances to add (positive) or remove (negative)
    :param client: Object providing ``scale_app``
    :param cluster: The cluster recorded on the log line
    """
    colored_name = PaastaColors.cyan(compose_job_id(service, instance))
    verb = 'down' if delta < 0 else 'up'
    log_kwargs = {
        'service': service,
        'line': "EmergencyScale: Scaling %s %s by %d instances" % (
            colored_name, verb, abs(int(delta))),
        'component': 'deploy',
        'level': 'event',
        'cluster': cluster,
        'instance': instance,
    }
    _log(**log_kwargs)
    client.scale_app(app_id, delta=int(delta), force=True)
def stop_marathon_job(service, instance, app_id, client, cluster):
    """Emit an EmergencyStop log line, then force the app to zero instances.

    :param service: The service name the job belongs to
    :param instance: The instance name the job belongs to
    :param app_id: The app id handed to ``client.scale_app``
    :param client: Object providing ``scale_app``
    :param cluster: The cluster recorded on the log line
    """
    colored_name = PaastaColors.cyan(compose_job_id(service, instance))
    log_kwargs = {
        'service': service,
        'line': "EmergencyStop: Scaling %s down to 0 instances" % colored_name,
        'component': 'deploy',
        'level': 'event',
        'cluster': cluster,
        'instance': instance,
    }
    _log(**log_kwargs)
    # TODO do we want to capture the return val of any client calls?
    client.scale_app(app_id, instances=0, force=True)
def start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster):
    """Emit an EmergencyStart log line, then scale the app to its normal size.

    :param service: The service name the job belongs to
    :param instance: The instance name the job belongs to
    :param app_id: The app id handed to ``client.scale_app``
    :param normal_instance_count: Number of instances to scale up to
    :param client: Object providing ``scale_app``
    :param cluster: The cluster recorded on the log line
    """
    colored_name = PaastaColors.cyan(compose_job_id(service, instance))
    log_kwargs = {
        'service': service,
        'line': "EmergencyStart: scaling %s up to %d instances" % (
            colored_name, normal_instance_count),
        'component': 'deploy',
        'level': 'event',
        'cluster': cluster,
        'instance': instance,
    }
    _log(**log_kwargs)
    client.scale_app(app_id, instances=normal_instance_count, force=True)
def log_event(service_config, desired_state):
    """Record who requested a state change for an instance, and from where.

    :param service_config: Object providing ``get_service``, ``get_cluster``
        and ``get_instance``
    :param desired_state: The requested state, interpolated into the log line
    """
    user = utils.get_username()
    host = socket.getfqdn()
    template = "Issued request to change state of %s to '%s' by %s@%s"
    line = template % (service_config.get_instance(), desired_state, user, host)
    utils._log(
        service=service_config.get_service(),
        level='event',
        cluster=service_config.get_cluster(),
        instance=service_config.get_instance(),
        component='deploy',
        line=line,
    )
def paasta_itest(args):
    """Build and test a docker image.

    Runs ``make itest`` with ``DOCKER_TAG`` exported, then logs the outcome.

    :param args: Parsed arguments providing ``service``, ``soa_dir`` and ``commit``
    :returns: The return code of ``make itest``, or 1 when itest passed but
        ``check_docker_image`` reports the image is missing
    """
    prefix = 'services-'
    service = args.service
    soa_dir = args.soa_dir
    if service and service.startswith(prefix):
        service = service[len(prefix):]
    validate_service_name(service, soa_dir=soa_dir)

    tag = build_docker_tag(service, args.commit)
    run_env = os.environ.copy()
    run_env['DOCKER_TAG'] = tag

    _log(
        service=service,
        line='starting itest for %s.' % args.commit,
        component='build',
        level='event',
    )
    returncode, output = _run(
        "make itest",
        env=run_env,
        timeout=3600,
        log=True,
        component='build',
        service=service,
        loglevel='debug',
        stream=True,
    )

    loglines = []
    if returncode == 0:
        loglines.append('itest passed for %s.' % args.commit)
        # A passing itest must also have produced the expected image.
        if not check_docker_image(service, args.commit):
            loglines.append('ERROR: itest has not created %s' % tag)
            returncode = 1
    else:
        loglines.append('ERROR: itest failed for %s.' % args.commit)
        output = get_jenkins_build_output_url()
        if output:
            loglines.append('See output: %s' % output)

    for logline in loglines:
        _log(service=service, line=logline, component='build', level='event')
    return returncode
def stop_chronos_job(service, instance, client, cluster, existing_jobs, emergency=False):
    """Disable each existing job and delete its tasks, logging every kill.

    Each job dict is mutated in place (``disabled`` set to True) before being
    sent to ``client.update``.

    :param service: The service name recorded on the log line
    :param instance: The instance name recorded on the log line
    :param client: Object providing ``update`` and ``delete_tasks``
    :param cluster: The cluster recorded on the log line
    :param existing_jobs: Iterable of job dicts, each with a ``name`` key
    :param emergency: When True the log reason is "EmergencyStop" (in red),
        otherwise "Brutal bounce"
    """
    if emergency:
        log_reason = PaastaColors.red("EmergencyStop")
    else:
        log_reason = "Brutal bounce"

    for job in existing_jobs:
        colored_name = PaastaColors.cyan(job["name"])
        message = "%s: Killing all tasks for job %s" % (log_reason, colored_name)
        _log(
            service=service,
            line=message,
            component="deploy",
            level="event",
            cluster=cluster,
            instance=instance,
        )
        job["disabled"] = True
        client.update(job)
        client.delete_tasks(job["name"])
def start_chronos_job(service, instance, job_id, client, cluster, job_config, emergency=False):
    """Send a job config to Chronos and run it right away unless it is disabled.

    :param service: The service name recorded on the log line
    :param instance: The instance name recorded on the log line
    :param job_id: The job id, logged and handed to ``client.run``
    :param client: Object providing ``update`` and ``run``
    :param cluster: The cluster recorded on the log line
    :param job_config: Job dict with a ``disabled`` key, sent via ``client.update``
    :param emergency: When True the log reason is "EmergencyStart" (in red),
        otherwise "Brutal bounce"
    """
    colored_name = PaastaColors.cyan(job_id)
    if emergency:
        log_reason = PaastaColors.red("EmergencyStart")
    else:
        log_reason = "Brutal bounce"
    if job_config["disabled"]:
        log_immediate_run = ""
    else:
        log_immediate_run = " and running it immediately"

    message = "%s: Sending job %s to Chronos%s" % (log_reason, colored_name, log_immediate_run)
    _log(
        service=service,
        line=message,
        component="deploy",
        level="event",
        cluster=cluster,
        instance=instance,
    )
    client.update(job_config)
    # TODO fail or give some output/feedback to user that the job won't run immediately if disabled (PAASTA-1244)
    if not job_config["disabled"]:
        client.run(job_id)
def bounce_chronos_job(service, instance, cluster, job_to_update, client):
    """Update a Chronos job, if one was given, and report completion.

    :param service: The service name recorded on the log lines
    :param instance: The instance name recorded on the log lines
    :param cluster: The cluster recorded on the log lines
    :param job_to_update: Job dict with a ``name`` key; nothing is done when falsy
    :param client: Client handed to ``chronos_tools.update_job``
    :returns: The tuple ``(0, "All chronos bouncing tasks finished.")``
    """
    result = (0, "All chronos bouncing tasks finished.")
    if not job_to_update:
        return result

    common = dict(service=service, instance=instance, component='deploy', cluster=cluster)
    _log(level='debug', line='Job to update: %s' % job_to_update, **common)
    chronos_tools.update_job(client=client, job=job_to_update)
    _log(level='event', line='Updated Chronos job: %s' % job_to_update['name'], **common)
    return result
def log_bounce_action(line, level='debug'):
    """Log a bounce message under the 'deploy' component.

    ``service``, ``cluster`` and ``instance`` are not parameters; they are
    looked up in the enclosing scope.

    :param line: The message to log
    :param level: The log level, 'debug' by default
    :returns: Whatever ``_log`` returns
    """
    log_kwargs = {
        'service': service,
        'line': line,
        'component': 'deploy',
        'level': level,
        'cluster': cluster,
        'instance': instance,
    }
    return _log(**log_kwargs)
def log_deploy_error(errormsg, level='event'):
    """Log a deploy error under the 'deploy' component.

    ``service``, ``cluster`` and ``instance`` are not parameters; they are
    looked up in the enclosing scope.

    :param errormsg: The error message to log
    :param level: The log level, 'event' by default
    :returns: Whatever ``_log`` returns
    """
    return _log(
        service=service,
        line=errormsg,
        component='deploy',
        # Honor the caller's level; it used to be ignored in favor of a
        # hard-coded 'event'. The default keeps existing calls unchanged.
        level=level,
        cluster=cluster,
        instance=instance,
    )
def paasta_wait_for_deployment(args):
    """Wrapping wait_for_deployment

    Validates the service, deploy group and git sha, then waits for the
    deployment and logs its completion.

    :param args: Parsed arguments providing ``verbose``, ``service``,
        ``git_url``, ``soa_dir``, ``deploy_group``, ``commit`` and ``timeout``;
        ``git_url`` is filled in on ``args`` when it is None
    :returns: 0 on success; 1 when validation fails, the wait is interrupted
        or times out, or no instances are found
    """
    log.setLevel(level=logging.DEBUG if args.verbose else logging.INFO)

    prefix = 'services-'
    service = args.service
    if service and service.startswith(prefix):
        service = service[len(prefix):]

    if args.git_url is None:
        args.git_url = get_git_url(service=service, soa_dir=args.soa_dir)

    try:
        validate_service_name(service, soa_dir=args.soa_dir)
        validate_deploy_group(args.deploy_group, service, args.soa_dir)
        validate_git_sha(args.commit, args.git_url, args.deploy_group, service)
    except (GitShaError, DeployGroupError, NoSuchService) as e:
        paasta_print(PaastaColors.red('{}'.format(e)))
        return 1

    try:
        wait_for_deployment(
            service=service,
            deploy_group=args.deploy_group,
            git_sha=args.commit,
            soa_dir=args.soa_dir,
            timeout=args.timeout,
        )
        done_line = "Deployment of {0} for {1} complete".format(args.commit, args.deploy_group)
        _log(service=service, component='deploy', line=done_line, level='event')
    except (KeyboardInterrupt, TimeoutError):
        paasta_print("Waiting for deployment aborted.")
        return 1
    except NoInstancesFound:
        return 1
    return 0
def paasta_cook_image(args, service=None, soa_dir=None):
    """Build a docker image.

    Runs ``make cook-image`` with ``DOCKER_TAG`` exported; an existing
    ``DOCKER_TAG`` environment value takes precedence over the default tag.

    :param args: Parsed arguments; ``args.service`` is used when ``service`` is falsy
    :param service: Optional service name overriding ``args.service``
    :param soa_dir: The service directory handed to ``validate_service_name``
    :returns: The return code of ``make cook-image``; 1 when the Makefile has
        no cook-image target; 2 when interrupted by the user
    """
    prefix = 'services-'
    if not service:
        service = args.service
    if service and service.startswith(prefix):
        service = service[len(prefix):]
    validate_service_name(service, soa_dir)

    run_env = os.environ.copy()
    default_tag = 'paasta-cook-image-%s-%s' % (service, get_username())
    run_env['DOCKER_TAG'] = run_env.get('DOCKER_TAG', default_tag)

    if not makefile_responds_to('cook-image'):
        sys.stderr.write(
            'ERROR: local-run now requires a cook-image target to be present in the Makefile. See '
            'http://paasta.readthedocs.io/en/latest/about/contract.html\n'
        )
        return 1

    try:
        returncode, output = _run(
            'make cook-image',
            env=run_env,
            log=True,
            component='build',
            service=service,
            loglevel='debug',
        )
        if returncode != 0:
            _log(
                service=service,
                line='ERROR: make cook-image failed for %s.' % service,
                component='build',
                level='event',
            )
        return returncode
    except KeyboardInterrupt:
        sys.stderr.write('\nProcess interrupted by the user. Cancelling.\n')
        return 2
def paasta_push_to_registry(args):
    """Upload a docker image to a registry.

    Runs the command produced by ``build_command``, logs the outcome, and
    exits the process with the command's return code.

    :param args: Parsed arguments providing ``service`` and ``commit``
    """
    prefix = "services-"
    service = args.service
    if service and service.startswith(prefix):
        service = service[len(prefix):]
    validate_service_name(service)

    cmd = build_command(service, args.commit)
    returncode, output = _run(
        cmd,
        timeout=3600,
        log=True,
        component="build",
        service=service,
        loglevel="debug",
    )

    if returncode == 0:
        loglines = ["Successfully pushed image for %s to registry" % args.commit]
    else:
        loglines = ["ERROR: Failed to promote image for %s." % args.commit]
        output = get_jenkins_build_output_url()
        if output:
            loglines.append("See output: %s" % output)

    for logline in loglines:
        _log(service=service, line=logline, component="build", level="event")
    sys.exit(returncode)
def wait_for_deployment(service, deploy_group, git_sha, soa_dir, timeout):
    """Poll until every instance of a deploy group reports ``git_sha`` deployed.

    Each cluster in the map gets a ``deployed`` counter which is refreshed via
    ``instances_deployed`` every 10 seconds until it equals the number of
    instances in that cluster. On timeout the per-cluster counters and a
    troubleshooting/rollback hint are logged before the error is re-raised.

    :param service: The service name being deployed
    :param deploy_group: The deploy group whose instances are waited on
    :param git_sha: The git sha that is expected to be deployed
    :param soa_dir: The service directory used to build the cluster/instance map
    :param timeout: Number of seconds handed to ``Timeout``
    :returns: True once all clusters are fully deployed
    :raises NoInstancesFound: when the cluster/instance map is empty
    :raises TimeoutError: logged, then re-raised unchanged
    """
    cluster_map = get_cluster_instance_map_for_service(soa_dir, service, deploy_group)
    if not cluster_map:
        line = "Couldn't find any instances for service {0} in deploy group {1}".format(
            service, deploy_group)
        _log(service=service, component='deploy', line=line, level='event')
        raise NoInstancesFound

    for state in cluster_map.values():
        state['deployed'] = 0

    try:
        # NOTE(review): Timeout presumably raises TimeoutError on expiry -- confirm.
        with Timeout(seconds=timeout):
            total_instances = sum([len(v["instances"]) for v in cluster_map.values()])
            with progressbar.ProgressBar(maxval=total_instances) as bar:
                while True:
                    for cluster, state in cluster_map.items():
                        # Clusters that already finished are not queried again.
                        if state['deployed'] != len(state['instances']):
                            state['deployed'] = instances_deployed(
                                cluster=cluster,
                                service=service,
                                instances=state['instances'],
                                git_sha=git_sha,
                            )
                            if state['deployed'] == len(state['instances']):
                                instance_csv = ", ".join(state['instances'])
                                # Parenthesized single-argument print behaves the
                                # same as a statement (Python 2) and as a function
                                # call (Python 3); the bare print statement was a
                                # SyntaxError on Python 3.
                                print("Deploy to %s complete! (instances: %s)" % (cluster, instance_csv))
                        bar.update(sum([v["deployed"] for v in cluster_map.values()]))
                    if all([v['deployed'] == len(v["instances"]) for v in cluster_map.values()]):
                        break
                    else:
                        time.sleep(10)
    except TimeoutError:
        # First report how far each cluster got...
        human_status = ["{0}: {1}".format(cluster, data['deployed']) for cluster, data in cluster_map.items()]
        line = "\nCurrent deployment status of {0} per cluster:\n".format(deploy_group) + "\n".join(human_status)
        _log(service=service, component='deploy', line=line, level='event')
        # ...then explain how to debug or roll back.
        line = (
            "\n\nTimed out after {0} seconds, waiting for {1} in {2} to be deployed by PaaSTA. \n\n"
            "This probably means the deploy hasn't suceeded. The new service might not be healthy or one "
            "or more clusters could be having issues.\n\n"
            "To debug: try running 'paasta status -s {2} -vv' or 'paasta logs -s {2}' to determine the cause.\n\n"
            "{3} is still *marked* for deployment. To rollback, you can run: 'paasta rollback --service "
            "{2} --deploy-group {1}'\n\n"
            "If the service is known to be slow to start you may wish to increase "
            "the timeout on this step."
        ).format(timeout, deploy_group, service, git_sha)
        _log(service=service, component='deploy', line=line, level='event')
        raise
    return True
def paasta_mark_for_deployment(args):
    """Mark a docker image for deployment.

    Runs the command produced by ``build_command``, logs the resulting lines,
    and exits the process with the command's return code.

    :param args: Parsed arguments providing ``service``, ``git_url``,
        ``commit`` and ``clusterinstance``
    """
    prefix = 'services-'
    service = args.service
    if service and service.startswith(prefix):
        service = service[len(prefix):]
    validate_service_name(service)

    cmd = build_command(args.git_url, args.commit, args.clusterinstance)
    # Clusterinstance should be in cluster.instance format
    cluster, instance = args.clusterinstance.split('.')
    returncode, output = _run(cmd, timeout=30)

    for logline in get_loglines(returncode=returncode, cmd=cmd, output=output, args=args):
        _log(
            service=service,
            line=logline,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
        )
    sys.exit(returncode)