def _format_parents_verbose(job): parents = job.get('parents', []) # create (service,instance) pairs for the parent names parent_service_instances = [ tuple(chronos_tools.decompose_job_id(parent)) for parent in parents ] # find matching parent jobs parent_jobs = [ chronos_tools.get_jobs_for_service_instance(*service_instance, include_disabled=True, include_temporary=False)[0] for service_instance in parent_service_instances ] # get the status of the last run of each parent job parent_statuses = [(parent, _format_last_result(job)) for parent in parent_jobs] formatted_lines = [("\n" " - %(job_name)s\n" " Last Run: %(status)s (%(last_run)s)" % { "job_name": parent['name'], "last_run": status_parent[1], "status": status_parent[0], }) for (parent, status_parent) in parent_statuses] return '\n'.join(formatted_lines)
def filter_paasta_jobs(jobs):
    """Keep only the job names that PaaSTA is able to parse.

    A name is kept when ``chronos_tools.decompose_job_id`` accepts it, i.e.
    does not raise ``InvalidJobNameError``.

    :param jobs: an iterable of job name strings.
    :returns: a list with the parseable job names, in their original order.
    """
    def _is_paasta_job_name(name):
        # The decomposed parts are not needed, only whether parsing succeeds.
        try:
            chronos_tools.decompose_job_id(name)
        except InvalidJobNameError:
            return False
        return True

    return [name for name in jobs if _is_paasta_job_name(name)]
def modify_command_for_date(chronos_job, date, verbose):
    """Re-render a chronos job's command as it would have run on ``date``.

    The job dictionary is updated in place and the same dictionary is
    returned. A job whose command is ``None`` is returned untouched.

    :param chronos_job: a chronos job dictionary, as created by
        ``chronos_tools.create_complete_config``
    :param date: a ``datetime.datetime`` object used as the parse time.
    :param verbose: when truthy, print a notice if the job has no command.
    :returns chronos_job: the chronos_job dict with the command interpolated
        in the context of the date provided.
    """
    command_template = chronos_job['command']

    if command_template is None:
        # Nothing to interpolate; optionally tell the user why.
        if verbose:
            name_parts = chronos_tools.decompose_job_id(chronos_job['name'])
            job_name = ".".join(name_parts)
            paasta_print(
                f'command in job {job_name} is empty - skipping formatting and depending on command in image'
            )
        return chronos_job

    chronos_job['command'] = chronos_tools.parse_time_variables(
        command=command_template,
        parse_time=date,
    )
    return chronos_job
def filter_expired_tmp_jobs(client, job_names):
    """Given a list of temporary jobs, find those ready to be removed.

    Their suitability for removal is defined by two things:

    - the job has completed (irrespective of whether it was a success or
      failure)
    - the job completed more than 24 hours ago

    :param client: chronos client used to look up the temporary jobs.
    :param job_names: an iterable of temporary job names.
    :returns: a list of the job names considered expired; every name appears
        at most once.
    """
    max_age = datetime.timedelta(days=1)
    expired = []
    for job_name in job_names:
        service, instance = chronos_tools.decompose_job_id(job_name)
        temporary_jobs = chronos_tools.get_temporary_jobs_for_service_instance(
            client=client,
            service=service,
            instance=instance,
        )
        for job in temporary_jobs:
            last_run_time, last_run_state = chronos_tools.get_status_last_run(job)
            if last_run_state == chronos_tools.LastRunState.NotRun:
                # Never ran, so it cannot have completed yet.
                continue
            age = (
                datetime.datetime.now(dateutil.tz.tzutc())
                - dateutil.parser.parse(last_run_time)
            )
            if age > max_age:
                expired.append(job_name)
                # One expired match is enough; continuing would add the same
                # name again for every further expired temporary job.
                break
    return expired
def filter_expired_tmp_jobs(client, job_names, cluster, soa_dir):
    """Given a list of temporary jobs, find those ready to be removed.

    Their suitability for removal is defined by two things:

    - the job has completed (irrespective of whether it was a success or
      failure)
    - the job completed longer ago than the larger of 24 hours and the job's
      configured schedule interval

    :param client: chronos client used to look up the temporary jobs.
    :param job_names: an iterable of temporary job names.
    :param cluster: cluster name used to load the chronos job config.
    :param soa_dir: soa-configs directory used to load the chronos job config.
    :returns: a list of the job names considered expired; every name appears
        at most once.
    """
    one_day = datetime.timedelta(days=1)
    expired = []
    for job_name in job_names:
        service, instance = chronos_tools.decompose_job_id(job_name)
        temporary_jobs = chronos_tools.get_temporary_jobs_for_service_instance(
            client=client, service=service, instance=instance
        )
        if not temporary_jobs:
            # Nothing to inspect, so do not bother loading the job config.
            continue

        # The config depends only on (service, instance), so load it once per
        # job name rather than once per temporary job.
        try:
            chronos_job_config = chronos_tools.load_chronos_job_config(
                service=service, instance=instance, cluster=cluster, soa_dir=soa_dir
            )
            interval = chronos_job_config.get_schedule_interval_in_seconds() or 0
        except NoConfigurationForServiceError:
            # If we can't get the job's config, default to cleanup after 1 day
            interval = 0
        max_age = max(datetime.timedelta(seconds=interval), one_day)

        for job in temporary_jobs:
            last_run_time, last_run_state = chronos_tools.get_status_last_run(job)
            if last_run_state == chronos_tools.LastRunState.NotRun:
                continue
            age = (
                datetime.datetime.now(dateutil.tz.tzutc())
                - dateutil.parser.parse(last_run_time)
            )
            if age > max_age:
                expired.append(job_name)
                # Avoid reporting the same name once per expired temporary job.
                break
    return expired
def filter_expired_tmp_jobs(client, job_names):
    """Given a list of temporary jobs, find those ready to be removed.

    Their suitability for removal is defined by two things:

    - the job has completed (irrespective of whether it was a success or
      failure)
    - the job completed more than 24 hours ago

    :param client: chronos client used to look up the temporary jobs.
    :param job_names: an iterable of temporary job names.
    :returns: a list of the job names considered expired, without duplicates
        for a name whose service instance has several expired temporary jobs.
    """
    expiry_age = datetime.timedelta(days=1)
    expired = []
    for job_name in job_names:
        service, instance = chronos_tools.decompose_job_id(job_name)
        temporary_jobs = chronos_tools.get_temporary_jobs_for_service_instance(
            client=client, service=service, instance=instance
        )
        for job in temporary_jobs:
            last_run_time, last_run_state = chronos_tools.get_status_last_run(job)
            if last_run_state != chronos_tools.LastRunState.NotRun:
                finished_at = dateutil.parser.parse(last_run_time)
                now = datetime.datetime.now(dateutil.tz.tzutc())
                if now - finished_at > expiry_age:
                    expired.append(job_name)
                    # The name is already recorded; stop so it is not
                    # appended again for other expired temporary jobs.
                    break
    return expired
def main():
    """Remove chronos jobs (and their tasks) that are deployed but not expected.

    The expected jobs come from ``chronos_tools.get_chronos_jobs_for_cluster``
    and the deployed ones from ``deployed_job_names``. A summary of what was
    removed is printed, and the process exits with status 1 if removing any
    task or job failed.
    """
    args = parse_args()

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    # get_chronos_jobs_for_cluster returns (service, job)
    expected_service_jobs = chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)

    # filter jobs not related to paasta
    # and decompose into (service, instance, tag) tuples
    paasta_jobs = filter_paasta_jobs(deployed_job_names(client))
    running_service_jobs = [chronos_tools.decompose_job_id(job) for job in paasta_jobs]

    to_delete = jobs_to_delete(expected_service_jobs, running_service_jobs)

    # recompose the job ids again for deletion
    to_delete_job_ids = [chronos_tools.compose_job_id(*job) for job in to_delete]

    # A response whose last element is an Exception is a failed removal.
    task_responses = cleanup_tasks(client, to_delete_job_ids)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)

    job_responses = cleanup_jobs(client, to_delete_job_ids)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)

    # print(...) with a single argument behaves the same on Python 2 and 3,
    # unlike the print statement, which is a SyntaxError on Python 3.
    if len(to_delete) == 0:
        print('No Chronos Jobs to remove')
    else:
        if task_successes:
            print(format_list_output(
                "Successfully Removed Tasks (if any were running) for:",
                [job[0] for job in task_successes],
            ))

        # if there are any failures, print and exit appropriately
        if task_failures:
            print(format_list_output("Failed to Delete Tasks for:", [job[0] for job in task_failures]))

        if job_successes:
            print(format_list_output("Successfully Removed Jobs:", [job[0] for job in job_successes]))

        # if there are any failures, print and exit appropriately
        if job_failures:
            print(format_list_output("Failed to Delete Jobs:", [job[0] for job in job_failures]))

        if job_failures or task_failures:
            sys.exit(1)
def chronos_check_job_state(context, field, job_name, value):
    """Assert that ``field`` of the named chronos job equals ``value``.

    The job is looked up (disabled jobs included) by the service and instance
    decomposed from ``context.jobs[job_name]['name']``; exactly one match is
    expected.
    """
    service, instance = chronos_tools.decompose_job_id(context.jobs[job_name]['name'])
    matching_jobs = chronos_tools.lookup_chronos_jobs(
        service=service,
        instance=instance,
        client=context.chronos_client,
        include_disabled=True,
    )
    assert len(matching_jobs) == 1

    # we cast to a string so you can correctly assert that a value is True/False
    actual = str(matching_jobs[0][field])
    assert actual == value
def chronos_check_job_state(context, field, job_name, value):
    """Check that a single chronos job exists and its ``field`` matches ``value``.

    ``job_name`` indexes ``context.jobs``; the stored job's ``name`` is
    decomposed into service and instance for the lookup, which also
    considers disabled jobs.
    """
    stored_job = context.jobs[job_name]
    service, instance = chronos_tools.decompose_job_id(stored_job['name'])

    lookup_args = {
        'service': service,
        'instance': instance,
        'client': context.chronos_client,
        'include_disabled': True,
    }
    found = chronos_tools.lookup_chronos_jobs(**lookup_args)

    assert len(found) == 1
    # we cast to a string so you can correctly assert that a value is True/False
    assert str(found[0][field]) == value
def format_chronos_job_status(client, job, running_tasks, verbose=0):
    """Build a human readable, multi-line status summary for a chronos job.

    :param client: chronos client handed to
        ``chronos_tools.get_chronos_status_for_job``
    :param job: dictionary of the job status
    :param running_tasks: a list of Mesos tasks associated with ``job``, e.g.
        the result of ``mesos_tools.get_running_tasks_from_active_frameworks()``.
    :param verbose: int verbosity level; above 0 the verbose Mesos task status
        is appended to the Mesos line
    :returns: the formatted status string
    """
    display_name = _format_job_name(job)
    if 'name' in job:
        is_temporary = chronos_tools.is_temporary_job(job)
    else:
        is_temporary = 'UNKNOWN'
    display_name = modify_string_for_rerun_status(display_name, is_temporary)

    disabled_state = _format_disabled_status(job)
    service, instance = chronos_tools.decompose_job_id(job['name'])
    chronos_state = chronos_tools.get_chronos_status_for_job(client, service, instance)
    last_result, formatted_time = _format_last_result(job)

    # The schedule label and its formatter both depend on the job type.
    job_type = chronos_tools.get_job_type(job)
    schedule_type = _get_schedule_field_for_job_type(job_type)
    format_schedule = get_schedule_formatter(job_type, verbose)
    schedule_value = format_schedule(job)

    command = _format_command(job)
    mesos_status = _format_mesos_status(job, running_tasks)
    if verbose > 0:
        tail_lines = calculate_tail_lines(verbose_level=verbose)
        verbose_details = status_mesos_tasks_verbose(
            job_id=job["name"],
            get_short_task_id=get_short_task_id,
            tail_lines=tail_lines,
        )
        mesos_status = "%s\n%s" % (mesos_status, verbose_details)

    template = (
        "Job: %(job_name)s\n"
        " Status: %(disabled_state)s (%(chronos_state)s)"
        " Last: %(last_result)s (%(formatted_time)s)\n"
        " %(schedule_type)s: %(schedule_value)s\n"
        " Command: %(command)s\n"
        " Mesos: %(mesos_status)s"
    )
    fields = {
        "job_name": display_name,
        "is_temporary": is_temporary,
        "schedule_type": schedule_type,
        "chronos_state": PaastaColors.grey(chronos_state),
        "disabled_state": disabled_state,
        "last_result": last_result,
        "formatted_time": formatted_time,
        "schedule_value": schedule_value,
        "command": command,
        "mesos_status": mesos_status,
    }
    return template % fields
def format_chronos_job_status(client, job, running_tasks, verbose=0):
    """Return a pretty-printed, human readable status report for a job.

    :param client: chronos client handed to
        ``chronos_tools.get_chronos_status_for_job``
    :param job: dictionary of the job status
    :param running_tasks: a list of Mesos tasks associated with ``job``, e.g.
        the result of ``mesos_tools.get_running_tasks_from_frameworks()``.
    :param verbose: int verbosity level; above 0 the verbose Mesos task status
        is appended to the Mesos line
    :returns: the formatted status string
    """
    shown_name = _format_job_name(job)
    is_temporary = 'UNKNOWN'
    if 'name' in job:
        is_temporary = chronos_tools.is_temporary_job(job)
    shown_name = modify_string_for_rerun_status(shown_name, is_temporary)

    disabled_state = _format_disabled_status(job)
    service, instance = chronos_tools.decompose_job_id(job['name'])
    chronos_state = chronos_tools.get_chronos_status_for_job(client, service, instance)
    last_run = _format_last_result(job)
    last_result, formatted_time = last_run

    job_type = chronos_tools.get_job_type(job)
    schedule_type = _get_schedule_field_for_job_type(job_type)
    schedule_formatter = get_schedule_formatter(job_type, verbose)
    schedule_value = schedule_formatter(job)

    command = _format_command(job)
    mesos_status = _format_mesos_status(running_tasks)
    if verbose > 0:
        # Append the per-task details below the one-line Mesos summary.
        tail_lines = calculate_tail_lines(verbose_level=verbose)
        mesos_status_verbose = status_mesos_tasks_verbose(
            job_id=job["name"],
            get_short_task_id=get_short_task_id,
            tail_lines=tail_lines,
        )
        mesos_status = "%s\n%s" % (mesos_status, mesos_status_verbose)

    report_template = (
        "Job: %(job_name)s\n"
        " Status: %(disabled_state)s (%(chronos_state)s)"
        " Last: %(last_result)s (%(formatted_time)s)\n"
        " %(schedule_type)s: %(schedule_value)s\n"
        " Command: %(command)s\n"
        " Mesos: %(mesos_status)s"
    )
    report_values = {
        "job_name": shown_name,
        "is_temporary": is_temporary,
        "schedule_type": schedule_type,
        "chronos_state": PaastaColors.grey(chronos_state),
        "disabled_state": disabled_state,
        "last_result": last_result,
        "formatted_time": formatted_time,
        "schedule_value": schedule_value,
        "command": command,
        "mesos_status": mesos_status,
    }
    return report_template % report_values
def main():
    """Clone a chronos job so it re-runs for a given execution date.

    The job is identified by ``args.service_instance``. Its complete config is
    built, the command is reset to the unrendered command from the job config,
    and a clone for ``args.execution_date`` (``%Y-%m-%dT%H:%M:%S``) is added
    through the chronos client.

    :raises NoDeploymentsAvailable, NoDockerImageError: when the complete
        config cannot be built; a hint is printed first.
    :raises chronos_tools.UnknownChronosJobError: when the chronos config
        cannot be read; the error is printed first.
    :raises chronos_tools.InvalidParentError: propagated unchanged.
    """
    args = parse_args()

    # Load the system config once and reuse it for every cluster lookup.
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    service, instance = chronos_tools.decompose_job_id(args.service_instance)

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    chronos_job_config = chronos_tools.load_chronos_job_config(
        service, instance, cluster, soa_dir=args.soa_dir)

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=args.soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        print(error_msg)
        # bare raise keeps the original traceback intact
        raise
    except chronos_tools.UnknownChronosJobError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e))
        print(error_msg)
        raise

    # complete_job_config is a formatted version
    # of the job, so the command is formatted in the context
    # of 'now'
    # replace it with the 'original' cmd so it can be
    # re rendered
    original_command = chronos_job_config.get_cmd()
    complete_job_config['command'] = original_command

    execution_date = datetime.datetime.strptime(args.execution_date, "%Y-%m-%dT%H:%M:%S")
    clone = clone_job(complete_job_config, execution_date)
    client.add(clone)
def chronos_check_running_tasks(context, job_name, has_or_not):
    """Wait until the named chronos job has (or has no) running tasks.

    The job status is polled up to 10 times, sleeping one second after each
    unsuccessful poll.

    :param context: test context providing ``jobs`` and ``chronos_client``.
    :param job_name: key of the job in ``context.jobs``.
    :param has_or_not: ``"has no"`` to wait for the ``idle`` status; any other
        value (expected to be ``"has"``) waits for ``running`` or ``queued``.
    :raises AssertionError: if the expected status is never observed.
    """
    # This uses an undocumented endpoint that should be replaced once it's possible
    # to get more detailed per-job task information from Chronos
    job_id = context.jobs[job_name]['name']
    service, instance = chronos_tools.decompose_job_id(job_id)

    # range (not xrange) so this also runs on Python 3
    for _ in range(10):
        status = chronos_tools.get_chronos_status_for_job(context.chronos_client, service, instance)
        if has_or_not == "has no":
            if status == "idle":
                return
        else:  # has_or_not should be "has"
            if status == "running" or status == "queued":
                return
        time.sleep(1)

    # Raised explicitly: a plain ``assert False`` would be stripped under -O
    # and let the check pass silently.
    raise AssertionError(
        "job %s never reached the expected state (%s running tasks)" % (job_name, has_or_not)
    )
def chronos_check_running_tasks(context, job_name, has_or_not):
    """Poll a chronos job until it has (or has no) running tasks.

    ``"has no"`` waits for the ``idle`` status; anything else (expected to be
    ``"has"``) waits for ``running`` or ``queued``. The status is checked up
    to 10 times, one second apart, and the check fails if it never matches.
    """
    # This uses an undocumented endpoint that should be replaced once it's possible
    # to get more detailed per-job task information from Chronos
    service, instance = chronos_tools.decompose_job_id(context.jobs[job_name]['name'])

    if has_or_not == "has no":
        accepted_statuses = ("idle",)
    else:
        # has_or_not should be "has"
        accepted_statuses = ("running", "queued")

    attempts_left = 10
    while attempts_left > 0:
        attempts_left -= 1
        status = chronos_tools.get_chronos_status_for_job(context.chronos_client, service, instance)
        if status in accepted_statuses:
            return
        time.sleep(1)

    assert False
def chronos_check_job_state(context, old_or_new_job, disabled):
    """Assert that the old or new chronos job is in the wanted disabled state.

    ``old_or_new_job`` equal to ``'old job'`` selects
    ``context.old_chronos_job_name``; anything else selects
    ``context.chronos_job_name``. ``disabled`` equal to ``'disabled'`` means
    the job is expected to be disabled. Exactly one matching job is expected.
    """
    desired_disabled = disabled == 'disabled'

    job_id = (
        context.old_chronos_job_name
        if old_or_new_job == 'old job'
        else context.chronos_job_name
    )
    service, instance, git_hash, config_hash = chronos_tools.decompose_job_id(job_id)

    matches = chronos_tools.lookup_chronos_jobs(
        service=service,
        instance=instance,
        git_hash=git_hash,
        config_hash=config_hash,
        client=context.chronos_client,
        include_disabled=desired_disabled,
    )

    assert len(matches) == 1
    for match in matches:
        assert match['disabled'] == desired_disabled
def main():
    """Clone a chronos job so it re-runs for a given execution date.

    The job named by ``args.service_instance`` has its complete config built,
    its command replaced by the unrendered command from the job config, and a
    clone for ``args.execution_date`` (``%Y-%m-%dT%H:%M:%S``) is added through
    the chronos client.

    :raises NoDeploymentsAvailable, NoDockerImageError: when the complete
        config cannot be built; a hint is printed first.
    :raises chronos_tools.UnknownChronosJobError: when the chronos config
        cannot be read; the error is printed first.
    :raises chronos_tools.InvalidParentError: propagated unchanged.
    """
    args = parse_args()

    # A single load of the system config serves every cluster lookup below.
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    service, instance = chronos_tools.decompose_job_id(args.service_instance)

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    chronos_job_config = chronos_tools.load_chronos_job_config(
        service, instance, cluster, soa_dir=args.soa_dir)

    try:
        complete_job_config = chronos_tools.create_complete_config(
            service=service,
            job_name=instance,
            soa_dir=args.soa_dir,
        )
    except (NoDeploymentsAvailable, NoDockerImageError):
        error_msg = "No deployment found for %s in cluster %s. Has Jenkins run for it?" % (
            args.service_instance, cluster)
        # print(...) with one argument works on Python 2 and 3 alike
        print(error_msg)
        raise
    except chronos_tools.UnknownChronosJobError as e:
        error_msg = (
            "Could not read chronos configuration file for %s in cluster %s\n" % (args.service_instance, cluster) +
            "Error was: %s" % str(e))
        print(error_msg)
        raise

    # complete_job_config is a formatted version
    # of the job, so the command is formatted in the context
    # of 'now'
    # replace it with the 'original' cmd so it can be
    # re rendered
    original_command = chronos_job_config.get_cmd()
    complete_job_config['command'] = original_command

    clone = clone_job(
        complete_job_config,
        datetime.datetime.strptime(args.execution_date, "%Y-%m-%dT%H:%M:%S"),
    )
    client.add(clone)
def _format_parents_verbose(job): parents = job.get('parents', []) # create (service,instance) pairs for the parent names parent_service_instances = [tuple(chronos_tools.decompose_job_id(parent)) for parent in parents] # find matching parent jobs parent_jobs = [ chronos_tools.get_jobs_for_service_instance( *service_instance, include_disabled=True, include_temporary=False)[0] for service_instance in parent_service_instances ] # get the status of the last run of each parent job parent_statuses = [(parent, _format_last_result(job)) for parent in parent_jobs] formatted_lines = [("\n" " - %(job_name)s\n" " Last Run: %(status)s (%(last_run)s)" % { "job_name": parent['name'], "last_run": status_parent[1], "status": status_parent[0], }) for (parent, status_parent) in parent_statuses] return '\n'.join(formatted_lines)
def main():
    """Remove deployed chronos jobs that are neither expected nor still-valid temporary jobs.

    Jobs to delete are the deployed job names minus the expected job ids for
    the cluster minus temporary jobs that have not expired. Tasks and then
    jobs are cleaned up; for each successfully removed job with a parseable
    id an OK event is sent. A summary is printed and the process exits with
    status 1 if removing any task or job failed.
    """
    args = parse_args()
    soa_dir = args.soa_dir

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    running_jobs = set(deployed_job_names(client))

    expected_service_jobs = {
        chronos_tools.compose_job_id(*job)
        for job in chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir)
    }

    all_tmp_jobs = set(filter_tmp_jobs(filter_paasta_jobs(running_jobs)))
    expired_tmp_jobs = set(filter_expired_tmp_jobs(client, all_tmp_jobs))
    valid_tmp_jobs = all_tmp_jobs - expired_tmp_jobs

    to_delete = running_jobs - expected_service_jobs - valid_tmp_jobs

    # A response whose last element is an Exception is a failed removal.
    task_responses = cleanup_tasks(client, to_delete)
    task_successes = []
    task_failures = []
    for response in task_responses:
        if isinstance(response[-1], Exception):
            task_failures.append(response)
        else:
            task_successes.append(response)

    job_responses = cleanup_jobs(client, to_delete)
    job_successes = []
    job_failures = []
    for response in job_responses:
        if isinstance(response[-1], Exception):
            job_failures.append(response)
        else:
            job_successes.append(response)
            try:
                (service, instance) = chronos_tools.decompose_job_id(response[0])
                send_event(
                    service=service,
                    instance=instance,
                    monitoring_overrides={},
                    soa_dir=soa_dir,
                    status_code=pysensu_yelp.Status.OK,
                    message="This instance was removed and is no longer supposed to be scheduled.",
                )
            except InvalidJobNameError:
                # If we deleted some bogus job with a bogus jobid that could not be parsed,
                # Just move on, no need to send any kind of paasta event.
                pass

    # print(...) with a single argument behaves the same on Python 2 and 3,
    # unlike the print statement, which is a SyntaxError on Python 3.
    if not to_delete:
        print('No Chronos Jobs to remove')
    else:
        if task_successes:
            print(format_list_output(
                "Successfully Removed Tasks (if any were running) for:",
                [job[0] for job in task_successes],
            ))

        # if there are any failures, print and exit appropriately
        if task_failures:
            print(format_list_output("Failed to Delete Tasks for:", [job[0] for job in task_failures]))

        if job_successes:
            print(format_list_output("Successfully Removed Jobs:", [job[0] for job in job_successes]))

        # if there are any failures, print and exit appropriately
        if job_failures:
            print(format_list_output("Failed to Delete Jobs:", [job[0] for job in job_failures]))

        if job_failures or task_failures:
            sys.exit(1)
def main():
    """Clean up chronos jobs that should no longer be deployed.

    Everything deployed that is neither an expected job for the cluster nor a
    temporary job that has not expired yet gets its tasks and the job itself
    removed. For every job removed successfully (and whose id can be parsed)
    an OK monitoring event is sent. The outcome is printed and the process
    exits with status 1 when any task or job removal failed.
    """
    args = parse_args()
    soa_dir = args.soa_dir

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    system_paasta_config = utils.load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    running_jobs = set(deployed_job_names(client))

    expected_service_jobs = set()
    for expected_job in chronos_tools.get_chronos_jobs_for_cluster(soa_dir=args.soa_dir):
        expected_service_jobs.add(chronos_tools.compose_job_id(*expected_job))

    all_tmp_jobs = set(filter_tmp_jobs(filter_paasta_jobs(running_jobs)))
    expired_tmp_jobs = set(
        filter_expired_tmp_jobs(client, all_tmp_jobs, cluster=cluster, soa_dir=soa_dir)
    )
    valid_tmp_jobs = all_tmp_jobs - expired_tmp_jobs

    to_delete = running_jobs - expected_service_jobs - valid_tmp_jobs

    # Responses ending in an Exception instance are failures.
    task_successes, task_failures = [], []
    for task_response in cleanup_tasks(client, to_delete):
        if isinstance(task_response[-1], Exception):
            task_failures.append(task_response)
        else:
            task_successes.append(task_response)

    job_successes, job_failures = [], []
    for job_response in cleanup_jobs(client, to_delete):
        if isinstance(job_response[-1], Exception):
            job_failures.append(job_response)
            continue

        job_successes.append(job_response)
        try:
            (service, instance) = chronos_tools.decompose_job_id(job_response[0])
            monitoring_tools.send_event(
                check_name=check_chronos_job_name(service, instance),
                service=service,
                overrides={},
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                output="This instance was removed and is no longer supposed to be scheduled.",
            )
        except InvalidJobNameError:
            # If we deleted some bogus job with a bogus jobid that could not be parsed,
            # Just move on, no need to send any kind of paasta event.
            pass

    if len(to_delete) == 0:
        paasta_print('No Chronos Jobs to remove')
        return

    # Sections are reported in this fixed order; empty ones are skipped.
    report_sections = (
        ("Successfully Removed Tasks (if any were running) for:", task_successes),
        ("Failed to Delete Tasks for:", task_failures),
        ("Successfully Removed Jobs:", job_successes),
        ("Failed to Delete Jobs:", job_failures),
    )
    for heading, responses in report_sections:
        if len(responses) > 0:
            paasta_print(format_list_output(heading, [entry[0] for entry in responses]))

    # if there are any failures, exit appropriately
    if len(job_failures) > 0 or len(task_failures) > 0:
        sys.exit(1)
def main():
    """Re-run a chronos job (optionally with all related jobs) for a given date.

    Complete configs are built for the requested service instance, or for
    every related job when ``args.run_all_related_jobs`` is set. Each job is
    cloned with its unrendered command, the command is re-rendered for
    ``args.execution_date`` (``%Y-%m-%dT%H:%M:%S``), and the clones are added
    through the chronos client once all of them have been prepared.
    """
    args = parse_args()

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    service, instance = chronos_tools.decompose_job_id(args.service_instance)

    config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(config)

    related_jobs = chronos_tools.get_related_jobs_configs(cluster, service, instance, soa_dir=args.soa_dir)
    if not related_jobs:
        error_msg = "No deployment found for {} in cluster {}. Has Jenkins run for it?".format(
            args.service_instance, cluster,
        )
        paasta_print(error_msg)
        raise NoDeploymentsAvailable

    if not args.run_all_related_jobs:
        # Strip all the configuration for the related services
        # those information will not be used by the rest of the flow
        requested_key = (service, instance)
        related_jobs = {requested_key: related_jobs[requested_key]}

    complete_job_configs = {}
    for (srv, inst) in related_jobs:
        try:
            complete_job_configs[(srv, inst)] = chronos_tools.create_complete_config(
                service=srv,
                job_name=inst,
                soa_dir=args.soa_dir,
            )
        except (NoDeploymentsAvailable, NoDockerImageError) as e:
            error_msg = "No deployment found for {} in cluster {}. Has Jenkins run for it?".format(
                chronos_tools.compose_job_id(srv, inst), cluster,
            )
            paasta_print(error_msg)
            raise e
        except NoConfigurationForServiceError as e:
            error_msg = (
                "Could not read chronos configuration file for {} in cluster {}\nError was: {}"
                .format(chronos_tools.compose_job_id(srv, inst), cluster, str(e))
            )
            paasta_print(error_msg)
            raise e
        except chronos_tools.InvalidParentError as e:
            # passed through unchanged
            raise e

    if args.run_all_related_jobs:
        sorted_jobs = chronos_tools.topological_sort_related_jobs(
            cluster, service, instance, soa_dir=args.soa_dir)
    else:
        sorted_jobs = [(service, instance)]

    timestamp = datetime.datetime.utcnow().isoformat()

    chronos_to_add = []
    for (job_service, job_instance) in sorted_jobs:
        # complete_job_config is a formatted version of the job,
        # so the command is formatted in the context of 'now'
        # replace it with the 'original' cmd so it can be re rendered
        chronos_job_config = chronos_tools.load_chronos_job_config(
            service=job_service,
            instance=job_instance,
            cluster=cluster,
            soa_dir=args.soa_dir,
        )
        complete_job_config = complete_job_configs[(job_service, job_instance)]
        complete_job_config['command'] = chronos_job_config.get_cmd()

        clone = clone_job(
            chronos_job=complete_job_config,
            timestamp=timestamp,
            force_disabled=args.force_disabled,
        )

        # modify the command to run commands for a given date
        clone = modify_command_for_date(
            chronos_job=clone,
            date=datetime.datetime.strptime(args.execution_date, "%Y-%m-%dT%H:%M:%S"),
            verbose=args.verbose,
        )

        rerun_single_instance = not args.run_all_related_jobs
        if rerun_single_instance and chronos_tools.get_job_type(clone) == chronos_tools.JobType.Dependent:
            # If the job is a dependent job and we want to re-run only the specific instance
            # remove the parents and update the schedule to start the job as soon as possible
            clone = set_default_schedule(remove_parents(clone))

        chronos_to_add.append(clone)

    # Nothing is submitted until every clone has been prepared.
    for job_to_add in chronos_to_add:
        client.add(job_to_add)