def start_job(self):
    """Run every playbook of the job template and report the job result.

    Creates the result handler, reads the job template, sends the
    STARTING job log and validates the job input against the template's
    input schema (when the template defines one).  The playbooks of the
    template are then executed in order through ``JobManager``; a
    playbook is re-run while the result handler reports retry devices,
    the job has not failed and no abort was requested.

    The workflow stops early when a playbook output carries
    ``early_exit``, when a playbook failed (see the inline comments for
    the multi device rules) or when ``self.abort_flag`` is set.

    The Sandesh connection is always closed before returning.  If the
    job failed, the process is terminated with ``sys.exit(<message>)``.
    """
    job_error_msg = None
    job_template = None
    try:
        # create job UVE and log
        self.result_handler = JobResultHandler(self.job_template_id,
                                               self.job_execution_id,
                                               self.fabric_fq_name,
                                               self._logger,
                                               self.job_utils,
                                               self.job_log_utils)

        job_template = self.job_utils.read_job_template()
        self.job_template = job_template

        msg = MsgBundle.getMessage(
            MsgBundle.START_JOB_MESSAGE,
            job_execution_id=self.job_execution_id,
            job_template_name=job_template.fq_name[-1])
        self._logger.debug(msg)

        timestamp = int(round(time.time() * 1000))
        self.job_log_utils.send_job_log(job_template.fq_name,
                                        self.job_execution_id,
                                        self.fabric_fq_name,
                                        msg,
                                        JobStatus.STARTING.value,
                                        timestamp=timestamp)

        # validate job input if required by job_template input_schema
        input_schema = job_template.get_job_template_input_schema()
        if input_schema:
            self._validate_job_input(input_schema, self.job_data)

        playbook_list = job_template.get_job_template_playbooks()\
            .get_playbook_info()
        playbook_count = len(playbook_list)

        job_percent = None
        # the weightage of each playbook only matters when several
        # playbooks are chained
        if playbook_count > 1:
            task_weightage_array = [
                pb_info.job_completion_weightage
                for pb_info in playbook_list]

        for i in range(playbook_count):

            # check if its a multi device playbook
            playbooks = job_template.get_job_template_playbooks()
            play_info = playbooks.playbook_info[i]
            multi_device_playbook = play_info.multi_device_playbook

            if playbook_count > 1:
                # get the job percentage based on weightage of each
                # playbook when they are chained
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        playbook_count, buffer_task_percent=True,
                        total_percent=100, task_seq_number=i + 1,
                        task_weightage_array=task_weightage_array)[0]
            else:
                # using equal weightage
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        playbook_count, buffer_task_percent=True,
                        total_percent=100)[0]

            retry_devices = None
            while True:
                job_mgr = JobManager(self._logger, self._vnc_api,
                                     self.job_input, self.job_log_utils,
                                     job_template,
                                     self.result_handler, self.job_utils,
                                     i, job_percent, self._zk_client)
                # keep a reference so that the abort signal handler can
                # reach the running job manager
                self.job_mgr = job_mgr
                job_mgr.start_job()

                # retry the playbook execution if retry_devices is added
                # to the playbook output
                job_status = self.result_handler.job_result_status
                retry_devices = self.result_handler.get_retry_devices()
                if job_status == JobStatus.FAILURE or not retry_devices \
                        or self.abort_flag:
                    break
                self.job_input['device_json'] = retry_devices

            # update the job input with marked playbook output json
            pb_output = self.result_handler.playbook_output or {}

            if pb_output.get('early_exit'):
                break

            # stop the workflow if playbook failed
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                # stop workflow only if its a single device job or
                # it is a multi device playbook
                # and all the devices have failed some job execution
                # declare it as failure and the stop the workflow
                if not multi_device_playbook or \
                        len(self.result_handler.failed_device_jobs) == \
                        len(self.job_input.get('device_json')):
                    self._logger.error(
                        "Stop the workflow on the failed Playbook.")
                    break

                elif not retry_devices:
                    # it is a multi device playbook but one of
                    # the device jobs have failed. This means we should
                    # still declare the operation as success. We declare
                    # workflow as success even if one of the devices has
                    # succeeded the job
                    self.result_handler.job_result_status = \
                        JobStatus.SUCCESS

            if self.abort_flag:
                err_msg = "ABORTING NOW..."
                self._logger.info(err_msg)
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                break

            # read the device_data output of the playbook
            # and update the job input so that it can be used in next
            # iteration
            if not multi_device_playbook:
                device_json = pb_output.pop('device_json', None)
                self.job_input['device_json'] = device_json

            self.job_input.get('input', {}).update(pb_output)

        # create job completion log and update job UVE
        self.result_handler.create_job_summary_log(
            job_template.fq_name)

        # in case of failures, exit the job manager process with failure
        if self.result_handler.job_result_status == JobStatus.FAILURE:
            job_error_msg = self.result_handler.job_summary_message

    except JobException as exp:
        err_msg = "Job Exception recieved: %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # the failure may have happened before the result handler or the
        # job template became available; never fail inside the handler
        if self.result_handler is not None:
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    except Exception as exp:
        err_msg = "Error while executing job %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # same guards as above: without them an early failure would be
        # masked by an AttributeError raised on None
        if self.result_handler is not None:
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    finally:
        # need to wait for the last job log and uve update to complete
        # via sandesh and then close sandesh connection
        sandesh_util = SandeshUtils(self._logger)
        sandesh_util.close_sandesh_connection()
        self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
class ExecutableManager(object):
    """Runs the executables listed in a job template as child processes."""

    def __init__(self, logger, vnc_api, job_input, job_log_utils):
        """Parse the job input and read the job template.

        :param logger: logger used for all messages of this manager
        :param vnc_api: API handle passed on to ``JobUtils``
        :param job_input: dict with the job input (see
            ``parse_job_input`` for the keys that are read)
        :param job_log_utils: helper used to send job logs
        """
        self._logger = logger
        self.vnc_api = vnc_api
        self.vnc_api_init_params = None
        self.api_server_host = None
        self.auth_token = None
        self.contrail_cluster_id = None
        self.sandesh_args = None
        self.job_log_utils = job_log_utils
        self.job_input = job_input
        self.job_utils = None
        # seconds to wait for one executable before it is killed
        self.executable_timeout = 1800
        self.job_template = None
        self.job_execution_id = None
        self.job_template_id = None
        self.result_handler = None
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self.vnc_api)
        self.job_template = self.job_utils.read_job_template()
        self.job_file_write = JobFileWrite(self._logger)

    def parse_job_input(self, job_input_json):
        """Copy the known fields of the job input onto this instance.

        Missing keys are stored as ``None``.

        :param job_input_json: dict with the job input
        """
        # job input should have job_template_id and execution_id field
        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')
        self.auth_token = job_input_json.get('auth_token')
        self.contrail_cluster_id = job_input_json.get('contrail_cluster_id')
        self.sandesh_args = job_input_json.get('args')
        self.vnc_api_init_params = job_input_json.get('vnc_api_init_params')
        self.api_server_host = job_input_json.get('api_server_host')

    def _validate_job_input(self, input_schema, ip_json):
        """Validate the job input against the template's input schema.

        :param input_schema: JSON schema, either as a string or already
            decoded
        :param ip_json: job input to validate
        :raises JobException: when the input is missing or does not
            validate against the schema
        """
        if ip_json is None:
            msg = MsgBundle.getMessage(
                MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg, self.job_execution_id)

        # 'basestring' only exists on Python 2; looking it up this way
        # keeps the string check working on both Python 2 and 3
        try:
            text_types = basestring
        except NameError:
            text_types = str

        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, text_types):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            # success is not an error: log at debug level
            self._logger.debug("Input Schema Validation Successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def gather_job_args(self):
        """Return the dict that is handed to an executable as job input."""
        extra_vars = {
            'input': self.job_data,
            'job_template_id': self.job_template.get_uuid(),
            'job_template_fqname': self.job_template.fq_name,
            'fabric_fq_name': self.fabric_fq_name,
            'auth_token': self.auth_token,
            'contrail_cluster_id': self.contrail_cluster_id,
            'api_server_host': self.api_server_host,
            'job_execution_id': self.job_execution_id,
            'sandesh_args': self.sandesh_args,
            'vnc_api_init_params': self.vnc_api_init_params,
        }
        return extra_vars

    def start_job(self):
        """Run every executable of the job template and record the result.

        Sends the STARTING job log, validates the job input against the
        template's input schema (when one is defined) and starts each
        executable of the template as a child process with the job
        arguments passed as JSON through ``--job-input``.  The job status
        ("INPROGRESS", "FAILED" or "COMPLETED") is written to the
        ``job_summary`` file.  An executable that runs longer than
        ``self.executable_timeout`` seconds is killed and reported as a
        ``JobException``.

        The Sandesh connection is always closed before returning.  If an
        exception was raised, the process is terminated with
        ``sys.exit(<message>)``.
        """
        self._logger.info("Starting Executable")
        job_error_msg = None
        job_template = self.job_template
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            executable_list = job_template.get_job_template_executables()\
                .get_executable_info()
            for executable in executable_list:
                exec_path = executable.get_executable_path()
                job_input_args = self.gather_job_args()
                try:
                    exec_process = subprocess32.Popen(
                        [exec_path,
                         "--job-input", json.dumps(job_input_args),
                         '--debug', 'True'],
                        close_fds=True, cwd='/',
                        stdout=subprocess32.PIPE,
                        stderr=subprocess32.PIPE)
                    self.job_file_write.write_to_file(
                        self.job_execution_id,
                        "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "INPROGRESS"})
                    msg = "Child process pid = " + str(exec_process.pid)
                    self._logger.info(msg)
                    (out, err) = exec_process.communicate(
                        timeout=self.executable_timeout)

                    self._logger.notice(str(out))
                    self._logger.notice(str(err))
                except subprocess32.TimeoutExpired as timeout_exp:
                    # only communicate() can time out, so the child
                    # process exists at this point; kill it (signal 9)
                    os.kill(exec_process.pid, 9)
                    msg = MsgBundle.getMessage(
                        MsgBundle.RUN_EXECUTABLE_PROCESS_TIMEOUT,
                        exec_path=exec_path,
                        exc_msg=repr(timeout_exp))
                    raise JobException(msg, self.job_execution_id)

                self._logger.info(exec_process.returncode)
                self._logger.info("Executable Completed")
                if exec_process.returncode != 0:
                    self.job_file_write.write_to_file(
                        self.job_execution_id,
                        "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "FAILED"})
                    msg = MsgBundle.getMessage(
                        MsgBundle.EXECUTABLE_RETURN_WITH_ERROR,
                        exec_uri=exec_path)
                    self._logger.error(msg)
                else:
                    self.job_file_write.write_to_file(
                        self.job_execution_id,
                        "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "COMPLETED"})

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # the failure may have happened before the result handler was
            # created; never fail inside the handler
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # same guards as above: without them an early failure would
            # be masked by an AttributeError raised on None
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)
class WFManager(object):
    """Runs the playbooks of a job template and reports the job result."""

    def __init__(self, logger, vnc_api, job_input, job_log_utils, zk_client):
        """Initialize the workflow manager and install the abort handlers.

        :param logger: logger used for all messages of this manager
        :param vnc_api: API handle passed on to ``JobUtils`` and
            ``JobManager``
        :param job_input: dict with the job input (see
            ``parse_job_input`` for the keys that are read)
        :param job_log_utils: helper used to send job logs
        :param zk_client: client object passed on to ``JobManager``
        :raises Exception: when the job input lacks the template id or
            the execution id
        """
        self._logger = logger
        self._vnc_api = vnc_api
        self.job_input = job_input
        self.job_log_utils = job_log_utils
        self.job_execution_id = None
        self.job_template_id = None
        self.device_json = None
        self.result_handler = None
        self.job_data = None
        self.fabric_fq_name = None
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self._vnc_api)
        self._zk_client = zk_client
        self.job_mgr = None
        self.job_template = None
        self.abort_flag = False
        # SIGABRT forces an abort, SIGUSR1 requests a graceful one
        signal.signal(signal.SIGABRT, self.job_mgr_abort_signal_handler)
        signal.signal(signal.SIGUSR1, self.job_mgr_abort_signal_handler)
        logger.debug("Job manager initialized")

    def parse_job_input(self, job_input_json):
        """Copy the known fields of the job input onto this instance.

        :param job_input_json: dict with the job input
        :raises Exception: when ``job_template_id`` or
            ``job_execution_id`` is missing
        """
        # job input should have job_template_id and execution_id field
        if job_input_json.get('job_template_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_TEMPLATE_MISSING)
            raise Exception(msg)

        if job_input_json.get('job_execution_id') is None:
            msg = MsgBundle.getMessage(
                MsgBundle.JOB_EXECUTION_ID_MISSING)
            raise Exception(msg)

        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')

    def _validate_job_input(self, input_schema, ip_json):
        """Validate the job input against the template's input schema.

        :param input_schema: JSON schema, either as a string or already
            decoded
        :param ip_json: job input to validate
        :raises JobException: when the input is missing or does not
            validate against the schema
        """
        if ip_json is None:
            msg = MsgBundle.getMessage(
                MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg, self.job_execution_id)

        # 'basestring' only exists on Python 2; looking it up this way
        # keeps the string check working on both Python 2 and 3
        try:
            text_types = basestring
        except NameError:
            text_types = str

        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, text_types):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            self._logger.debug("Input Schema Validation Successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def start_job(self):
        """Run every playbook of the job template and report the result.

        Creates the result handler, reads the job template, sends the
        STARTING job log and validates the job input against the
        template's input schema (when the template defines one).  The
        playbooks of the template are then executed in order through
        ``JobManager``; a playbook is re-run while the result handler
        reports retry devices, the job has not failed and no abort was
        requested.

        The workflow stops early when a playbook output carries
        ``early_exit``, when a playbook failed (see the inline comments
        for the multi device rules) or when ``self.abort_flag`` is set.

        The Sandesh connection is always closed before returning.  If the
        job failed, the process is terminated with
        ``sys.exit(<message>)``.
        """
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            job_template = self.job_utils.read_job_template()
            self.job_template = job_template

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()
            playbook_count = len(playbook_list)

            job_percent = None
            # the weightage of each playbook only matters when several
            # playbooks are chained
            if playbook_count > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list]

            for i in range(playbook_count):

                # check if its a multi device playbook
                playbooks = job_template.get_job_template_playbooks()
                play_info = playbooks.playbook_info[i]
                multi_device_playbook = play_info.multi_device_playbook

                if playbook_count > 1:
                    # get the job percentage based on weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            playbook_count, buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    # using equal weightage
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            playbook_count, buffer_task_percent=True,
                            total_percent=100)[0]

                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input, self.job_log_utils,
                                         job_template,
                                         self.result_handler,
                                         self.job_utils, i,
                                         job_percent, self._zk_client)
                    # keep a reference so that the abort signal handler
                    # can reach the running job manager
                    self.job_mgr = job_mgr
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is
                    # added to the playbook output
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    if job_status == JobStatus.FAILURE \
                            or not retry_devices or self.abort_flag:
                        break
                    self.job_input['device_json'] = retry_devices

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                if pb_output.get('early_exit'):
                    break

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == \
                        JobStatus.FAILURE:
                    # stop workflow only if its a single device job or
                    # it is a multi device playbook
                    # and all the devices have failed some job execution
                    # declare it as failure and the stop the workflow
                    if not multi_device_playbook or \
                            len(self.result_handler.failed_device_jobs) == \
                            len(self.job_input.get('device_json')):
                        self._logger.error(
                            "Stop the workflow on the failed Playbook.")
                        break

                    elif not retry_devices:
                        # it is a multi device playbook but one of
                        # the device jobs have failed. This means we
                        # should still declare the operation as success.
                        # We declare workflow as success even if one of
                        # the devices has succeeded the job
                        self.result_handler.job_result_status = \
                            JobStatus.SUCCESS

                if self.abort_flag:
                    err_msg = "ABORTING NOW..."
                    self._logger.info(err_msg)
                    self.result_handler.update_job_status(
                        JobStatus.FAILURE, err_msg)
                    break

                # read the device_data output of the playbook
                # and update the job input so that it can be used in next
                # iteration
                if not multi_device_playbook:
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(
                job_template.fq_name)

            # in case of failures, exit the job manager process with
            # failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # the failure may have happened before the result handler or
            # the job template became available; never fail inside the
            # handler
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # same guards as above: without them an early failure would
            # be masked by an AttributeError raised on None
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)

    def job_mgr_abort_signal_handler(self, signalnum, frame):
        """Handle the abort signals installed by ``__init__``.

        SIGABRT aborts the running playbook, records the job as failed
        and exits the process.  SIGUSR1 only sets ``self.abort_flag`` so
        that ``start_job`` stops after the current playbook.

        :param signalnum: number of the received signal
        :param frame: current stack frame (unused)
        """
        if signalnum == signal.SIGABRT:
            # Force abort; kill all playbooks, then exit
            err_msg = "Job aborting..."
            self._logger.info(err_msg)
            try:
                self.job_mgr.job_handler.playbook_abort()
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                self.result_handler.create_job_summary_log(
                    self.job_template.fq_name)
                # SystemExit is not an Exception subclass, so it is not
                # caught by the handler below
                sys.exit()
            except Exception:
                self._logger.error("Failed to force abort")
                # keep the reason of the failure instead of dropping it
                self._logger.error("%s" % traceback.format_exc())
        elif signalnum == signal.SIGUSR1:
            # Graceful abort; Exit after current playbook
            self._logger.info("Job will abort upon playbook completion...")
            self.abort_flag = True
def start_job(self):
    """Run every playbook of the job template and report the job result.

    Creates the result handler, reads the job template, sends the
    STARTING job log and validates the job input against the template's
    input schema (when the template defines one).  The playbooks of the
    template are then executed in order through ``JobManager``; the
    workflow stops at the first failed playbook.  The output of each
    playbook is merged into the job input for the next one.

    The Sandesh connection is always closed before returning.  If the
    job failed, the process is terminated with ``sys.exit(<message>)``.
    """
    job_error_msg = None
    job_template = None
    try:
        # create job UVE and log
        msg = MsgBundle.getMessage(MsgBundle.START_JOB_MESSAGE,
                                   job_execution_id=self.job_execution_id)
        self._logger.debug(msg)

        self.result_handler = JobResultHandler(self.job_template_id,
                                               self.job_execution_id,
                                               self.fabric_fq_name,
                                               self._logger,
                                               self.job_utils,
                                               self.job_log_utils)

        job_template = self.job_utils.read_job_template()

        timestamp = int(round(time.time() * 1000))
        self.job_log_utils.send_job_log(job_template.fq_name,
                                        self.job_execution_id,
                                        self.fabric_fq_name,
                                        msg,
                                        JobStatus.STARTING.value,
                                        timestamp=timestamp)

        # validate job input if required by job_template input_schema
        input_schema = job_template.get_job_template_input_schema()
        if input_schema:
            self._validate_job_input(input_schema, self.job_data)

        playbook_list = job_template.get_job_template_playbooks()\
            .get_playbook_info()
        playbook_count = len(playbook_list)

        job_percent = None
        # the weightage of each playbook only matters when several
        # playbooks are chained
        if playbook_count > 1:
            task_weightage_array = [
                pb_info.job_completion_weightage
                for pb_info in playbook_list]

        for i in range(playbook_count):

            if playbook_count > 1:
                # get the job percentage based on weightage of each
                # playbook when they are chained
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        playbook_count, buffer_task_percent=True,
                        total_percent=100, task_seq_number=i + 1,
                        task_weightage_array=task_weightage_array)[0]
            else:
                # using equal weightage
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        playbook_count, buffer_task_percent=True,
                        total_percent=100)[0]

            job_mgr = JobManager(self._logger, self._vnc_api,
                                 self.job_input, self.job_log_utils,
                                 job_template,
                                 self.result_handler, self.job_utils, i,
                                 job_percent)
            job_mgr.start_job()

            # stop the workflow if playbook failed
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                self._logger.error(
                    "Stop the workflow on the failed Playbook.")
                break

            # update the job input with marked playbook output json
            pb_output = self.result_handler.playbook_output or {}

            # read the device_data output of the playbook
            # and update the job input so that it can be used in next
            # iteration
            if not self.job_input.get('device_json'):
                device_json = pb_output.get('device_json')
                self.job_input['device_json'] = device_json

            if not self.job_input.get('prev_pb_output'):
                self.job_input['prev_pb_output'] = pb_output
            else:
                self.job_input['prev_pb_output'].update(pb_output)

            self.job_input.get('input', {}).update(pb_output)

        # create job completion log and update job UVE
        self.result_handler.create_job_summary_log(job_template.fq_name)

        # in case of failures, exit the job manager process with failure
        if self.result_handler.job_result_status == JobStatus.FAILURE:
            job_error_msg = self.result_handler.job_summary_message

    except JobException as exp:
        err_msg = "Job Exception recieved: %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # the failure may have happened before the result handler or the
        # job template became available; never fail inside the handler
        if self.result_handler is not None:
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    except Exception as exp:
        err_msg = "Error while executing job %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # same guards as above: without them an early failure would be
        # masked by an AttributeError raised on None
        if self.result_handler is not None:
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    finally:
        # need to wait for the last job log and uve update to complete
        # via sandesh and then close sandesh connection
        sandesh_util = SandeshUtils(self._logger)
        sandesh_util.close_sandesh_connection()
        self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
def start_job(self):
    """Run every executable of the job template and record the result.

    Sends the STARTING job log, validates the job input against the
    template's input schema (when one is defined) and starts each
    executable of the template as a child process with the job arguments
    passed as JSON through ``--job-input``.  The job status
    ("INPROGRESS", "FAILED" or "COMPLETED") is written to the
    ``job_summary`` file.  An executable that runs longer than
    ``self.executable_timeout`` seconds is killed and reported as a
    ``JobException``.

    The Sandesh connection is always closed before returning.  If an
    exception was raised, the process is terminated with
    ``sys.exit(<message>)``.
    """
    self._logger.info("Starting Executable")
    job_error_msg = None
    job_template = self.job_template
    try:
        # create job UVE and log
        self.result_handler = JobResultHandler(self.job_template_id,
                                               self.job_execution_id,
                                               self.fabric_fq_name,
                                               self._logger,
                                               self.job_utils,
                                               self.job_log_utils)

        msg = MsgBundle.getMessage(
            MsgBundle.START_JOB_MESSAGE,
            job_execution_id=self.job_execution_id,
            job_template_name=job_template.fq_name[-1])
        self._logger.debug(msg)

        timestamp = int(round(time.time() * 1000))
        self.job_log_utils.send_job_log(job_template.fq_name,
                                        self.job_execution_id,
                                        self.fabric_fq_name,
                                        msg,
                                        JobStatus.STARTING.value,
                                        timestamp=timestamp)

        # validate job input if required by job_template input_schema
        input_schema = job_template.get_job_template_input_schema()
        if input_schema:
            self._validate_job_input(input_schema, self.job_data)

        executable_list = job_template.get_job_template_executables()\
            .get_executable_info()
        for executable in executable_list:
            exec_path = executable.get_executable_path()
            job_input_args = self.gather_job_args()
            try:
                exec_process = subprocess32.Popen(
                    [exec_path,
                     "--job-input", json.dumps(job_input_args),
                     '--debug', 'True'],
                    close_fds=True, cwd='/',
                    stdout=subprocess32.PIPE,
                    stderr=subprocess32.PIPE)
                self.job_file_write.write_to_file(
                    self.job_execution_id,
                    "job_summary",
                    JobFileWrite.JOB_LOG,
                    {"job_status": "INPROGRESS"})
                msg = "Child process pid = " + str(exec_process.pid)
                self._logger.info(msg)
                (out, err) = exec_process.communicate(
                    timeout=self.executable_timeout)

                self._logger.notice(str(out))
                self._logger.notice(str(err))
            except subprocess32.TimeoutExpired as timeout_exp:
                # only communicate() can time out, so the child process
                # exists at this point; kill it (signal 9)
                os.kill(exec_process.pid, 9)
                msg = MsgBundle.getMessage(
                    MsgBundle.RUN_EXECUTABLE_PROCESS_TIMEOUT,
                    exec_path=exec_path,
                    exc_msg=repr(timeout_exp))
                raise JobException(msg, self.job_execution_id)

            self._logger.info(exec_process.returncode)
            self._logger.info("Executable Completed")
            if exec_process.returncode != 0:
                self.job_file_write.write_to_file(
                    self.job_execution_id,
                    "job_summary",
                    JobFileWrite.JOB_LOG,
                    {"job_status": "FAILED"})
                msg = MsgBundle.getMessage(
                    MsgBundle.EXECUTABLE_RETURN_WITH_ERROR,
                    exec_uri=exec_path)
                self._logger.error(msg)
            else:
                self.job_file_write.write_to_file(
                    self.job_execution_id,
                    "job_summary",
                    JobFileWrite.JOB_LOG,
                    {"job_status": "COMPLETED"})

    except JobException as exp:
        err_msg = "Job Exception recieved: %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # the failure may have happened before the result handler was
        # created; never fail inside the handler
        if self.result_handler is not None:
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    except Exception as exp:
        err_msg = "Error while executing job %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # same guards as above: without them an early failure would be
        # masked by an AttributeError raised on None
        if self.result_handler is not None:
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    finally:
        # need to wait for the last job log and uve update to complete
        # via sandesh and then close sandesh connection
        sandesh_util = SandeshUtils(self._logger)
        sandesh_util.close_sandesh_connection()
        self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
class WFManager(object):
    """Runs the playbooks of a job template and reports the job result."""

    def __init__(self, logger, vnc_api, job_input, job_log_utils):
        """Initialize the workflow manager from the job input.

        :param logger: logger used for all messages of this manager
        :param vnc_api: API handle passed on to ``JobUtils`` and
            ``JobManager``
        :param job_input: dict with the job input (see
            ``parse_job_input`` for the keys that are read)
        :param job_log_utils: helper used to send job logs
        :raises Exception: when the job input lacks the template id or
            the execution id
        """
        self._logger = logger
        self._vnc_api = vnc_api
        self.job_input = job_input
        self.job_log_utils = job_log_utils
        self.job_execution_id = None
        self.job_template_id = None
        self.device_json = None
        self.result_handler = None
        self.job_data = None
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self._vnc_api)
        logger.debug("Job manager initialized")

    def parse_job_input(self, job_input_json):
        """Copy the known fields of the job input onto this instance.

        :param job_input_json: dict with the job input
        :raises Exception: when ``job_template_id`` or
            ``job_execution_id`` is missing
        """
        # job input should have job_template_id and execution_id field
        if job_input_json.get('job_template_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_TEMPLATE_MISSING)
            raise Exception(msg)

        if job_input_json.get('job_execution_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_EXECUTION_ID_MISSING)
            raise Exception(msg)

        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json['job_execution_id']
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')

    def _validate_job_input(self, input_schema, ip_json):
        """Validate the job input against the template's input schema.

        :param input_schema: JSON schema, either as a string or already
            decoded
        :param ip_json: job input to validate
        :raises JobException: when the input is missing or does not
            validate against the schema
        """
        if ip_json is None:
            msg = MsgBundle.getMessage(MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg, self.job_execution_id)

        # On Python 2 a schema may arrive as 'unicode', which a plain
        # 'str' check misses; 'basestring' covers both there and does not
        # exist on Python 3
        try:
            text_types = basestring
        except NameError:
            text_types = str

        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, text_types):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            self._logger.debug("Input Schema Validation Successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def start_job(self):
        """Run every playbook of the job template and report the result.

        Creates the result handler, reads the job template, sends the
        STARTING job log and validates the job input against the
        template's input schema (when the template defines one).  The
        playbooks of the template are then executed in order through
        ``JobManager``; the workflow stops at the first failed playbook.
        The output of each playbook is merged into the job input for the
        next one.

        The Sandesh connection is always closed before returning.  If the
        job failed, the process is terminated with
        ``sys.exit(<message>)``.
        """
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id)
            self._logger.debug(msg)

            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            job_template = self.job_utils.read_job_template()

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()
            playbook_count = len(playbook_list)

            job_percent = None
            # the weightage of each playbook only matters when several
            # playbooks are chained
            if playbook_count > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list]

            for i in range(playbook_count):

                if playbook_count > 1:
                    # get the job percentage based on weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            playbook_count, buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    # using equal weightage
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            playbook_count, buffer_task_percent=True,
                            total_percent=100)[0]

                job_mgr = JobManager(self._logger, self._vnc_api,
                                     self.job_input, self.job_log_utils,
                                     job_template,
                                     self.result_handler, self.job_utils, i,
                                     job_percent)
                job_mgr.start_job()

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == \
                        JobStatus.FAILURE:
                    self._logger.error(
                        "Stop the workflow on the failed Playbook.")
                    break

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook
                # and update the job input so that it can be used in next
                # iteration
                if not self.job_input.get('device_json'):
                    device_json = pb_output.get('device_json')
                    self.job_input['device_json'] = device_json

                if not self.job_input.get('prev_pb_output'):
                    self.job_input['prev_pb_output'] = pb_output
                else:
                    self.job_input['prev_pb_output'].update(pb_output)

                self.job_input.get('input', {}).update(pb_output)

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(job_template.fq_name)

            # in case of failures, exit the job manager process with
            # failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # the failure may have happened before the result handler or
            # the job template became available; never fail inside the
            # handler
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # same guards as above: without them an early failure would
            # be masked by an AttributeError raised on None
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)
def start_job(self):
    """Run every playbook of the job template in order and report the result.

    Reads the job template, sends the STARTING job log, validates the job
    input against the template's input schema (when one is defined) and
    then executes the playbooks one by one through ``JobManager``.  A
    playbook is re-run for as long as the result handler reports retry
    devices and the job has not failed.  Output of each playbook is merged
    into ``self.job_input`` so the next playbook can consume it.

    The Sandesh connection is always closed before returning.  When the
    job ended in failure (or an exception was caught) the process exits
    through ``sys.exit`` with the failure message.
    """
    job_error_msg = None
    job_template = None

    def _record_failure(err_msg):
        """Log the failure and, when possible, push it to the job UVE."""
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # The result handler / job template may not exist yet when the
        # failure happened very early; never let the error path itself
        # raise and hide the original problem.
        if self.result_handler is not None:
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)

    try:
        # create job UVE and log
        self.result_handler = JobResultHandler(self.job_template_id,
                                               self.job_execution_id,
                                               self.fabric_fq_name,
                                               self._logger,
                                               self.job_utils,
                                               self.job_log_utils)

        job_template = self.job_utils.read_job_template()

        msg = MsgBundle.getMessage(
            MsgBundle.START_JOB_MESSAGE,
            job_execution_id=self.job_execution_id,
            job_template_name=job_template.fq_name[-1])
        self._logger.debug(msg)

        timestamp = int(round(time.time() * 1000))
        self.job_log_utils.send_job_log(job_template.fq_name,
                                        self.job_execution_id,
                                        self.fabric_fq_name,
                                        msg,
                                        JobStatus.STARTING.value,
                                        timestamp=timestamp)

        # validate job input if required by job_template input_schema
        input_schema = job_template.get_job_template_input_schema()
        if input_schema:
            self._validate_job_input(input_schema, self.job_data)

        playbook_list = job_template.get_job_template_playbooks()\
            .get_playbook_info()
        num_playbooks = len(playbook_list)

        job_percent = None
        # per-playbook weightage is only needed when playbooks are chained
        if num_playbooks > 1:
            task_weightage_array = [
                pb_info.job_completion_weightage
                for pb_info in playbook_list]

        for i in range(num_playbooks):
            if num_playbooks > 1:
                # get the job percentage based on weightage of each
                # playbook when they are chained
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        num_playbooks, buffer_task_percent=True,
                        total_percent=100, task_seq_number=i + 1,
                        task_weightage_array=task_weightage_array)[0]
            else:
                # single playbook: using equal weightage
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        num_playbooks, buffer_task_percent=True,
                        total_percent=100)[0]

            retry_devices = None
            while True:
                job_mgr = JobManager(self._logger, self._vnc_api,
                                     self.job_input, self.job_log_utils,
                                     job_template,
                                     self.result_handler, self.job_utils,
                                     i, job_percent, self._zk_client,
                                     self.db_init_params, self.cluster_id)
                job_mgr.start_job()

                # retry the playbook execution if retry_devices is added
                # to the playbook output
                job_status = self.result_handler.job_result_status
                retry_devices = self.result_handler.get_retry_devices()
                if job_status == JobStatus.FAILURE or not retry_devices:
                    break
                self.job_input['device_json'] = retry_devices

            # stop the workflow if playbook failed
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                # stop workflow only if its a single device job or
                # it is a multi device playbook and all the devices have
                # failed some job execution; declare it as failure and
                # then stop the workflow
                device_json = self.job_input.get('device_json')
                if device_json is None or \
                        len(self.result_handler.failed_device_jobs) == \
                        len(device_json):
                    self._logger.error(
                        "Stop the workflow on the failed Playbook.")
                    break
                elif not retry_devices:
                    # it is a multi device playbook but one of the device
                    # jobs have failed. This means we should still
                    # declare the operation as success. We declare
                    # workflow as success even if one of the devices has
                    # succeeded the job
                    self.result_handler.job_result_status = \
                        JobStatus.SUCCESS

            # update the job input with marked playbook output json
            pb_output = self.result_handler.playbook_output or {}

            # read the device_data output of the playbook and update the
            # job input so that it can be used in next iteration
            if not self.job_input.get('device_json'):
                device_json = pb_output.pop('device_json', None)
                self.job_input['device_json'] = device_json

            self.job_input.get('input', {}).update(pb_output)

        # create job completion log and update job UVE
        self.result_handler.create_job_summary_log(job_template.fq_name)

        # in case of failures, exit the job manager process with failure
        if self.result_handler.job_result_status == JobStatus.FAILURE:
            job_error_msg = self.result_handler.job_summary_message

    except JobException as exp:
        err_msg = "Job Exception recieved: %s " % repr(exp)
        _record_failure(err_msg)
        job_error_msg = err_msg
    except Exception as exp:
        err_msg = "Error while executing job %s " % repr(exp)
        _record_failure(err_msg)
        job_error_msg = err_msg
    finally:
        # need to wait for the last job log and uve update to complete
        # via sandesh and then close sandesh connection
        sandesh_util = SandeshUtils(self._logger)
        sandesh_util.close_sandesh_connection()
        self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
def start_job(self):
    """Run the job template's playbooks, with recovery on failure.

    Reads the job template, builds the result handler, sends the STARTING
    job log and validates the job input against the template's input
    schema (when one is defined).  Playbooks are then executed in order
    through ``JobManager``; playbooks flagged ``recovery_playbook`` are
    skipped during the normal pass.

    When a playbook fails (single device playbook, or a multi device
    playbook where every device failed) the playbook index is reset to 0
    and only the recovery playbooks are run.  After such a cleanup pass
    the job status is forced back to FAILURE.

    The Sandesh connection is always closed before returning.  When the
    job ended in failure (or an exception was caught) the process exits
    through ``sys.exit`` with the failure message.
    """
    job_error_msg = None
    job_template = None

    def _record_failure(err_msg):
        """Log the failure and, when possible, push it to the job UVE."""
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # The result handler is created only after the job template has
        # been read, so it can still be None here; never let the error
        # path itself raise and hide the original problem.
        if self.result_handler is not None:
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)

    try:
        # create job UVE and log
        job_template = self.job_utils.read_job_template()
        self.job_template = job_template
        self.job_description = self.job_template.display_name

        if not self.job_transaction_descr:
            self.job_transaction_descr = self._generate_transaction_descr()

        self.result_handler = JobResultHandler(
            self.job_template_id,
            self.job_execution_id,
            self.fabric_fq_name,
            self._logger,
            self.job_utils,
            self.job_log_utils,
            self.device_name,
            self.job_description,
            self.job_transaction_id,
            self.job_transaction_descr)

        msg = MsgBundle.getMessage(
            MsgBundle.START_JOB_MESSAGE,
            job_execution_id=self.job_execution_id,
            job_template_name=job_template.fq_name[-1])
        self._logger.debug(msg)

        timestamp = int(round(time.time() * 1000))
        self.job_log_utils.send_job_log(
            job_template.fq_name,
            self.job_execution_id,
            self.fabric_fq_name,
            msg,
            JobStatus.STARTING.value,
            timestamp=timestamp,
            device_name=self.device_name,
            description=self.job_description,
            transaction_id=self.job_transaction_id,
            transaction_descr=self.job_transaction_descr)

        # validate job input if required by job_template input_schema
        input_schema = job_template.get_job_template_input_schema()
        if input_schema:
            self._validate_job_input(input_schema, self.job_data)

        playbook_list = job_template.get_job_template_playbooks()\
            .get_playbook_info()
        num_playbooks = len(playbook_list)

        job_percent = None
        # per-playbook weightage is only needed when playbooks are chained
        if num_playbooks > 1:
            task_weightage_array = [
                pb_info.job_completion_weightage
                for pb_info in playbook_list
            ]

        cleanup_in_progress = False
        cleanup_completed = False

        pb_idx = 0
        while pb_idx < num_playbooks:
            # check if its a multi device playbook
            playbooks = job_template.get_job_template_playbooks()
            play_info = playbooks.playbook_info[pb_idx]
            multi_device_playbook = play_info.multi_device_playbook
            playbook_name = play_info.playbook_uri.split('/')[-1]

            if cleanup_in_progress:
                # If we need to cleanup due to a previous error, ignore
                # any playbooks that don't perform recovery
                if not play_info.recovery_playbook:
                    self._logger.info("Ignoring playbook %s since it "
                                      "does not perform recovery" %
                                      playbook_name)
                    pb_idx += 1
                    continue
                # If we are running a recovery playbook, then
                # cleanup_completed needs to be set irrespective of
                # a success or error in recovery playbook execution
                self._logger.info("Running recovery playbook %s" %
                                  playbook_name)
                cleanup_completed = True
            elif play_info.recovery_playbook:
                # Don't run a recovery playbook if we haven't hit an error
                self._logger.info(
                    "Ignoring recovery playbook %s since we "
                    "haven't hit an error" % playbook_name)
                pb_idx += 1
                continue

            if num_playbooks > 1:
                # get the job percentage based on weightage of each
                # playbook when they are chained
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        num_playbooks, buffer_task_percent=True,
                        total_percent=100, task_seq_number=pb_idx + 1,
                        task_weightage_array=task_weightage_array)[0]
            else:
                # single playbook: using equal weightage
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        num_playbooks, buffer_task_percent=True,
                        total_percent=100)[0]

            retry_devices = None
            while True:
                job_mgr = JobManager(self._logger, self._vnc_api,
                                     self.job_input, self.job_log_utils,
                                     job_template,
                                     self.result_handler, self.job_utils,
                                     pb_idx, job_percent, self._zk_client,
                                     self.job_description,
                                     self.job_transaction_id,
                                     self.job_transaction_descr)
                # keep a reference so the abort signal handler can reach
                # the running playbook
                self.job_mgr = job_mgr
                job_mgr.start_job()

                # retry the playbook execution if retry_devices is added
                # to the playbook output
                job_status = self.result_handler.job_result_status
                retry_devices = self.result_handler.get_retry_devices()
                failed_device_list = self.result_handler\
                    .get_failed_device_list()
                if job_status == JobStatus.FAILURE or not retry_devices \
                        or self.abort_flag:
                    break
                self.job_input['device_json'] = retry_devices
                self.job_input['input']['failed_list'] = failed_device_list

            # update the job input with marked playbook output json
            pb_output = self.result_handler.playbook_output or {}

            if pb_output.get('early_exit'):
                break

            # stop the workflow if playbook failed
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                # If it is a single device job or
                # if it is a multi device playbook
                # and all the devices have failed some job execution,
                # declare it as failure, perform cleanup if possible
                # and then stop the workflow
                if not multi_device_playbook or \
                        len(self.result_handler.failed_device_jobs) == \
                        len(self.job_input.get('device_json')):
                    if not cleanup_in_progress:
                        # restart from the first playbook, this time
                        # running only the recovery playbooks
                        cleanup_in_progress = True
                        pb_idx = 0
                        self._logger.info("Stop the workflow on the failed"
                                          " Playbook and start cleanup")
                    else:
                        pb_idx += 1
                    continue
                elif not retry_devices:
                    # it is a multi device playbook but one of
                    # the device jobs have failed. This means we should
                    # still declare the operation as success. We declare
                    # workflow as success even if one of the devices has
                    # succeeded the job
                    self.result_handler.job_result_status = \
                        JobStatus.SUCCESS

            if self.abort_flag:
                err_msg = "ABORTING NOW..."
                self._logger.info(err_msg)
                self.result_handler.update_job_status(
                    JobStatus.FAILURE, err_msg)
                break

            # update the job input with marked playbook output json
            pb_output = self.result_handler.playbook_output or {}

            # read the device_data output of the playbook and update the
            # job input so that it can be used in next iteration
            if not multi_device_playbook:
                device_json = pb_output.pop('device_json', None)
                self.job_input['device_json'] = device_json

            self.job_input.get('input', {}).update(pb_output)

            pb_idx += 1

        # A successful recovery playbook execution might
        # set JobStatus to success but this does not indicate a
        # success in the workflow. Set JobStatus to failure again.
        if cleanup_completed:
            err_msg = "Finished cleaning up after the error"
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            cleanup_completed = False

        cleanup_in_progress = False

        # create job completion log and update job UVE
        self.result_handler.create_job_summary_log(job_template.fq_name)

        # in case of failures, exit the job manager process with failure
        if self.result_handler.job_result_status == JobStatus.FAILURE:
            job_error_msg = self.result_handler.job_summary_message

    except JobException as exp:
        err_msg = "Job Exception recieved: %s " % repr(exp)
        _record_failure(err_msg)
        job_error_msg = err_msg
    except Exception as exp:
        err_msg = "Error while executing job %s " % repr(exp)
        _record_failure(err_msg)
        job_error_msg = err_msg
    finally:
        # need to wait for the last job log and uve update to complete
        # via sandesh and then close sandesh connection
        sandesh_util = SandeshUtils(self._logger)
        sandesh_util.close_sandesh_connection()
        self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
class WFManager(object):
    """Execute the playbooks of a job template as one workflow.

    The manager parses the job input, runs each playbook through
    ``JobManager``, runs recovery playbooks after a failure and reports
    the outcome through the job result handler.  SIGABRT requests a forced
    abort, SIGUSR1 a graceful abort after the current playbook.
    """

    def __init__(self, logger, vnc_api, job_input, job_log_utils, zk_client):
        """Initialize workflow manager."""
        self._logger = logger
        self._vnc_api = vnc_api
        self.job_input = job_input
        self.job_log_utils = job_log_utils
        self.job_execution_id = None
        self.job_description = None
        self.job_transaction_id = None
        self.job_transaction_descr = None
        self.job_template_id = None
        self.device_json = None
        self.device_name = ""
        self.result_handler = None
        self.job_data = None
        self.fabric_fq_name = None
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self._vnc_api)
        self._zk_client = zk_client
        self.job_mgr = None
        self.job_template = None
        self.abort_flag = False
        signal.signal(signal.SIGABRT, self.job_mgr_abort_signal_handler)
        signal.signal(signal.SIGUSR1, self.job_mgr_abort_signal_handler)
        logger.debug("Job manager initialized")

    def parse_job_input(self, job_input_json):
        """Copy the fields of the job input dict onto this instance.

        :param job_input_json: dict with the job input
        :raises Exception: if 'job_template_id' or 'job_execution_id' is
            missing (or None)
        """
        # job input should have job_template_id and execution_id field
        if job_input_json.get('job_template_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_TEMPLATE_MISSING)
            raise Exception(msg)

        if job_input_json.get('job_execution_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_EXECUTION_ID_MISSING)
            raise Exception(msg)

        self.device_json = job_input_json.get('device_json')
        self.job_description = job_input_json.get('job_description', "")
        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        # the transaction id falls back to the execution id
        self.job_transaction_id = \
            job_input_json.get('job_transaction_id', self.job_execution_id)
        self.job_transaction_descr = \
            job_input_json.get('job_transaction_descr')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')
        self.device_name = self._get_device_name()

    def _validate_job_input(self, input_schema, ip_json):
        """Validate the job input against the template's input schema.

        :param input_schema: JSON schema, either a dict or a JSON string
        :param ip_json: the job input to validate
        :raises JobException: if there is no input or validation fails
        """
        if ip_json is None:
            msg = MsgBundle.getMessage(MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg, self.job_execution_id)
        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, basestring):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            self._logger.debug("Input Schema Validation Successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def _generate_transaction_descr(self):
        """Build a transaction description from the template and devices.

        The description is the template's display name followed, when
        device_json is set, by " for " and the name (last element of
        'device_fqname') of every device that carries one.  When there is
        exactly one device and it has a name, self.device_name is updated
        as well.

        :return: the transaction description string
        """
        transaction_descr = self.job_template.display_name
        if self.device_json:
            transaction_descr += " for "
            device_uuid_list = list(self.device_json.keys())
            single_device = len(device_uuid_list) == 1
            for device_uuid in device_uuid_list:
                device_info = self.device_json[device_uuid]
                device_fqname = device_info.get('device_fqname')
                if device_fqname:
                    device_name = device_fqname[-1]
                    transaction_descr += device_name + " "
                    # only touch self.device_name when a name was really
                    # found, so a device without fqname cannot trigger an
                    # unbound local here
                    if single_device:
                        self.device_name = device_name
        return transaction_descr

    def _get_device_name(self):
        """Return the device name for a single-device job, else ""."""
        if self.device_json:
            device_uuid_list = list(self.device_json.keys())
            if len(device_uuid_list) == 1:
                device_info = self.device_json[device_uuid_list[0]]
                device_fqname = device_info.get('device_fqname')
                if device_fqname:
                    return device_fqname[-1]
        return ""

    def _record_job_failure(self, err_msg, job_template):
        """Log a failure and, when possible, push it to the job UVE.

        The result handler is created only after the job template has been
        read, so either of them can still be missing when the failure
        happened early; this must never raise and hide the original error.
        """
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        if self.result_handler is not None:
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)

    def start_job(self):
        """Run the job template's playbooks, with recovery on failure.

        Playbooks are executed in order; playbooks flagged
        ``recovery_playbook`` are skipped during the normal pass.  When a
        playbook fails (single device playbook, or a multi device playbook
        where every device failed) the playbook index is reset to 0 and
        only the recovery playbooks are run; afterwards the job status is
        forced back to FAILURE.

        The Sandesh connection is always closed before returning.  When
        the job ended in failure (or an exception was caught) the process
        exits through ``sys.exit`` with the failure message.
        """
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            job_template = self.job_utils.read_job_template()
            self.job_template = job_template
            self.job_description = self.job_template.display_name

            if not self.job_transaction_descr:
                self.job_transaction_descr = \
                    self._generate_transaction_descr()

            self.result_handler = JobResultHandler(
                self.job_template_id,
                self.job_execution_id,
                self.fabric_fq_name,
                self._logger,
                self.job_utils,
                self.job_log_utils,
                self.device_name,
                self.job_description,
                self.job_transaction_id,
                self.job_transaction_descr)

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(
                job_template.fq_name,
                self.job_execution_id,
                self.fabric_fq_name,
                msg,
                JobStatus.STARTING.value,
                timestamp=timestamp,
                device_name=self.device_name,
                description=self.job_description,
                transaction_id=self.job_transaction_id,
                transaction_descr=self.job_transaction_descr)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()
            num_playbooks = len(playbook_list)

            job_percent = None
            # per-playbook weightage is only needed for chained playbooks
            if num_playbooks > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list
                ]

            cleanup_in_progress = False
            cleanup_completed = False

            pb_idx = 0
            while pb_idx < num_playbooks:
                # check if its a multi device playbook
                playbooks = job_template.get_job_template_playbooks()
                play_info = playbooks.playbook_info[pb_idx]
                multi_device_playbook = play_info.multi_device_playbook
                playbook_name = play_info.playbook_uri.split('/')[-1]

                if cleanup_in_progress:
                    # If we need to cleanup due to a previous error,
                    # ignore any playbooks that don't perform recovery
                    if not play_info.recovery_playbook:
                        self._logger.info("Ignoring playbook %s since it "
                                          "does not perform recovery" %
                                          playbook_name)
                        pb_idx += 1
                        continue
                    # If we are running a recovery playbook, then
                    # cleanup_completed needs to be set irrespective of
                    # a success or error in recovery playbook execution
                    self._logger.info("Running recovery playbook %s" %
                                      playbook_name)
                    cleanup_completed = True
                elif play_info.recovery_playbook:
                    # Don't run a recovery playbook if we haven't hit an
                    # error
                    self._logger.info(
                        "Ignoring recovery playbook %s since we "
                        "haven't hit an error" % playbook_name)
                    pb_idx += 1
                    continue

                if num_playbooks > 1:
                    # get the job percentage based on weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            num_playbooks, buffer_task_percent=True,
                            total_percent=100, task_seq_number=pb_idx + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    # single playbook: using equal weightage
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            num_playbooks, buffer_task_percent=True,
                            total_percent=100)[0]

                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input,
                                         self.job_log_utils,
                                         job_template,
                                         self.result_handler,
                                         self.job_utils,
                                         pb_idx, job_percent,
                                         self._zk_client,
                                         self.job_description,
                                         self.job_transaction_id,
                                         self.job_transaction_descr)
                    # keep a reference so the abort signal handler can
                    # reach the running playbook
                    self.job_mgr = job_mgr
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is
                    # added to the playbook output
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    failed_device_list = self.result_handler\
                        .get_failed_device_list()
                    if job_status == JobStatus.FAILURE \
                            or not retry_devices or self.abort_flag:
                        break
                    self.job_input['device_json'] = retry_devices
                    self.job_input['input']['failed_list'] = \
                        failed_device_list

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                if pb_output.get('early_exit'):
                    break

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == \
                        JobStatus.FAILURE:
                    # If it is a single device job or
                    # if it is a multi device playbook
                    # and all the devices have failed some job execution,
                    # declare it as failure, perform cleanup if possible
                    # and then stop the workflow
                    if not multi_device_playbook or \
                            len(self.result_handler.failed_device_jobs) == \
                            len(self.job_input.get('device_json')):
                        if not cleanup_in_progress:
                            # restart from the first playbook, this time
                            # running only the recovery playbooks
                            cleanup_in_progress = True
                            pb_idx = 0
                            self._logger.info(
                                "Stop the workflow on the failed"
                                " Playbook and start cleanup")
                        else:
                            pb_idx += 1
                        continue
                    elif not retry_devices:
                        # it is a multi device playbook but one of
                        # the device jobs have failed. This means we
                        # should still declare the operation as success.
                        # We declare workflow as success even if one of
                        # the devices has succeeded the job
                        self.result_handler.job_result_status = \
                            JobStatus.SUCCESS

                if self.abort_flag:
                    err_msg = "ABORTING NOW..."
                    self._logger.info(err_msg)
                    self.result_handler.update_job_status(
                        JobStatus.FAILURE, err_msg)
                    break

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook and update
                # the job input so that it can be used in next iteration
                if not multi_device_playbook:
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

                pb_idx += 1

            # A successful recovery playbook execution might
            # set JobStatus to success but this does not indicate a
            # success in the workflow. Set JobStatus to failure again.
            if cleanup_completed:
                err_msg = "Finished cleaning up after the error"
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                cleanup_completed = False

            cleanup_in_progress = False

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(job_template.fq_name)

            # in case of failures, exit the job manager process with
            # failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._record_job_failure(err_msg, job_template)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._record_job_failure(err_msg, job_template)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)

    def job_mgr_abort_signal_handler(self, signalnum, frame):
        """Handle abort signals sent to the job manager process.

        SIGABRT aborts the running playbook, marks the job as failed and
        exits the process.  SIGUSR1 only sets ``abort_flag`` so that
        ``start_job`` stops after the current playbook.
        """
        if signalnum == signal.SIGABRT:
            # Force abort; kill all playbooks, then exit
            err_msg = "Job aborting..."
            self._logger.info(err_msg)
            try:
                self.job_mgr.job_handler.playbook_abort()
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                self.result_handler.create_job_summary_log(
                    self.job_template.fq_name)
                # SystemExit is not an Exception subclass, so it is not
                # swallowed by the handler below
                sys.exit()
            except Exception:
                self._logger.error("Failed to force abort")
        elif signalnum == signal.SIGUSR1:
            # Graceful abort; Exit after current playbook
            self._logger.info("Job will abort upon playbook completion...")
            self.abort_flag = True
class WFManager(object):
    """Execute the playbooks of a job template as one workflow.

    The manager parses the job input, runs each playbook through
    ``JobManager`` and reports the outcome through the job result handler.
    """

    def __init__(self, logger, vnc_api, job_input, job_log_utils, zk_client):
        """Initialize workflow manager."""
        self._logger = logger
        self._vnc_api = vnc_api
        self.job_input = job_input
        self.job_log_utils = job_log_utils
        self.job_execution_id = None
        self.job_template_id = None
        self.device_json = None
        self.result_handler = None
        self.job_data = None
        self.fabric_fq_name = None
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self._vnc_api)
        self._zk_client = zk_client
        logger.debug("Job manager initialized")

    def parse_job_input(self, job_input_json):
        """Copy the fields of the job input dict onto this instance.

        :param job_input_json: dict with the job input
        :raises Exception: if 'job_template_id' or 'job_execution_id' is
            missing (or None)
        """
        # job input should have job_template_id and execution_id field
        if job_input_json.get('job_template_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_TEMPLATE_MISSING)
            raise Exception(msg)

        if job_input_json.get('job_execution_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_EXECUTION_ID_MISSING)
            raise Exception(msg)

        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')

    def _validate_job_input(self, input_schema, ip_json):
        """Validate the job input against the template's input schema.

        :param input_schema: JSON schema, either a dict or a JSON string
        :param ip_json: the job input to validate
        :raises JobException: if there is no input or validation fails
        """
        if ip_json is None:
            msg = MsgBundle.getMessage(MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg, self.job_execution_id)
        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, basestring):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            self._logger.debug("Input Schema Validation Successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def _record_job_failure(self, err_msg, job_template):
        """Log a failure and, when possible, push it to the job UVE.

        The result handler or the job template can still be missing when
        the failure happened early; this must never raise and hide the
        original error.
        """
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        if self.result_handler is not None:
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)

    def start_job(self):
        """Run every playbook of the job template in order.

        Reads the job template, sends the STARTING job log, validates the
        job input against the template's input schema (when one is
        defined) and executes the playbooks one by one through
        ``JobManager``.  A playbook is re-run for as long as the result
        handler reports retry devices and the job has not failed.  The
        workflow stops early when a playbook output carries 'early_exit'
        or when a playbook failed for a single device playbook or for all
        devices of a multi device playbook.

        The Sandesh connection is always closed before returning.  When
        the job ended in failure (or an exception was caught) the process
        exits through ``sys.exit`` with the failure message.
        """
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            job_template = self.job_utils.read_job_template()

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()
            num_playbooks = len(playbook_list)

            job_percent = None
            # per-playbook weightage is only needed for chained playbooks
            if num_playbooks > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list]

            for i in range(num_playbooks):
                # check if its a multi device playbook
                playbooks = job_template.get_job_template_playbooks()
                play_info = playbooks.playbook_info[i]
                multi_device_playbook = play_info.multi_device_playbook

                if num_playbooks > 1:
                    # get the job percentage based on weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            num_playbooks, buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    # single playbook: using equal weightage
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            num_playbooks, buffer_task_percent=True,
                            total_percent=100)[0]

                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input,
                                         self.job_log_utils,
                                         job_template,
                                         self.result_handler,
                                         self.job_utils, i,
                                         job_percent, self._zk_client)
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is
                    # added to the playbook output
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    if job_status == JobStatus.FAILURE \
                            or not retry_devices:
                        break
                    self.job_input['device_json'] = retry_devices

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                if pb_output.get('early_exit'):
                    break

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == \
                        JobStatus.FAILURE:
                    # stop workflow only if its a single device job or
                    # it is a multi device playbook
                    # and all the devices have failed some job execution
                    # declare it as failure and the stop the workflow
                    if not multi_device_playbook or \
                            len(self.result_handler.failed_device_jobs) == \
                            len(self.job_input.get('device_json')):
                        self._logger.error(
                            "Stop the workflow on the failed Playbook.")
                        break
                    elif not retry_devices:
                        # it is a multi device playbook but one of the
                        # device jobs have failed. This means we should
                        # still declare the operation as success. We
                        # declare workflow as success even if one of the
                        # devices has succeeded the job
                        self.result_handler.job_result_status = \
                            JobStatus.SUCCESS

                # read the device_data output of the playbook and update
                # the job input so that it can be used in next iteration
                if not multi_device_playbook:
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(job_template.fq_name)

            # in case of failures, exit the job manager process with
            # failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._record_job_failure(err_msg, job_template)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._record_job_failure(err_msg, job_template)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)