コード例 #1
0
class ObjectLogUtil(object):
    def __init__(self, job_ctx):
        self.job_ctx = job_ctx
        self.results = dict()
        self.results['failed'] = False
        logging.basicConfig(level=logging.INFO)
        self.validate_job_ctx()

        self.job_log_util = JobLogUtils(sandesh_instance_id=str(uuid.uuid4()),
                                        config_args=json.dumps(
                                            job_ctx['config_args']))

    def validate_job_ctx(self):
        required_job_ctx_keys = [
            'job_template_fqname', 'job_execution_id', 'config_args',
            'job_input'
        ]
        for key in required_job_ctx_keys:
            if key not in self.job_ctx or self.job_ctx.get(key) is None:
                raise ValueError("Missing job context param: %s" % key)

    def send_prouter_object_log(self, prouter_fqname, onboarding_state,
                                os_version, serial_num):
        self.job_log_util.send_prouter_object_log(
            prouter_fqname, self.job_ctx['job_execution_id'],
            json.dumps(self.job_ctx['job_input']),
            self.job_ctx['job_template_fqname'], onboarding_state, os_version,
            serial_num)

    def send_job_object_log(self, message, status, job_result):
        self.job_log_util.send_job_log(self.job_ctx['job_template_fqname'],
                                       self.job_ctx['job_execution_id'],
                                       message, status, job_result)

    def close_sandesh_conn(self):
        try:
            sandesh_util = SandeshUtils(self.job_log_util.get_config_logger())
            sandesh_util.close_sandesh_connection()
        except Exception as e:
            logging.error("Unable to close sandesh connection: %s", str(e))
コード例 #2
0
def send_job_object_log(job_ctx,
                        message,
                        status,
                        result):
    results = {}
    results['failed'] = False

    results = validate_job_ctx(job_ctx, results)
    if results['failed']:
        return results

    try:
        job_log_util = JobLogUtils(
            sandesh_instance_id=str(
                uuid.uuid4()), config_args=json.dumps(
                job_ctx['config_args']))
        job_log_util.send_job_log(
            job_ctx['job_template_fqname'],
            job_ctx['job_execution_id'],
            message,
            status,
            result)
        time.sleep(10)
    except Exception as ex:
        msg = "Failed to create following job log due to error: %s\n\t \
              job name: %s\n\t \
              job execution id: %s\n\t \
              job status: %s\n\t, \
              log message: %s\n" \
              % (str(ex), job_ctx['job_template_fqname'],
                 job_ctx['job_execution_id'], status, message)

        results['msg'] = msg
        results['failed'] = True

    return results
コード例 #3
0
        sys.exit("Exiting due to logger initialization error: %s" % repr(exp))

    # initialize _vnc_api instance
    vnc_api = None
    try:
        auth_token = job_input_json['auth_token']
        vnc_api = VncApi(auth_token=auth_token)
        logger.info("VNC api is initialized using the auth token passed.")
    except Exception as exp:
        logger.error(
            MsgBundle.getMessage(MsgBundle.VNC_INITIALIZATION_ERROR,
                                 exc_msg=traceback.format_exc()))
        msg = MsgBundle.getMessage(MsgBundle.VNC_INITIALIZATION_ERROR,
                                   exc_msg=repr(exp))
        job_log_utils.send_job_log(job_input_json['job_template_fq_name'],
                                   job_input_json['job_execution_id'],
                                   job_input_json.get('fabric_fq_name'), msg,
                                   JobStatus.FAILURE)
        sys.exit(msg)

    # invoke job manager
    try:
        workflow_manager = WFManager(logger, vnc_api, job_input_json,
                                     job_log_utils)
        logger.info("Job Manager is initialized. Starting job.")
        workflow_manager.start_job()
    except Exception as exp:
        logger.error(
            MsgBundle.getMessage(MsgBundle.JOB_ERROR,
                                 exc_msg=traceback.format_exc()))
        msg = MsgBundle.getMessage(MsgBundle.JOB_ERROR, exc_msg=repr(exp))
        job_log_utils.send_job_log(job_input_json['job_template_fq_name'],
コード例 #4
0
class DeviceJobManager(object):

    JOB_REQUEST_EXCHANGE = "job_request_exchange"
    JOB_REQUEST_CONSUMER = "job_request_consumer"
    JOB_REQUEST_ROUTING_KEY = "job.request"
    JOB_ABORT_CONSUMER = "job_abort_consumer"
    JOB_ABORT_ROUTING_KEY = "job.abort"
    JOB_STATUS_EXCHANGE = "job_status_exchange"
    JOB_STATUS_CONSUMER = "job_status_consumer."
    JOB_STATUS_ROUTING_KEY = "job.status."
    JOB_STATUS_TTL = 5 * 60
    FABRIC_ZK_LOCK = "fabric-job-monitor"

    _instance = None

    def __init__(self, amqp_client, zookeeper_client, args, dm_logger):
        """Initialize ZooKeeper, RabbitMQ, Sandesh, DB conn etc."""
        DeviceJobManager._instance = self
        self._amqp_client = amqp_client
        self._zookeeper_client = zookeeper_client
        self._args = args
        self._job_mgr_statistics = {
            'max_job_count': self._args.max_job_count,
            'running_job_count': 0
        }
        # dict of exec_id:job_status (key/value pairs)
        self.job_status = {}
        # map of running job instances. Key is the pid and value is job
        # instance info
        self._job_mgr_running_instances = {}

        job_args = {
            'collectors': self._args.collectors,
            'fabric_ansible_conf_file': self._args.fabric_ansible_conf_file,
            'host_ip': self._args.host_ip,
            'zk_server_ip': self._args.zk_server_ip,
            'cluster_id': self._args.cluster_id
        }
        self._job_args = json.dumps(job_args)

        # initialize the job logger
        self._job_log_utils = JobLogUtils(
            sandesh_instance_id="DeviceJobManager" + str(time.time()),
            config_args=self._job_args,
            sandesh_instance=dm_logger._sandesh)
        self._logger = self._job_log_utils.config_logger
        self._sandesh = self._logger._sandesh

        self._db_conn = self._initialize_db_connection(args, dm_logger)

        self._amqp_client.add_exchange(self.JOB_STATUS_EXCHANGE, type='direct')
        # add dummy consumer to initialize the exchange
        self._amqp_client.add_consumer(
            self.JOB_STATUS_CONSUMER + "dummy",
            self.JOB_STATUS_EXCHANGE,
            routing_key=self.JOB_STATUS_ROUTING_KEY + "dummy",
            auto_delete=True)

        self._amqp_client.add_exchange(self.JOB_REQUEST_EXCHANGE,
                                       type='direct')
        self._amqp_client.add_consumer(
            self.JOB_REQUEST_CONSUMER,
            self.JOB_REQUEST_EXCHANGE,
            routing_key=self.JOB_REQUEST_ROUTING_KEY,
            callback=self.handle_execute_job_request)

        abort_q_name = '.'.join(
            [self.JOB_ABORT_CONSUMER,
             socket.getfqdn(self._args.host_ip)])
        self._amqp_client.add_consumer(abort_q_name,
                                       self.JOB_REQUEST_EXCHANGE,
                                       routing_key=self.JOB_ABORT_ROUTING_KEY,
                                       callback=self.handle_abort_job_request)

    # end __init__

    @classmethod
    def get_instance(cls):
        return cls._instance

    # end get_instance

    @classmethod
    def destroy_instance(cls):
        inst = cls.get_instance()
        if not inst:
            return
        cls._instance = None

    # end destroy_instance

    def _initialize_db_connection(self, args, dm_logger):
        credential = None
        if args.cassandra_user and args.cassandra_password:
            credential = {
                'username': args.cassandra_user,
                'password': args.cassandra_password
            }

        timeout = int(args.job_manager_db_conn_retry_timeout)
        max_retries = int(args.job_manager_db_conn_max_retries)

        retry_count = 1
        while True:
            try:
                return VncObjectDBClient(args.cassandra_server_list,
                                         args.cluster_id,
                                         None,
                                         None,
                                         dm_logger.log,
                                         credential=credential,
                                         ssl_enabled=args.cassandra_use_ssl,
                                         ca_certs=args.cassandra_ca_certs)
            except Exception as e:
                if retry_count >= max_retries:
                    raise e
                self._logger.warning("Error while initializing db connection, "
                                     "retrying: %s" % str(e))
                gevent.sleep(timeout)
            finally:
                retry_count = retry_count + 1

    # end _initialize_db_connection

    def db_read(self, obj_type, obj_id, obj_fields=None, ret_readonly=False):
        try:
            (ok, cassandra_result) = self._db_conn.object_read(
                obj_type, [obj_id], obj_fields, ret_readonly=ret_readonly)
        except NoIdError as e:
            # if NoIdError is for obj itself (as opposed to say for parent
            # or ref), let caller decide if this can be handled gracefully
            # by re-raising
            if e._unknown_id == obj_id:
                raise

            return (False, str(e))

        return (ok, cassandra_result[0])

    # end db_read

    def is_max_job_threshold_reached(self):
        if self._job_mgr_statistics.get('running_job_count') < \
                self._job_mgr_statistics.get('max_job_count'):
            return False
        return True

    # end is_max_job_threshold_reached

    def publish_job_status_notification(self, job_execution_id, status):
        try:
            msg = {'job_execution_id': job_execution_id, 'job_status': status}
            self._amqp_client.publish(msg,
                                      self.JOB_STATUS_EXCHANGE,
                                      routing_key=self.JOB_STATUS_ROUTING_KEY +
                                      job_execution_id,
                                      serializer='json',
                                      retry=True,
                                      retry_policy={
                                          'max_retries': 5,
                                          'interval_start': 2,
                                          'interval_step': 3,
                                          'interval_max': 15
                                      },
                                      expiration=self.JOB_STATUS_TTL)
        except Exception:
            self._logger.error("Failed to send job status change notification"
                               " %s %s" % (job_execution_id, status))

    # end publish_job_status_notification

    def get_job_template_id(self, job_template_fq_name):
        try:
            return self._db_conn.fq_name_to_uuid("job_template",
                                                 job_template_fq_name)
        except Exception as e:
            msg = "Error while reading job_template_id: " + str(e)
            self._logger.error(msg)
            raise

    # end get_job_template_id

    def handle_execute_job_request(self, body, message):
        job_input_params = None
        # check if the max job processing threshold is reached
        if not self.is_max_job_threshold_reached():
            job_input_params = body
            message.ack()
        else:
            # requeue the message if the max threshold is reached, to be picked
            # by another job manager or wait until the current job mgr is free
            message.reject(requeue=True)

        update_uve_on_failure = False
        device_list = None
        extra_params = job_input_params.get('params')
        if extra_params is not None:
            device_list = extra_params.get('device_list')

        is_delete = job_input_params.get('input').get('is_delete')
        job_execution_id = job_input_params.get('job_execution_id')
        job_template_fq_name = job_input_params.get('job_template_fq_name')
        job_template_id = job_input_params.get('job_template_id')

        fabric_fq_name = None
        fabric_job_uve_name = ''
        job_input_params['vnc_api_init_params'] = {
            "admin_user": self._args.admin_user,
            "admin_password": self._args.admin_password,
            "admin_tenant_name": self._args.admin_tenant_name,
            "api_server_port": self._args.api_server_port,
            "api_server_use_ssl": self._args.api_server_use_ssl
        }

        try:

            # populate job template id if not present in input_param
            if job_template_id is None:
                job_template_id = self.get_job_template_id(
                    job_template_fq_name)
                job_input_params["job_template_id"] = job_template_id

            # read the device object and pass the necessary data to the job
            if device_list:
                self.read_device_data(device_list, job_input_params,
                                      job_execution_id, is_delete)
            else:
                self.read_fabric_data(job_input_params, job_execution_id,
                                      is_delete)

            # read the job concurrency level from job template
            job_concurrency = self.get_job_concurrency(job_template_id,
                                                       job_execution_id)
            job_input_params['job_concurrency'] = job_concurrency

            fabric_fq_name = job_input_params.get('fabric_fq_name')
            fabric_job_uve_name_list = job_template_fq_name
            fabric_job_uve_name_list.insert(0, fabric_fq_name)
            fabric_job_uve_name = ':'.join(map(str, fabric_job_uve_name_list))

            device_fqnames = []

            # create the UVE
            if fabric_fq_name != "__DEFAULT__" and not device_list:
                self.create_fabric_job_uve(
                    fabric_job_uve_name,
                    job_input_params.get('job_execution_id'),
                    JobStatus.STARTING.value, 0.0)
            if device_list:
                device_fqnames = self.create_physical_router_job_uve(
                    device_list, job_input_params, fabric_job_uve_name,
                    JobStatus.STARTING.value, 0.0)

            # after creating the UVE, flag indicates to update the
            # UVE upon any failures
            update_uve_on_failure = True

            # check if there is any other job running for the fabric
            if job_concurrency is not None and job_concurrency == "fabric":
                existing_job = self._is_existing_job_for_fabric(
                    fabric_fq_name, job_execution_id)
                if existing_job:
                    msg = "Another job for the same fabric is in" \
                          " progress. Please wait for the job to finish"
                    self.mark_failure(msg,
                                      job_template_fq_name,
                                      job_execution_id,
                                      fabric_fq_name,
                                      mark_uve=True,
                                      device_list=device_list,
                                      fabric_job_uve_name=fabric_job_uve_name,
                                      job_params=job_input_params)
                    return

            start_time = time.time()
            signal_var = {
                'fabric_name': fabric_job_uve_name,
                'fabric_fq_name': fabric_fq_name,
                'start_time': start_time,
                'exec_id': job_execution_id,
                'device_fqnames': device_fqnames,
                'job_concurrency': job_concurrency
            }

            self.job_status.update(
                {job_execution_id: JobStatus.STARTING.value})
            self.publish_job_status_notification(job_execution_id,
                                                 JobStatus.STARTING.value)

            # handle process exit signal
            signal.signal(signal.SIGCHLD, self.job_mgr_signal_handler)

            # write the abstract config to file if needed
            self.save_abstract_config(job_input_params)

            # add params needed for sandesh connection
            job_input_params['args'] = self._job_args

            # create job manager subprocess
            job_mgr_path = os.path.dirname(
                __file__) + "/../job_manager/job_mgr.py"
            job_process = subprocess.Popen(
                ["python", job_mgr_path, "-i",
                 json.dumps(job_input_params)],
                cwd="/",
                close_fds=True)

            self._job_mgr_running_instances[str(job_process.pid)] = signal_var

            self._job_mgr_statistics['running_job_count'] = len(
                self._job_mgr_running_instances)

            self._logger.notice("Created job manager process. Execution id: "
                                "%s" % job_execution_id)

            self._logger.info(
                "Current number of job_mgr processes running %s" %
                self._job_mgr_statistics.get('running_job_count'))
        except Exception as e:
            msg = "Exception while processing the job request %s %s %s : " \
                  "%s %s" % (job_template_fq_name, job_execution_id,
                             fabric_fq_name, repr(e), traceback.format_exc())
            self.mark_failure(msg,
                              job_template_fq_name,
                              job_execution_id,
                              fabric_fq_name,
                              mark_uve=update_uve_on_failure,
                              device_list=device_list,
                              fabric_job_uve_name=fabric_job_uve_name,
                              job_params=job_input_params)

    # end hantdle_execute_job_request

    def handle_abort_job_request(self, body, message):
        inp = body.get('input')
        job_execution_id = inp.get('job_execution_id')
        abort_mode = inp.get('abort_mode')
        self._logger.info("Abort job request: job_id={}, mode={}".format(
            job_execution_id, abort_mode))

        # Search through running job instances to find this job
        for pid, job_instance in self._job_mgr_running_instances.iteritems():
            if job_instance.get('exec_id') == job_execution_id:
                self._logger.info(
                    "ABORT: pid={}, job_instance={}, mode={}".format(
                        pid, job_instance, abort_mode))
                # Force abort or graceful abort
                os.kill(
                    int(pid), signal.SIGABRT
                    if abort_mode == "force" else signal.SIGUSR1)
                break

    # end handle_abort_job_request

    def create_fabric_job_uve(self, fabric_job_uve_name, execution_id,
                              job_status, percentage_completed):
        job_execution_data = FabricJobExecution(
            name=fabric_job_uve_name,
            execution_id=execution_id,
            job_start_ts=int(round(time.time() * 1000)),
            job_status=job_status,
            percentage_completed=percentage_completed)
        job_execution_uve = FabricJobUve(data=job_execution_data,
                                         sandesh=self._sandesh)
        job_execution_uve.send(sandesh=self._sandesh)

    # end create_fabric_job_uve

    def create_physical_router_job_uve(self, device_list, job_input_params,
                                       fabric_job_uve_name, job_status,
                                       percentage_completed):
        device_fqnames = []
        for device_id in device_list:
            device_fqname = job_input_params.get('device_json').get(
                device_id).get('device_fqname')
            device_fqname = ':'.join(map(str, device_fqname))
            prouter_uve_name = device_fqname + ":" + \
                fabric_job_uve_name

            prouter_job_data = PhysicalRouterJobExecution(
                name=prouter_uve_name,
                execution_id=job_input_params.get('job_execution_id'),
                job_start_ts=int(round(time.time() * 1000)),
                job_status=job_status,
                percentage_completed=percentage_completed)

            prouter_job_uve = PhysicalRouterJobUve(data=prouter_job_data,
                                                   sandesh=self._sandesh)
            prouter_job_uve.send(sandesh=self._sandesh)
            device_fqnames.append(prouter_uve_name)

        return device_fqnames

    # end create_physical_router_job_uve

    def mark_failure(self,
                     msg,
                     job_template_fq_name,
                     job_execution_id,
                     fabric_fq_name,
                     mark_uve=True,
                     device_list=None,
                     fabric_job_uve_name=None,
                     job_params=None):
        self._logger.error("Marked job as failed %s %s %s " %
                           (job_template_fq_name, job_execution_id, msg))
        # send job object log for failure
        self._job_log_utils.send_job_log(job_template_fq_name,
                                         job_execution_id, fabric_fq_name, msg,
                                         JobStatus.FAILURE.value)
        # update the in memory job status for the job
        self.job_status[job_execution_id] = JobStatus.FAILURE.value
        self.publish_job_status_notification(job_execution_id,
                                             JobStatus.FAILURE.value)

        # update the UVE
        if mark_uve:
            if fabric_fq_name != "__DEFAULT__" and not device_list:
                self.create_fabric_job_uve(fabric_job_uve_name,
                                           job_execution_id,
                                           JobStatus.FAILURE.value, 100.0)
            if device_list:
                self.create_physical_router_job_uve(device_list, job_params,
                                                    fabric_job_uve_name,
                                                    JobStatus.FAILURE.value,
                                                    100.0)

    # end mark_failure

    def _load_job_log(self, marker, input_str):
        json_str = input_str.split(marker)[1]
        try:
            return json.loads(json_str)
        except ValueError:
            return ast.literal_eval(json_str)

    # end _load_job_log

    def _extracted_file_output(self, execution_id):
        status = "FAILURE"
        prouter_info = {}
        device_op_results = {}
        failed_devices_list = []
        try:
            with open("/tmp/" + execution_id, "r") as f_read:
                for line in f_read:
                    if 'PROUTER_LOG##' in line:
                        job_log = self._load_job_log('PROUTER_LOG##', line)
                        fqname = ":".join(job_log.get('prouter_fqname'))
                        prouter_info[fqname] = job_log.get('onboarding_state')
                    if line.startswith('job_summary'):
                        job_log = self._load_job_log('JOB_LOG##', line)
                        status = job_log.get('job_status')
                        failed_devices_list = job_log.get(
                            'failed_devices_list')
                    if 'GENERIC_DEVICE##' in line:
                        job_log = self._load_job_log('GENERIC_DEVICE##', line)
                        device_name = job_log.get('device_name')
                        device_op_results[device_name] = job_log.get(
                            'command_output')
        except Exception as e:
            msg = "File corresponding to execution id %s not found: %s\n%s" % (
                execution_id, str(e), traceback.format_exc())
            self._logger.error(msg)

        return status, prouter_info, device_op_results, failed_devices_list

    # end _extracted_file_output

    def job_mgr_signal_handler(self, signalnum, frame):
        pid = None
        signal_var = None
        try:
            # get the child process id that called the signal handler
            pid = os.waitpid(-1, os.WNOHANG)
            signal_var = self._job_mgr_running_instances.get(str(pid[0]))
            if not signal_var:
                self._logger.error(
                    "Job mgr process %s not found in the instance "
                    "map" % str(pid))
                return

            msg = "Entered job_mgr_signal_handler for: %s" % signal_var
            self._logger.notice(msg)
            exec_id = signal_var.get('exec_id')

            status, prouter_info, device_op_results, failed_devices_list = \
                self._extracted_file_output(exec_id)
            self.job_status[exec_id] = status
            self.publish_job_status_notification(exec_id, status)

            if signal_var.get('fabric_name') != "__DEFAULT__"\
                    and not signal_var.get('device_fqnames'):
                job_execution_data = FabricJobExecution(
                    name=signal_var.get('fabric_name'),
                    job_status=status,
                    percentage_completed=100)
                job_execution_uve = FabricJobUve(data=job_execution_data,
                                                 sandesh=self._sandesh)
                job_execution_uve.send(sandesh=self._sandesh)
            else:
                for prouter_uve_name in signal_var.get('device_fqnames'):
                    prouter_status = status
                    device_name = prouter_uve_name.split(":")[1]
                    if device_name in failed_devices_list:
                        prouter_status = "FAILURE"
                    prouter_job_data = PhysicalRouterJobExecution(
                        name=prouter_uve_name,
                        job_status=prouter_status,
                        percentage_completed=100,
                        device_op_results=json.dumps(
                            device_op_results.get(device_name, {})))
                    prouter_job_uve = PhysicalRouterJobUve(
                        data=prouter_job_data, sandesh=self._sandesh)
                    prouter_job_uve.send(sandesh=self._sandesh)

            for k, v in prouter_info.iteritems():
                prouter_uve_name = "%s:%s" % (k, signal_var.get('fabric_name'))
                prouter_job_data = PhysicalRouterJobExecution(
                    name=prouter_uve_name,
                    execution_id=exec_id,
                    job_start_ts=int(round(
                        signal_var.get('start_time') * 1000)),
                    prouter_state=v)
                prouter_job_uve = PhysicalRouterJobUve(data=prouter_job_data,
                                                       sandesh=self._sandesh)
                prouter_job_uve.send(sandesh=self._sandesh)

            self._clean_up_job_data(signal_var, str(pid[0]))

            self._logger.info(
                "Job : %s finished. Current number of job_mgr "
                "processes running now %s " %
                (signal_var,
                 self._job_mgr_statistics['number_processess_running']))

        except OSError as process_error:
            self._logger.error("Could not retrieve the child process id. "
                               "OS call returned with error %s" %
                               str(process_error))
        except Exception as unknown_exception:
            self._clean_up_job_data(signal_var, str(pid[0]))
            self._logger.error("Failed in job signal handler %s" %
                               str(unknown_exception))

    # end job_mgr_signal_handler

    def _clean_up_job_data(self, signal_var, pid):
        # remove the pid entry of the processed job_mgr process
        del self._job_mgr_running_instances[pid]

        # clean up fabric level lock
        if signal_var.get('job_concurrency') \
                is not None and signal_var.get('job_concurrency') == "fabric":
            self._release_fabric_job_lock(signal_var.get('fabric_fq_name'))
        self._cleanup_job_lock(signal_var.get('fabric_fq_name'))

        self._job_mgr_statistics['number_processess_running'] = len(
            self._job_mgr_running_instances)

    # end _clean_up_job_data

    def _is_existing_job_for_fabric(self, fabric_fq_name, job_execution_id):
        is_fabric_job_running = False
        # build the zk lock path
        fabric_node_path = '/job-manager/' + fabric_fq_name + '/' + \
                           self.FABRIC_ZK_LOCK
        # check if the lock is already taken if not taken, acquire the lock
        # by creating a node
        try:
            self._zookeeper_client.create_node(fabric_node_path,
                                               value=job_execution_id,
                                               ephemeral=True)
            self._logger.info("Acquired fabric lock"
                              " for %s " % fabric_node_path)
        except ResourceExistsError:
            # means the lock was acquired by some other job
            value = self._zookeeper_client.read_node(fabric_node_path)
            self._logger.error("Fabric lock is already acquired by"
                               " job %s " % value)
            is_fabric_job_running = True
        return is_fabric_job_running

    # end _is_existing_job_for_fabric

    def _release_fabric_job_lock(self, fabric_fq_name):
        # build the zk lock path
        fabric_node_path = '/job-manager/' + fabric_fq_name + '/' + \
                           self.FABRIC_ZK_LOCK
        try:
            self._zookeeper_client.delete_node(fabric_node_path)
            self._logger.info("Released fabric lock"
                              " for %s " % fabric_node_path)
        except Exception as zk_error:
            self._logger.error("Exception while releasing the zookeeper lock"
                               " %s " % repr(zk_error))

    # end _release_fabric_job_lock

    def _cleanup_job_lock(self, fabric_fq_name):
        fabric_node_path = '/job-manager/' + fabric_fq_name
        try:
            if not self._zookeeper_client.get_children(fabric_node_path):
                self._zookeeper_client.delete_node(fabric_node_path)
                self._logger.info("Released fabric node"
                                  " for %s " % fabric_node_path)
        except Exception as zk_error:
            self._logger.error("Exception while releasing the fabric node for "
                               "%s: %s " % (fabric_node_path, str(zk_error)))

    # end _cleanup_job_lock

    def save_abstract_config(self, job_params):
        # Saving device abstract config to a local file as it could be large
        # config. There will be one local file per device and this file gets
        # removed when device is removed from database.
        dev_abs_cfg = job_params.get('input', {}).get('device_abstract_config')
        if dev_abs_cfg:
            dev_mgt_ip = dev_abs_cfg.get('system', {}).get('management_ip')
            if not dev_mgt_ip:
                raise ValueError('Missing management IP in abstract config')

            dev_cfg_dir = '/opt/contrail/fabric_ansible_playbooks/config/' +\
                          dev_mgt_ip
            if not os.path.exists(dev_cfg_dir):
                os.makedirs(dev_cfg_dir)
            if dev_cfg_dir:
                with open(dev_cfg_dir + '/abstract_cfg.json', 'w') as f:
                    f.write(json.dumps(dev_abs_cfg, indent=4))
                job_params.get('input').pop('device_abstract_config')

    # end save_abstract_config

    def get_job_concurrency(self, job_template_id, job_exec_id):

        (ok, result) = self.db_read("job-template", job_template_id,
                                    ['job_template_concurrency_level'])
        if not ok:
            msg = "Error while reading the job concurrency " \
                  "from the job template with id %s : %s" %\
                  (job_template_id, result)
            raise JobException(msg, job_exec_id)
        return result.get('job_template_concurrency_level')

    # end get_job_concurrency

    def read_device_data(self,
                         device_list,
                         request_params,
                         job_exec_id,
                         is_delete=False):
        device_data = dict()

        for device_id in device_list:
            if not is_delete:
                try:
                    (ok, result) = self.db_read("physical-router", device_id, [
                        'physical_router_user_credentials',
                        'physical_router_management_ip', 'fq_name',
                        'physical_router_device_family',
                        'physical_router_vendor_name',
                        'physical_router_product_name', 'fabric_refs'
                    ])
                    if not ok:
                        msg = "Error while reading the physical router " \
                              "with id %s : %s" % (device_id, result)
                        raise JobException(msg, job_exec_id)
                except NoIdError as ex:
                    msg = "Device not found" \
                          "%s: %s" % (device_id, str(ex))
                    raise JobException(msg, job_exec_id)
                except Exception as e:
                    msg = "Exception while reading device %s %s " % \
                          (device_id, str(e))
                    raise JobException(msg, job_exec_id)

                device_fq_name = result.get('fq_name')
                device_mgmt_ip = result.get('physical_router_management_ip')
                user_cred = result.get('physical_router_user_credentials')

                device_family = result.get("physical_router_device_family")
                device_vendor_name = result.get("physical_router_vendor_name")
                device_product_name = result.get(
                    "physical_router_product_name")

                fabric_refs = result.get('fabric_refs')
                if fabric_refs:
                    fabric_fq_name = result.get('fabric_refs')[0].get('to')
                    fabric_fq_name_str = ':'.join(fabric_fq_name)
                    request_params['fabric_fq_name'] = fabric_fq_name_str
            else:
                device_mgmt_ip = request_params.get(
                    'input', {}).get('device_management_ip')
                device_abs_cfg = request_params.get(
                    'input', {}).get('device_abstract_config')

                system = device_abs_cfg.get('system', {})
                device_name = system.get('name')
                device_username = system.get('credentials',
                                             {}).get('user_name')
                device_password = system.get('credentials', {}).get('password')
                user_cred = {
                    "username": device_username,
                    "password": device_password
                }
                device_family = system.get('device_family')
                device_vendor_name = system.get('vendor_name')
                device_product_name = system.get('product_name')
                device_fq_name = ["default-global-system-config", device_name]
                self.read_fabric_data(request_params, job_exec_id, is_delete)

            device_json = {"device_management_ip": device_mgmt_ip}
            device_json.update({"device_fqname": device_fq_name})

            if user_cred:
                device_json.update(
                    {"device_username": user_cred.get('username')})
                device_json.update(
                    {"device_password": user_cred.get('password')})
            if device_family:
                device_json.update({"device_family": device_family})

            if device_vendor_name:
                device_json.update({"device_vendor": device_vendor_name})

            if device_product_name:
                device_json.update({"device_product": device_product_name})

            device_data.update({device_id: device_json})

        if len(device_data) > 0:
            request_params.update({"device_json": device_data})

    # end read_device_data

    def read_fabric_data(self,
                         request_params,
                         job_execution_id,
                         is_delete=False):
        if request_params.get('input') is None:
            err_msg = "Missing job input"
            raise JobException(err_msg, job_execution_id)
        fabric_fq_name = None
        if request_params.get('input').get('fabric_fq_name'):
            fabric_fq_name = request_params.get('input').get('fabric_fq_name')
        elif request_params.get('input').get('fabric_uuid'):
            # get the fabric fq_name from the db if fabric_uuid is provided
            fabric_uuid = request_params.get('input').get('fabric_uuid')
            try:
                fabric_fq_name = self._db_conn.uuid_to_fq_name(fabric_uuid)
            except NoIdError as e:
                raise JobException(str(e), job_execution_id)
        else:
            if "device_deletion_template" in request_params.get(
                    'job_template_fq_name'):
                fabric_fq_name = ["__DEFAULT__"]
            elif not is_delete:
                err_msg = "Missing fabric details in the job input"
                raise JobException(err_msg, job_execution_id)
        if fabric_fq_name:
            fabric_fq_name_str = ':'.join(map(str, fabric_fq_name))
            request_params['fabric_fq_name'] = fabric_fq_name_str
コード例 #5
0
class DeviceJobManager(object):

    JOB_REQUEST_EXCHANGE = "job_request_exchange"
    JOB_REQUEST_CONSUMER = "job_request_consumer"
    JOB_REQUEST_ROUTING_KEY = "job.request"
    JOB_STATUS_EXCHANGE = "job_status_exchange"
    JOB_STATUS_ROUTING_KEY = "job.status."
    JOB_STATUS_TTL = 5*60
    FABRIC_ZK_LOCK = "fabric-job-monitor"

    _instance = None

    def __init__(self, db_conn, amqp_client, zookeeper_client, args,
                 dm_logger):
        DeviceJobManager._instance = self
        self._db_conn = db_conn
        self._amqp_client = amqp_client
        self._zookeeper_client = zookeeper_client
        self._args = args
        self._job_mgr_statistics = {
            'max_job_count': self._args.max_job_count,
            'running_job_count': 0
        }
        # dict of exec_id:job_status (key/value pairs)
        self.job_status = {}
        # map of running job instances. Key is the pid and value is job
        # instance info
        self._job_mgr_running_instances = {}

        job_args = {
            'collectors': self._args.collectors,
            'fabric_ansible_conf_file': self._args.fabric_ansible_conf_file,
            'host_ip': self._args.host_ip,
            'zk_server_ip': self._args.zk_server_ip,
            'cluster_id': self._args.cluster_id,
            'cassandra_user': self._args.cassandra_user,
            'cassandra_password': self._args.cassandra_password,
            'cassandra_server_list': self._args.cassandra_server_list,
            'cassandra_use_ssl': self._args.cassandra_use_ssl,
            'cassandra_ca_certs': self._args.cassandra_ca_certs
        }
        self._job_args = json.dumps(job_args)

        # initialize the job logger
        self._job_log_utils = JobLogUtils(
            sandesh_instance_id="DeviceJobManager" + str(time.time()),
            config_args=self._job_args,
            sandesh_instance=dm_logger._sandesh)
        self._logger = self._job_log_utils.config_logger
        self._sandesh = self._logger._sandesh

        self._amqp_client.add_exchange(self.JOB_REQUEST_EXCHANGE,
                                       type='direct')
        self._amqp_client.add_consumer(
            self.JOB_REQUEST_CONSUMER,
            self.JOB_REQUEST_EXCHANGE,
            routing_key=self.JOB_REQUEST_ROUTING_KEY,
            callback=self.handle_execute_job_request)
        self._amqp_client.add_exchange(self.JOB_STATUS_EXCHANGE, type='direct')
    # end __init__

    @classmethod
    def get_instance(cls):
        return cls._instance
    # end get_instance

    @classmethod
    def destroy_instance(cls):
        inst = cls.get_instance()
        if not inst:
            return
        cls._instance = None
    # end destroy_instance

    def db_read(self, obj_type, obj_id, obj_fields=None,
                ret_readonly=False):
        try:
            (ok, cassandra_result) = self._db_conn.object_read(
                obj_type, [obj_id], obj_fields, ret_readonly=ret_readonly)
        except NoIdError as e:
            # if NoIdError is for obj itself (as opposed to say for parent
            # or ref), let caller decide if this can be handled gracefully
            # by re-raising
            if e._unknown_id == obj_id:
                raise

            return (False, str(e))

        return (ok, cassandra_result[0])
    # end db_read

    def is_max_job_threshold_reached(self):
        if self._job_mgr_statistics.get('running_job_count') < \
                    self._job_mgr_statistics.get('max_job_count'):
            return False
        return True
    # end is_max_job_threshold_reached

    def publish_job_status_notification(self, status, job_execution_id):
        try:
            msg = {"job_execution_id": job_execution_id,
                   "job_status": status
                   }
            self._amqp_client.publish(
                msg, self.JOB_STATUS_EXCHANGE,
                routing_key=self.JOB_STATUS_ROUTING_KEY + job_execution_id,
                serializer='json', retry=True,
                retry_policy={'max_retries': 5,
                              'interval_start': 2,
                              'interval_step': 3,
                              'interval_max': 15},
                expiration=self.JOB_STATUS_TTL)
        except Exception as e:
            self._logger.error("Failed to send job status change notification"
                               " %s %s" % (job_execution_id, status))
    # end publish_job_status_notification

    def get_job_template_id(self, job_template_fq_name):
        try:
            return self._db_conn.fq_name_to_uuid("job_template",
                                                 job_template_fq_name)
        except Exception as e:
            msg = "Error while reading job_template_id: " + str(e)
            self._logger.error(msg)
            raise
    # end get_job_template_id

    def handle_execute_job_request(self, body, message):
        job_input_params = None
        # check if the max job processing threshold is reached
        if not self.is_max_job_threshold_reached():
            job_input_params = body
            message.ack()
        else:
            # requeue the message if the max threshold is reached, to be picked
            # by another job manager or wait until the current job mgr is free
            message.reject(requeue=True)

        update_uve_on_failure = False
        device_list = None
        extra_params = job_input_params.get('params')
        if extra_params is not None:
            device_list = extra_params.get('device_list')

        is_delete = job_input_params.get('input').get('is_delete')
        job_execution_id = job_input_params.get('job_execution_id')
        job_template_fq_name = job_input_params.get('job_template_fq_name')
        job_template_id = job_input_params.get('job_template_id')
        fabric_fq_name = None
        fabric_job_uve_name = ''

        try:

            # populate job template id if not present in input_param
            if job_template_id is None:
                job_template_id = self.get_job_template_id(
                    job_template_fq_name)
                job_input_params["job_template_id"] = job_template_id

            if is_delete is None or is_delete is False:
                # read the device object and pass the necessary data to the job
                if device_list:
                    self.read_device_data(device_list, job_input_params,
                                          job_execution_id)
                else:
                    self.read_fabric_data(job_input_params, job_execution_id)

            # read the job concurrency level from job template
            job_concurrency = self.get_job_concurrency(job_template_id,
                                                       job_execution_id)
            job_input_params['job_concurrency'] = job_concurrency

            fabric_fq_name = job_input_params.get('fabric_fq_name')
            fabric_job_uve_name_list = job_template_fq_name
            fabric_job_uve_name_list.insert(0, fabric_fq_name)
            fabric_job_uve_name = ':'.join(map(str, fabric_job_uve_name_list))

            device_fqnames = []
            # skip UVE creations and device reads from the database since
            # the device object will be deleted
            if is_delete is None or not is_delete:

                # create the UVE
                if fabric_fq_name is not "__DEFAULT__" and not device_list:
                    self.create_fabric_job_uve(fabric_job_uve_name,
                                               JobStatus.STARTING.value, 0.0)
                if device_list:
                    device_fqnames = self.create_physical_router_job_uve(
                        device_list, job_input_params, fabric_job_uve_name,
                        JobStatus.STARTING.value, 0.0)

                # after creating the UVE, flag indicates to update the
                # UVE upon any failures
                update_uve_on_failure = True

                # check if there is any other job running for the fabric
                if job_concurrency is not None and job_concurrency == "fabric":
                    existing_job = self._is_existing_job_for_fabric(
                        fabric_fq_name, job_execution_id)
                    if existing_job:
                        msg = "Another job for the same fabric is in" \
                              " progress. Please wait for the job to finish"
                        self.mark_failure(
                            msg, job_template_fq_name,
                            job_execution_id,
                            fabric_fq_name, mark_uve=True,
                            device_list=device_list,
                            fabric_job_uve_name=fabric_job_uve_name,
                            job_params=job_input_params)
                        return

            start_time = time.time()
            signal_var = {
                'fabric_name': fabric_job_uve_name,
                'fabric_fq_name': fabric_fq_name,
                'start_time': start_time,
                'exec_id': job_execution_id,
                'device_fqnames': device_fqnames,
                'job_concurrency': job_concurrency
            }

            self.job_status.update(
                {job_execution_id: JobStatus.STARTING.value})
            self.publish_job_status_notification(job_execution_id,
                                                 JobStatus.STARTING.value)

            # handle process exit signal
            signal.signal(signal.SIGCHLD,  self.job_mgr_signal_handler)

            # write the abstract config to file if needed
            self.save_abstract_config(job_input_params)

            # add params needed for sandesh connection
            job_input_params['args'] = self._job_args

            # create job manager subprocess
            job_mgr_path = os.path.dirname(
                __file__) + "/../job_manager/job_mgr.py"
            job_process = subprocess.Popen(["python", job_mgr_path, "-i",
                                           json.dumps(job_input_params)],
                                           cwd="/", close_fds=True)

            self._job_mgr_running_instances[str(job_process.pid)] = signal_var

            self._job_mgr_statistics['running_job_count'] = len(
                self._job_mgr_running_instances)

            self._logger.notice("Created job manager process. Execution id: "
                                "%s" % job_execution_id)

            self._logger.info(
                "Current number of job_mgr processes running %s"
                % self._job_mgr_statistics.get('running_job_count'))
        except Exception as e:
            msg = "Exception while processing the job request %s %s %s : " \
                  "%s %s" % (job_template_fq_name, job_execution_id,
                             fabric_fq_name, repr(e), traceback.format_exc())
            self.mark_failure(msg, job_template_fq_name, job_execution_id,
                              fabric_fq_name, mark_uve=update_uve_on_failure,
                              device_list=device_list,
                              fabric_job_uve_name=fabric_job_uve_name,
                              job_params=job_input_params)
    # end handle_execute_job_request

    def create_fabric_job_uve(self, fabric_job_uve_name, job_status,
                              percentage_completed):
        job_execution_data = FabricJobExecution(
            name=fabric_job_uve_name,
            execution_id=fabric_job_uve_name,
            job_start_ts=int(round(time.time() * 1000)),
            job_status=job_status,
            percentage_completed=percentage_completed
        )
        job_execution_uve = FabricJobUve(data=job_execution_data,
                                         sandesh=self._sandesh)
        job_execution_uve.send(sandesh=self._sandesh)
    # end create_fabric_job_uve

    def create_physical_router_job_uve(self, device_list, job_input_params,
                                       fabric_job_uve_name, job_status,
                                       percentage_completed):
        device_fqnames = []
        for device_id in device_list:
            device_fqname = job_input_params.get(
                'device_json').get(device_id).get('device_fqname')
            device_fqname = ':'.join(map(str, device_fqname))
            prouter_uve_name = device_fqname + ":" + \
                fabric_job_uve_name

            prouter_job_data = PhysicalRouterJobExecution(
                name=prouter_uve_name,
                execution_id=job_input_params.get('job_execution_id'),
                job_start_ts=int(round(time.time() * 1000)),
                job_status=job_status,
                percentage_completed=percentage_completed
            )

            prouter_job_uve = PhysicalRouterJobUve(
                data=prouter_job_data, sandesh=self._sandesh)
            prouter_job_uve.send(sandesh=self._sandesh)
            device_fqnames.append(prouter_uve_name)

        return device_fqnames
    # end create_physical_router_job_uve

    def mark_failure(self, msg, job_template_fq_name, job_execution_id,
                     fabric_fq_name, mark_uve=True, device_list=None,
                     fabric_job_uve_name=None, job_params=None):
        self._logger.error("Marked job as failed %s %s %s " %
                           (job_template_fq_name, job_execution_id, msg))
        # send job object log for failure
        self._job_log_utils.send_job_log(job_template_fq_name,
                                         job_execution_id, fabric_fq_name, msg,
                                         JobStatus.FAILURE.value)
        # update the in memory job status for the job
        self.job_status[job_execution_id] = JobStatus.FAILURE.value
        self.publish_job_status_notification(job_execution_id,
                                             JobStatus.FAILURE.value)

        # update the UVE
        if mark_uve:
            if fabric_fq_name is not "__DEFAULT__" and not device_list:
                self.create_fabric_job_uve(fabric_job_uve_name,
                                           JobStatus.FAILURE.value, 100.0)
            if device_list:
                self.create_physical_router_job_uve(device_list,
                                                    job_params,
                                                    fabric_job_uve_name,
                                                    JobStatus.FAILURE.value,
                                                    100.0)
    # end mark_failure

    def _load_job_log(self, marker, input_str):
        json_str = input_str.split(marker)[1]
        try:
            return json.loads(json_str)
        except ValueError:
            return ast.literal_eval(json_str)
    # end _load_job_log

    def _extracted_file_output(self, execution_id):
        status = "FAILURE"
        prouter_info = {}
        device_op_results = {}
        failed_devices_list = []
        try:
            with open("/tmp/"+execution_id, "r") as f_read:
                for line in f_read:
                    if 'PROUTER_LOG##' in line:
                        job_log = self._load_job_log('PROUTER_LOG##', line)
                        fqname = ":".join(job_log.get('prouter_fqname'))
                        prouter_info[fqname] = job_log.get('onboarding_state')
                    if line.startswith('job_summary'):
                        job_log = self._load_job_log('JOB_LOG##', line)
                        status = job_log.get('job_status')
                        failed_devices_list = job_log.get(
                            'failed_devices_list')
                    if 'GENERIC_DEVICE##' in line:
                        job_log = self._load_job_log(
                            'GENERIC_DEVICE##', line)
                        device_name = job_log.get('device_name')
                        device_op_results[device_name] = job_log.get(
                            'command_output')
        except Exception as e:
            msg = "File corresponding to execution id %s not found: %s\n%s" % (
                execution_id, str(e), traceback.format_exc())
            self._logger.error(msg)

        return status, prouter_info, device_op_results, failed_devices_list
    # end _extracted_file_output

    def job_mgr_signal_handler(self, signalnum, frame):
        pid = None
        signal_var = None
        try:
            # get the child process id that called the signal handler
            pid = os.waitpid(-1, os.WNOHANG)
            signal_var = self._job_mgr_running_instances.get(str(pid[0]))
            if not signal_var:
                self._logger.error(
                    "Job mgr process %s not found in the instance "
                    "map" % str(pid))
                return

            msg = "Entered job_mgr_signal_handler for: %s" % signal_var
            self._logger.notice(msg)
            exec_id = signal_var.get('exec_id')

            status, prouter_info, device_op_results,\
            failed_devices_list = \
                self._extracted_file_output(exec_id)
            self.job_status[exec_id] = status
            self.publish_job_status_notification(exec_id, status)

            if signal_var.get('fabric_name') is not \
                    "__DEFAULT__" and not signal_var.get('device_fqnames'):
                job_execution_data = FabricJobExecution(
                    name=signal_var.get('fabric_name'),
                    job_status=status,
                    percentage_completed=100)
                job_execution_uve = FabricJobUve(data=job_execution_data,
                                                 sandesh=self._sandesh)
                job_execution_uve.send(sandesh=self._sandesh)
            else:
                for prouter_uve_name in signal_var.get('device_fqnames'):
                    prouter_status = status
                    device_name = prouter_uve_name.split(":")[1]
                    if device_name in failed_devices_list:
                        prouter_status = "FAILURE"
                    prouter_job_data = PhysicalRouterJobExecution(
                        name=prouter_uve_name,
                        job_status=prouter_status,
                        percentage_completed=100,
                        device_op_results=json.dumps(
                            device_op_results.get(device_name, {}))
                    )
                    prouter_job_uve = PhysicalRouterJobUve(
                        data=prouter_job_data, sandesh=self._sandesh)
                    prouter_job_uve.send(sandesh=self._sandesh)

            for k, v in prouter_info.iteritems():
                prouter_uve_name = "%s:%s" % (k, signal_var.get('fabric_name'))
                prouter_job_data = PhysicalRouterJobExecution(
                    name=prouter_uve_name,
                    execution_id=exec_id,
                    job_start_ts=int(round(signal_var.get('start_time'
                                                          ) * 1000)),
                    prouter_state=v
                )
                prouter_job_uve = PhysicalRouterJobUve(
                    data=prouter_job_data, sandesh=self._sandesh)
                prouter_job_uve.send(sandesh=self._sandesh)

            self._clean_up_job_data(signal_var, str(pid[0]))

            self._logger.info("Job : %s finished. Current number of job_mgr "
                              "processes running now %s " %
                              (signal_var, self._job_mgr_statistics[
                                  'number_processess_running']))

        except OSError as process_error:
            self._logger.error("Could not retrieve the child process id. "
                               "OS call returned with error %s" %
                               str(process_error))
        except Exception as unknown_exception:
            self._clean_up_job_data(signal_var, str(pid[0]))
            self._logger.error("Failed in job signal handler %s" %
                               str(unknown_exception))
    # end job_mgr_signal_handler

    def _clean_up_job_data(self, signal_var, pid):
        # remove the pid entry of the processed job_mgr process
        del self._job_mgr_running_instances[pid]

        # clean up fabric level lock
        if signal_var.get('job_concurrency') \
                is not None and signal_var.get('job_concurrency') == "fabric":
            self._release_fabric_job_lock(signal_var.get('fabric_fq_name'))

        self._job_mgr_statistics['number_processess_running'] = len(
            self._job_mgr_running_instances)
    # end _clean_up_job_data

    def _is_existing_job_for_fabric(self, fabric_fq_name, job_execution_id):
        is_fabric_job_running = False
        # build the zk lock path
        fabric_node_path = '/job-manager/' + fabric_fq_name + '/' + \
                           self.FABRIC_ZK_LOCK
        # check if the lock is already taken if not taken, acquire the lock
        # by creating a node
        try:
            self._zookeeper_client.create_node(fabric_node_path,
                                               value=job_execution_id,
                                               ephemeral=True)
            self._logger.info("Acquired fabric lock"
                              " for %s " % fabric_node_path)
        except ResourceExistsError:
            # means the lock was acquired by some other job
            value = self._zookeeper_client.read_node(fabric_node_path)
            self._logger.error("Fabric lock is already acquired by"
                               " job %s " % value)
            is_fabric_job_running = True
        return is_fabric_job_running
    # end _is_existing_job_for_fabric

    def _release_fabric_job_lock(self, fabric_fq_name):
        # build the zk lock path
        fabric_node_path = '/job-manager/' + fabric_fq_name + '/' + \
                           self.FABRIC_ZK_LOCK
        try:
            self._zookeeper_client.delete_node(fabric_node_path)
            self._logger.info("Released fabric lock"
                              " for %s " % fabric_node_path)
        except Exception as zk_error:
            self._logger.error("Exception while releasing the zookeeper lock"
                               " %s " % repr(zk_error))
    # end _release_fabric_job_lock

    def save_abstract_config(self, job_params):
        """
        Saving device abstract config to a local file as it could be large
        config. There will be one local file per device and this file gets
        removed when device is removed from database.
        :param job_params: dict
        :return: None
        """
        dev_abs_cfg = job_params.get('input', {}).get('device_abstract_config')
        if dev_abs_cfg:
            dev_mgt_ip = dev_abs_cfg.get('system', {}).get('management_ip')
            if not dev_mgt_ip:
                raise ValueError('Missing management IP in abstract config')

            dev_cfg_dir = '/opt/contrail/fabric_ansible_playbooks/config/' +\
                          dev_mgt_ip
            if not os.path.exists(dev_cfg_dir):
                os.makedirs(dev_cfg_dir)
            if dev_cfg_dir:
                with open(dev_cfg_dir + '/abstract_cfg.json', 'w') as f:
                    f.write(json.dumps(dev_abs_cfg, indent=4))
                job_params.get('input').pop('device_abstract_config')
    # end save_abstract_config

    def get_job_concurrency(self, job_template_id, job_exec_id):

        (ok, result) = self.db_read("job-template", job_template_id,
                                    ['job_template_concurrency_level'])
        if not ok:
            msg = "Error while reading the job concurrency " \
                  "from the job template with id %s : %s" %\
                  (job_template_id, result)
            raise JobException(msg, job_exec_id)
        return result.get('job_template_concurrency_level')
    # end get_job_concurrency

    def read_device_data(self, device_list, request_params, job_exec_id):
        device_data = dict()
        for device_id in device_list:
            try:
                (ok, result) = self.db_read(
                    "physical-router", device_id,
                    ['physical_router_user_credentials',
                     'physical_router_management_ip', 'fq_name',
                     'physical_router_device_family',
                     'physical_router_vendor_name',
                     'physical_router_product_name',
                     'fabric_refs'])
                if not ok:
                    msg = "Error while reading the physical router " \
                          "with id %s : %s" % (device_id, result)
                    raise JobException(msg, job_exec_id)
            except Exception as e:
                msg = "Exception while reading device %s %s " % \
                      (device_id, str(e))
                raise JobException(msg, job_exec_id)

            device_json = {"device_management_ip": result.get(
                'physical_router_management_ip')}
            device_json.update({"device_fqname": result.get('fq_name')})
            user_cred = result.get('physical_router_user_credentials')
            if user_cred:
                device_json.update(
                    {"device_username": user_cred.get('username')})
                device_json.update({"device_password":
                                    user_cred.get('password')})
            device_family = result.get("physical_router_device_family")
            if device_family:
                device_json.update({"device_family": device_family})
            device_vendor_name = result.get("physical_router_vendor_name")
            if device_vendor_name:
                device_json.update({"device_vendor": device_vendor_name})
            device_product_name = result.get("physical_router_product_name")
            if device_product_name:
                device_json.update({"device_product": device_product_name})

            device_data.update({device_id: device_json})

            fabric_refs = result.get('fabric_refs')
            if fabric_refs and len(fabric_refs) > 0:
                fabric_fq_name = result.get('fabric_refs')[0].get('to')
                fabric_fq_name_str = ':'.join(map(str, fabric_fq_name))
                request_params['fabric_fq_name'] = fabric_fq_name_str

        if len(device_data) > 0:
            request_params.update({"device_json": device_data})
    # end read_device_data

    def read_fabric_data(self, request_params, job_execution_id):
        if request_params.get('input') is None:
            err_msg = "Missing job input"
            raise JobException(err_msg, job_execution_id)
        # get the fabric fq_name from the database if fabric_uuid is provided
        fabric_fq_name = None
        if request_params.get('input').get('fabric_uuid'):
            fabric_uuid = request_params.get('input').get('fabric_uuid')
            try:
                fabric_fq_name = self._db_conn.uuid_to_fq_name(fabric_uuid)
            except NoIdError as e:
                raise JobException(str(e), job_execution_id)
        elif request_params.get('input').get('fabric_fq_name'):
            fabric_fq_name = request_params.get('input').get('fabric_fq_name')
        else:
            if "device_deletion_template" in request_params.get(
                   'job_template_fq_name'):
                fabric_fq_name = "__DEFAULT__"
            else:
                err_msg = "Missing fabric details in the job input"
                raise JobException(err_msg, job_execution_id)
        if fabric_fq_name:
            fabric_fq_name_str = ':'.join(map(str, fabric_fq_name))
            request_params['fabric_fq_name'] = fabric_fq_name_str