Example #1
def install_python_gdeploy():
    attributes = {}
    # Install python-gdeploy on the node
    if NS.config.data['package_source_type'] == 'pip':
        name = "https://github.com/Tendrl/python-gdeploy/archive/master.tar.gz"
        attributes["name"] = name
        attributes["editable"] = "false"
        ansible_module_path = "packaging/language/pip.py"
    elif NS.config.data['package_source_type'] == 'rpm':
        name = "python-gdeploy"
        ansible_module_path = "packaging/os/yum.py"
        attributes["name"] = name
    else:
        raise FlowExecutionFailedError(
            "Failed to install python-gdeploy. Invalid package source type")

    try:
        runner = ansible_module_runner.AnsibleRunner(ansible_module_path,
                                                     **attributes)
    except ansible_module_runner.AnsibleModuleNotFound:
        # Backward compat ansible<=2.2
        runner = ansible_module_runner.AnsibleRunner(
            "core/" + ansible_module_path, **attributes)
    try:
        result, err = runner.run()
        if result.get('failed', None):
            raise FlowExecutionFailedError(
                "Failed to install python-gdeploy. %s" % result['msg'])
    except ansible_module_runner.AnsibleExecutableGenerationFailed:
        raise FlowExecutionFailedError("Failed to install python-gdeploy")
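The try/except fallback here (and in Example #2 below) handles Ansible <= 2.2, where bundled modules lived under a "core/" prefix. As a sketch only, assuming nothing beyond the AnsibleRunner API used in these examples, the repeated pattern could be factored into one hypothetical helper:

def run_ansible_module(ansible_module_path, **attributes):
    # Sketch of a shared helper; the error messages are illustrative.
    try:
        runner = ansible_module_runner.AnsibleRunner(ansible_module_path,
                                                     **attributes)
    except ansible_module_runner.AnsibleModuleNotFound:
        # Backward compat ansible<=2.2
        runner = ansible_module_runner.AnsibleRunner(
            "core/" + ansible_module_path, **attributes)
    try:
        result, err = runner.run()
    except ansible_module_runner.AnsibleExecutableGenerationFailed:
        raise FlowExecutionFailedError(
            "Failed to run %s" % ansible_module_path)
    if result.get('failed', None):
        raise FlowExecutionFailedError(
            "%s failed: %s" % (ansible_module_path, result['msg']))
    return result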
Example #2
def install_gdeploy():
    # Install gdeploy on the node
    ansible_module_path = "packaging/os/yum.py"
    attributes = dict()
    attributes["name"] = "gdeploy"
    try:
        runner = ansible_module_runner.AnsibleRunner(
            ansible_module_path,
            **attributes
        )
    except ansible_module_runner.AnsibleModuleNotFound:
        # Backward compat ansible<=2.2
        runner = ansible_module_runner.AnsibleRunner(
            "core/" + ansible_module_path,
            **attributes
        )
    try:
        result, err = runner.run()
        if result.get('failed', None):
            raise FlowExecutionFailedError(
                "Failed to install gdeploy. %s" % result['msg']
            )
    except ansible_module_runner.AnsibleExecutableGenerationFailed:
        raise FlowExecutionFailedError(
            "Failed to install gdeploy"
        )
Example #3
def wait_for_task(task_id):
    count = 0
    plugin = NS.ceph_provisioner.get_plugin()
    resp = {}
    while count < 90:
        gevent.sleep(10)
        resp = plugin.task_status(task_id)
        if resp:
            if resp["ended"]:
                if resp["succeeded"]:
                    return
                else:
                    stderr = resp.get(
                        "stderr", "ceph-installer task_id %s "
                        "failed and did not complete" % task_id)
                    stdout = resp.get("stdout", "")
                    raise FlowExecutionFailedError(
                        dict(ceph_installer_task_id=task_id,
                             ceph_installer_task_stdout=stdout,
                             ceph_installer_task_stderr=stderr))
        count = count + 1
    stderr = resp.get(
        "stderr", "ceph-installer task_id %s timed out and did "
        "not complete" % task_id)
    stdout = resp.get("stdout", "")
    raise FlowExecutionFailedError(
        dict(ceph_installer_task_id=task_id,
             ceph_installer_task_stdout=stdout,
             ceph_installer_task_stderr=stderr))
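wait_for_task polls the provisioner every 10 seconds for up to 90 iterations (15 minutes). A minimal sketch of the same loop as a reusable helper, assuming only gevent.sleep and a zero-argument callable returning the response dict shape shown above ("ended"/"succeeded"/"stdout"/"stderr"):

def poll_until_ended(get_status, interval=10, max_tries=90):
    # Sketch: poll until the task reports ended, else time out.
    resp = {}
    for _ in range(max_tries):
        gevent.sleep(interval)
        resp = get_status()
        if resp and resp.get("ended"):
            if resp.get("succeeded"):
                return resp
            raise FlowExecutionFailedError(
                dict(task_stdout=resp.get("stdout", ""),
                     task_stderr=resp.get("stderr", "task failed")))
    raise FlowExecutionFailedError(
        dict(task_stdout=resp.get("stdout", ""),
             task_stderr=resp.get("stderr", "task timed out")))

Usage would then be poll_until_ended(lambda: plugin.task_status(task_id)).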
Example #4
def acquire_node_lock(parameters):
    # check node_id is present
    for node in parameters['Node[]']:
        if not NS.tendrl.objects.NodeContext(node_id=node).exists():
            raise FlowExecutionFailedError("Unknown Node %s, cannot lock" %
                                           node)
    # check job is parent or child
    job = NS.tendrl.objects.Job(job_id=parameters['job_id']).load()
    p_job_id = None
    if "parent" in job.payload:
        p_job_id = job.payload['parent']

    for node_id in parameters['Node[]']:
        nc = NS.tendrl.objects.NodeContext(node_id=node_id).load()
        try:
            lock_owner_job = nc.locked_by
            # If the parent job has acquired the lock on participating
            # nodes, don't you worry, child job :)
            if p_job_id is not None and lock_owner_job is not None:
                if p_job_id == lock_owner_job:
                    continue
                else:
                    # if the locker owner job is already finished or
                    # failed, we should allow other flows to
                    # acquire the lock.
                    job = NS.tendrl.objects.Job(job_id=lock_owner_job).load()
                    if job and job.status in ["finished", "failed"]:
                        continue
                    else:
                        raise FlowExecutionFailedError(
                            "Cannot proceed further, "
                            "Node (%s) is already locked "
                            "by Job (%s)" % (node_id, lock_owner_job))
        except EtcdKeyNotFound:
            # Node is not locked by any job; check the remaining nodes
            continue

    for node_id in parameters['Node[]']:
        nc = NS.tendrl.objects.NodeContext(node_id=node_id).load()
        lock_owner_job = nc.locked_by
        if p_job_id is not None and lock_owner_job is not None and \
            p_job_id == lock_owner_job:
            continue
        else:
            lock_owner_job = str(parameters["job_id"])
            nc.locked_by = lock_owner_job
            nc.save()
            logger.log("info",
                       NS.publisher_id, {
                           "message":
                           "Acquired lock (%s) on (%s)" %
                           (lock_owner_job, node_id)
                       },
                       job_id=parameters['job_id'],
                       flow_id=parameters['flow_id'])
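Example #20 below releases these locks through flow_utils.release_node_lock, which is not shown on this page. A hypothetical sketch of such a release, assuming the same NodeContext locked_by field used above (not the actual tendrl implementation):

def release_node_lock(parameters):
    # Sketch: clear locked_by only when this job owns the lock.
    for node_id in parameters['Node[]']:
        nc = NS.tendrl.objects.NodeContext(node_id=node_id).load()
        if nc.locked_by == str(parameters['job_id']):
            nc.locked_by = None
            nc.save()
            logger.log("info",
                       NS.publisher_id,
                       {"message": "Released lock (%s) on (%s)" %
                        (parameters['job_id'], node_id)},
                       job_id=parameters['job_id'],
                       flow_id=parameters['flow_id'])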
Example #5
    def run(self):
        super(SetupClusterAlias, self).run()
        integration_id = self.parameters["TendrlContext.integration_id"]
        short_name = self.parameters.get("Cluster.short_name")
        alias_dir_path = "%snames" % graphite_utils.get_data_dir_path()
        if not os.path.exists(alias_dir_path):
            try:
                os.makedirs(str(alias_dir_path))
            except OSError as ex:
                raise FlowExecutionFailedError(
                    "Failed to create cluster alias dir: (%s)."
                    " Error: (%s)" %
                    (str(alias_dir_path), ex)
                )
        if short_name in [None, ""]:
            short_name = integration_id
        os.symlink(
            "%s/clusters/%s" % (
                graphite_utils.get_data_dir_path(),
                integration_id
            ),
            "%s/%s" % (alias_dir_path, short_name)
        )
        # Assign permission for carbon user
        try:
            storage_dir_path = graphite_utils.get_graphite_path(
                "cache", "storage_dir"
            )
            graphite_utils.change_owner(
                storage_dir_path,
                "carbon",
                "carbon",
                recursive=True
            )
        except (KeyError, OSError, TypeError) as ex:
            raise FlowExecutionFailedError(
                "Unable to modify the ownership of %s" % storage_dir_path
            )
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Link %s -> %s created" %
                (
                    "%s/%s" % (alias_dir_path, short_name),
                    "%s/clusters/%s" % (
                        graphite_utils.get_data_dir_path(),
                        integration_id
                    )
                )
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )
        return True
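graphite_utils.change_owner is called above but not defined on this page. A stdlib-only sketch of what such a helper might look like (hypothetical; the real tendrl helper may differ):

import grp
import os
import pwd


def change_owner(path, user, group, recursive=False):
    # Sketch: resolve names to uid/gid, then chown the whole tree.
    uid = pwd.getpwnam(user).pw_uid
    gid = grp.getgrnam(group).gr_gid
    os.chown(path, uid, gid)
    if recursive:
        for root, dirs, files in os.walk(path):
            for name in dirs + files:
                os.chown(os.path.join(root, name), uid, gid)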
Example #6
def expand_gluster(parameters):
    node_ips = get_node_ips(parameters)
    plugin = NS.gluster_provisioner.get_plugin()

    Event(
        Message(job_id=parameters['job_id'],
                flow_id=parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message":
                    "Setting up gluster nodes %s" %
                    parameters['TendrlContext.integration_id']
                }))

    ret_val = plugin.setup_gluster_node(node_ips,
                                        repo=NS.config.data.get(
                                            'glusterfs_repo', None))
    if ret_val is not True:
        raise FlowExecutionFailedError("Error setting up gluster node")

    Event(
        Message(job_id=parameters['job_id'],
                flow_id=parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message":
                    "Expanding gluster cluster %s" %
                    parameters['TendrlContext.integration_id']
                }))
    failed_nodes = []
    for node in node_ips:
        ret_val = plugin.expand_gluster_cluster(node)
        if not ret_val:
            failed_nodes.append(node)

    if failed_nodes:
        raise FlowExecutionFailedError(
            "Error expanding gluster cluster. Following nodes failed: %s" %
            ",".join(failed_nodes))

    Event(
        Message(job_id=parameters['job_id'],
                flow_id=parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message":
                    "Expanded Gluster Cluster %s."
                    " New nodes are: %s" %
                    (parameters['TendrlContext.integration_id'],
                     ",".join(node_ips))
                }))
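get_node_ips is used here and in several later examples without being shown. One hypothetical way to derive it from the Node[] parameter, assuming each NodeContext carries the fqdn field seen in Examples #15 and #19 (the real helper may resolve addresses differently):

import socket


def get_node_ips(parameters):
    # Sketch: resolve each participating node's FQDN to an IP.
    node_ips = []
    for node_id in parameters['Node[]']:
        node_context = NS.tendrl.objects.NodeContext(
            node_id=node_id).load()
        node_ips.append(socket.gethostbyname(node_context.fqdn))
    return node_ips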
Example #7
def acquire_node_lock(parameters):
    # check node_id is present
    for node in parameters['Node[]']:
        try:
            NS._int.client.read("/nodes/%s" % node)
        except EtcdKeyNotFound:
            raise FlowExecutionFailedError(
                "Unknown Node %s, cannot lock" %
                node)
    # check job is parent or child
    job = Job(job_id=parameters['job_id']).load()
    p_job_id = None
    if "parent" in job.payload:
        p_job_id = job.payload['parent']

    for node in parameters['Node[]']:
        key = "/nodes/%s/locked_by" % node
        try:
            lock_owner_job = NS._int.client.read(key).value
            # If the parent job has acquired the lock on participating
            # nodes, don't you worry, child job :)
            if p_job_id == lock_owner_job:
                continue
            else:
                raise FlowExecutionFailedError("Cannot proceed further, "
                                               "Node (%s) is already locked "
                                               "by Job (%s)" % (node,
                                                                lock_owner_job)
                                               )
        except EtcdKeyNotFound:
            # Node is not locked by any job; check the remaining nodes
            continue

    for node in parameters['Node[]']:
        key = "/nodes/%s/locked_by" % node
        try:
            lock_owner_job = NS._int.client.read(key).value
            if p_job_id == lock_owner_job:
                continue
        except EtcdKeyNotFound:
            lock_owner_job = str(parameters["job_id"])
            NS._int.client.write(key, lock_owner_job)
            Event(
                Message(
                    job_id=parameters['job_id'],
                    flow_id=parameters['flow_id'],
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Acquired lock (%s) for Node (%s)" % (
                            lock_owner_job, node)
                    }
                )
            )
Example #8
def gluster_create_ssh_setup_jobs(parameters, skip_current_node=False):
    node_list = copy.deepcopy(parameters['Node[]'])

    ssh_job_ids = []
    ssh_key, err = NS.gluster_provisioner.get_plugin().setup()
    if err != "":
        _msg = "Error generating ssh key on node %s" % NS.node_context.node_id
        logger.log("error",
                   NS.publisher_id, {"message": _msg},
                   job_id=parameters['job_id'],
                   flow_id=parameters['flow_id'])
        raise FlowExecutionFailedError(_msg)

    if not skip_current_node:
        ret_val, err = authorize_key.AuthorizeKey(ssh_key).run()
        if ret_val is not True or err != "":
            _msg = "Error adding authorized key for node %s" % \
                   NS.node_context.node_id
            logger.log("error",
                       NS.publisher_id, {"message": _msg},
                       job_id=parameters['job_id'],
                       flow_id=parameters['flow_id'])
            raise FlowExecutionFailedError(_msg)
        node_list.remove(NS.node_context.node_id)

    for node in node_list:
        if node == NS.node_context.node_id:
            continue
        new_params = parameters.copy()
        new_params['Node[]'] = [node]
        new_params['ssh_key'] = ssh_key
        # Create same flow for each node from list except this one
        payload = {
            "tags": ["tendrl/node_%s" % node],
            "run": "tendrl.flows.AuthorizeSshKey",
            "status": "new",
            "parameters": new_params,
            "parent": parameters['job_id'],
            "type": "node"
        }
        _job_id = str(uuid.uuid4())
        NS.tendrl.objects.Job(job_id=_job_id, status="new",
                              payload=payload).save()
        ssh_job_ids.append(_job_id)
        logger.log("info",
                   NS.publisher_id, {
                       "message":
                       "Created SSH setup job(jobID: %s) for %s" %
                       (_job_id, node)
                   },
                   job_id=parameters['job_id'],
                   flow_id=parameters['flow_id'])
    return ssh_job_ids
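Callers of this function (Examples #18, #20 and #22) poll the returned ssh_job_ids in an open-ended while True loop. A sketch of that wait factored into a helper with a bounded retry budget, using only the Job API shown in these examples (the interval and retry values are assumptions):

def wait_for_ssh_jobs(job_ids, interval=3, max_tries=200):
    # Sketch: poll child job statuses until all finish or any fails.
    for _ in range(max_tries):
        time.sleep(interval)
        statuses = {}
        for job_id in job_ids:
            statuses[job_id] = NS.tendrl.objects.Job(
                job_id=job_id).load().status
        failed = dict((jid, s) for jid, s in statuses.items()
                      if s == "failed")
        if failed:
            raise FlowExecutionFailedError(
                "SSH setup failed for jobs %s" % str(failed))
        if all(s == "finished" for s in statuses.values()):
            return
    raise FlowExecutionFailedError("Timed out waiting for SSH setup jobs")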
Example #9
    def run(self):
        integration_id = self.parameters.get("TendrlContext.integration_id")

        # Delete the cluster related alert dashboards
        grafana_utils.delete_panel(integration_id)

        # Archive the carbon data for the cluster
        archive_base_path = "%s/clusters" % (
            NS.config.data.get(
                "graphite_archive_path",
                "/usr/share/tendrl/graphite/archive"
            )
        )
        if not os.path.exists(archive_base_path):
            try:
                os.makedirs(str(archive_base_path))
            except OSError as ex:
                raise FlowExecutionFailedError(
                    "Failed to create archive dir: (%s)"
                    "for monitoring data. Error: (%s)" %
                    (str(archive_base_path), ex)
                )
        archive_path = "%s/%s_%s" % (
            archive_base_path,
            integration_id,
            str(datetime.datetime.now().isoformat())
        )
        resource_path = "%s/clusters/%s" % \
            (
                graphite_utils.get_data_dir_path(),
                integration_id
            )
        try:
            shutil.move(resource_path, archive_path)
        except Exception as ex:
            raise FlowExecutionFailedError(
                "Failed to archive the monitoring data. Error: (%s)" %
                ex
            )

        # Log an event mentioning the archive data location
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Cluster %s moved to un-managed state.\n"
                "The archived monitoring data available at: %s" %
                (integration_id, archive_path)
            }
        )

        return True
Example #10
def create_gluster(parameters):
    node_ips = get_node_ips(parameters)
    plugin = NS.gluster_provisioner.get_plugin()

    Event(
        Message(
            job_id=parameters['job_id'],
            flow_id=parameters['flow_id'],
            priority="info",
            publisher=NS.publisher_id,
            payload={"message": "Setting up gluster nodes %s" %
                                parameters['TendrlContext.integration_id']
                     }
        )
    )

    ret_val = plugin.setup_gluster_node(
        node_ips,
        repo=NS.config.data.get('glusterfs_repo', None)
    )
    if ret_val is not True:
        raise FlowExecutionFailedError("Error setting up gluster node")

    Event(
        Message(
            job_id=parameters['job_id'],
            flow_id=parameters['flow_id'],
            priority="info",
            publisher=NS.publisher_id,
            payload={"message": "Creating gluster cluster %s" %
                                parameters['TendrlContext.integration_id']
                     }
        )
    )
    ret_val = plugin.create_gluster_cluster(node_ips)
    if ret_val is not True:
        raise FlowExecutionFailedError("Error creating gluster cluster")

    Event(
        Message(
            job_id=parameters['job_id'],
            flow_id=parameters['flow_id'],
            priority="info",
            publisher=NS.publisher_id,
            payload={"message": "Created Gluster Cluster %s" %
                                parameters['TendrlContext.integration_id']
                     }
        )
    )
Example #11
    def run(self):
        super(SetupClusterAlias, self).run()
        integration_id = self.parameters["TendrlContext.integration_id"]
        short_name = self.parameters.get("Cluster.short_name")
        alias_dir_path = "%snames" % graphite_utils.get_data_dir_path()
        if not os.path.exists(alias_dir_path):
            try:
                os.makedirs(str(alias_dir_path))
            except OSError as ex:
                raise FlowExecutionFailedError(
                    "Failed to create cluster alias dir: (%s)."
                    " Error: (%s)" % (str(alias_dir_path), ex))
        if short_name in [None, ""]:
            short_name = integration_id
        os.symlink(
            "%s/clusters/%s" %
            (graphite_utils.get_data_dir_path(), integration_id),
            "%s/%s" % (alias_dir_path, short_name))
        logger.log(
            "debug",
            NS.publisher_id, {
                "message":
                "Link %s -> %s created" %
                ("%s/%s" % (alias_dir_path, short_name), "%s/clusters/%s" %
                 (graphite_utils.get_data_dir_path(), integration_id))
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'])
        return True
Example #12
    def run(self):
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id).load()
        if _cluster.is_managed != "yes":
            raise FlowExecutionFailedError('Cluster is not managed')
        self.parameters['Service.name'] = 'collectd'
        super(ConfigureMonitoring, self).run()
Example #13
    def run(self):
        integration_id = self.parameters['TendrlContext.integration_id']
        if "Node[]" not in self.parameters:
            try:
                integration_id_index_key = \
                    "indexes/tags/tendrl/integration/%s" % integration_id
                _node_ids = NS._int.client.read(
                    integration_id_index_key).value
                self.parameters["Node[]"] = json.loads(_node_ids)
            except etcd.EtcdKeyNotFound:
                raise FlowExecutionFailedError("Cluster with "
                                               "integration_id "
                                               "(%s) not found, cannot "
                                               "import" % integration_id)
            else:
                # TODO(shtripat): ceph-installer is auto-detected and the
                # provisioner/$integration_id tag is set; the below is
                # not required for ceph
                current_tags = list(NS.node_context.tags)
                new_tags = ['provisioner/%s' % integration_id]
                new_tags += current_tags
                new_tags = list(set(new_tags))
                if new_tags != current_tags:
                    NS.node_context.tags = new_tags
                    NS.node_context.save()

                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                _cluster.enable_volume_profiling = self.parameters[
                    'Cluster.enable_volume_profiling']
                _cluster.save()
        super(ImportCluster, self).run()
Example #14
def expand_gluster(parameters):
    node_ips = get_node_ips(parameters)
    plugin = NS.gluster_provisioner.get_plugin()
    cluster = NS.tendrl.objects.Cluster(
        integration_id=parameters['TendrlContext.integration_id']).load()
    logger.log(
        "info",
        NS.publisher_id,
        {
            "message":
            "Setting up gluster nodes for cluster %s" % cluster.short_name
        },
        job_id=parameters['job_id'],
        flow_id=parameters['flow_id'],
    )

    ret_val = plugin.setup_gluster_node(node_ips,
                                        repo=NS.config.data.get(
                                            'glusterfs_repo', None))
    if ret_val is not True:
        raise FlowExecutionFailedError("Error setting up gluster node")
    logger.log(
        "info",
        NS.publisher_id,
        {"message": "Expanding gluster cluster %s" % cluster.short_name},
        job_id=parameters['job_id'],
        flow_id=parameters['flow_id'])
    failed_nodes = []
    for node in node_ips:
        ret_val = plugin.expand_gluster_cluster(node)
        if not ret_val:
            failed_nodes.append(node)

    if failed_nodes:
        raise FlowExecutionFailedError(
            "Error expanding gluster cluster. Following nodes failed: %s" %
            ",".join(failed_nodes))
    logger.log("info",
               NS.publisher_id, {
                   "message":
                   "Expanded Gluster Cluster %s"
                   " with nodes %s" % (cluster.short_name, ",".join(node_ips))
               },
               job_id=parameters['job_id'],
               flow_id=parameters['flow_id'])
Example #15
    def run(self):
        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=self.parameters['Volume.vol_id']).load()
        if 'job_id' in volume.locked_by \
            and 'status' in volume.current_job \
            and volume.current_job['status'] in ['in_progress']:
            raise FlowExecutionFailedError(
                "Another job in progress for volume."
                " Please wait till the job finishes "
                "(job_id: %s) (volume: %s) (integration_id: %s) " %
                (volume.current_job['job_id'], volume.name,
                 NS.tendrl_context.integration_id))
        _lock_details = {
            'node_id': NS.node_context.node_id,
            'fqdn': NS.node_context.fqdn,
            'tags': NS.node_context.tags,
            'type': NS.type,
            'job_name': self.__class__.__name__,
            'job_id': self.job_id
        }
        volume.locked_by = _lock_details
        volume.current_job = {
            'job_id': self.job_id,
            'job_name': self.__class__.__name__,
            'status': "in_progress"
        }
        volume.save()

        try:
            super(StopProfiling, self).run()
        except (FlowExecutionFailedError, AtomExecutionFailedError,
                Exception) as ex:
            volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=self.parameters['Volume.vol_id']).load()
            volume.current_job = {
                'job_id': self.job_id,
                'job_name': self.__class__.__name__,
                'status': "failed"
            }
            volume.locked_by = {}
            volume.save(update=False)
            raise ex

        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=self.parameters['Volume.vol_id']).load()
        volume.current_job = {
            'job_id': self.job_id,
            'job_name': self.__class__.__name__,
            'status': "finished"
        }
        volume.locked_by = {}
        volume.save(update=False)
        return True
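The acquire/mark-failed/mark-finished bookkeeping wrapped around super().run() here (and its cluster-level twin in Examples #19 and #21) could be expressed once as a context manager. A sketch under the assumption that GlusterVolume load/save behave as shown above; the simplified locked_by payload is an assumption:

import contextlib


@contextlib.contextmanager
def volume_job_lock(vol_id, job_id, job_name):
    # Sketch: take the lock, run the block, then record the outcome.
    volume = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id, vol_id=vol_id).load()
    volume.locked_by = {'job_id': job_id, 'job_name': job_name}
    volume.current_job = {'job_id': job_id, 'job_name': job_name,
                          'status': "in_progress"}
    volume.save()
    status = "failed"
    try:
        yield
        status = "finished"
    finally:
        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id, vol_id=vol_id).load()
        volume.current_job = {'job_id': job_id, 'job_name': job_name,
                              'status': status}
        volume.locked_by = {}
        volume.save(update=False)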
Example #16
    def run(self):
        try:
            all_node_status_up = True
            # check if job is parent or child
            job = NS.tendrl.objects.Job(
                job_id=self.parameters['job_id']).load()
            if "parent" not in job.payload:
                # fetch node ids using integration_id
                integration_id = self.parameters[
                    'TendrlContext.integration_id']
                key = "indexes/tags/tendrl/integration/%s" % \
                    integration_id
                node_ids_str = etcd_utils.read(key).value
                node_ids = json.loads(node_ids_str)
                # identify node status using node_id
                logger.log(
                    "info",
                    NS.publisher_id,
                    {"message": "Checking if nodes %s are up" % str(node_ids)},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'])
                nodes_up = []
                nodes_down = []
                for node in node_ids:
                    node = str(node)
                    # if node_context is not found, status defaults to DOWN
                    node_context = NS.tendrl.objects.NodeContext(
                        node_id=node, status='DOWN').load()
                    if node_context.status == "UP":
                        nodes_up.append(node)
                    else:
                        all_node_status_up = False
                        nodes_down.append(node)
                if all_node_status_up:
                    logger.log(
                        "info",
                        NS.publisher_id,
                        {"message": "Nodes %s are up" % nodes_up},
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'])
                else:
                    logger.log("info",
                               NS.publisher_id, {
                                   "message":
                                   "Nodes %s are down" % nodes_down
                               },
                               job_id=self.parameters['job_id'],
                               flow_id=self.parameters['flow_id'])
            # no need to check for a child job
            return all_node_status_up
        except (etcd.EtcdKeyNotFound, TypeError) as ex:
            raise FlowExecutionFailedError(
                "Error checking status of nodes. Error: %s" % str(ex))
Example #17
    def _execute_atom(self, atom_fqdn):
        try:
            ns, atom_name = atom_fqdn.split(".atoms.")
            ns, obj_name = ns.split(".objects.")
            ns_str = ns.split(".")[-1]

            if "integrations" in ns:
                current_ns = getattr(NS.integrations, ns_str)
            else:
                current_ns = getattr(NS, ns_str)

            runnable_atom = current_ns.ns.get_atom(obj_name, atom_name)
            try:
                ret_val = runnable_atom(parameters=self.parameters).run()
                return ret_val
            except AtomExecutionFailedError:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                raise FlowExecutionFailedError(
                    str(
                        traceback.format_exception(exc_type, exc_value,
                                                   exc_traceback)))

        except (KeyError, AttributeError) as ex:
            _msg = "Could not find atom {0}".format(atom_fqdn)
            logger.log("error",
                       NS.publisher_id, {"message": _msg},
                       job_id=self.job_id,
                       flow_id=self.parameters['flow_id'])

            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": _msg,
                                     "exception": ex
                                 }))

        return False
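The two split() calls above peel an atom FQDN apart in stages. For illustration, with a hypothetical FQDN (not taken from the codebase):

atom_fqdn = "tendrl.objects.Volume.atoms.StopProfiling"
ns, atom_name = atom_fqdn.split(".atoms.")  # "tendrl.objects.Volume"
ns, obj_name = ns.split(".objects.")        # ns="tendrl", obj_name="Volume"
ns_str = ns.split(".")[-1]                  # "tendrl"
# resolved as current_ns.ns.get_atom("Volume", "StopProfiling")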
Example #18
    def run(self):
        integration_id = self.parameters['TendrlContext.integration_id']
        if integration_id is None:
            _msg = "TendrlContext.integration_id cannot be empty"
            raise FlowExecutionFailedError(_msg)
        if "Cluster.node_configuration" not in self.parameters.keys():
            _msg = "Cluster.node_configuration cannot be empty"
            raise FlowExecutionFailedError(_msg)

        ssh_job_ids = []
        ssh_setup_script = NS.ceph_provisioner.get_plugin().setup()
        for node_id in self.parameters["Cluster.node_configuration"].keys():
            new_params = dict()
            new_params['Node[]'] = [node_id]
            new_params['ssh_setup_script'] = ssh_setup_script
            payload = {
                "tags": ["tendrl/node_%s" % node_id],
                "run": "tendrl.flows.SetupSsh",
                "status": "new",
                "parameters": new_params,
                "parent": self.parameters['job_id'],
                "type": "node"
            }
            _job_id = str(uuid.uuid4())
            Job(job_id=_job_id, status="new", payload=payload).save()
            ssh_job_ids.append(_job_id)
            Event(
                Message(job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Created SSH setup job %s for node"
                            " %s" % (_job_id, node_id)
                        }))
        while True:
            time.sleep(3)
            all_status = {}
            for job_id in ssh_job_ids:
                # noinspection PyUnresolvedReferences
                all_status[job_id] = NS._int.client.read("/queue/%s/status" %
                                                         job_id).value

            _failed = {
                _jid: status
                for _jid, status in all_status.iteritems()
                if status == "failed"
            }
            if _failed:
                raise FlowExecutionFailedError(
                    "SSH setup failed for jobs %s cluster %s" %
                    (str(_failed), integration_id))
            if all([status == "finished" for status in all_status.values()]):
                Event(
                    Message(job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message": "SSH setup completed for all nodes"
                            }))
                break
        Event(
            Message(job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message":
                        "Adding OSDs to ceph cluster %s" % integration_id
                    }))
        add_osds(self.parameters)
Example #19
    def run(self):
        integration_id = self.parameters['TendrlContext.integration_id']
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id).load()
        if _cluster.status is not None and _cluster.status != "" and \
            _cluster.status in ["importing", "unmanaging", "expanding"]:
            raise FlowExecutionFailedError(
                "Another job in progress for cluster, please wait till "
                "the job finishes (job_id: %s) (integration_id: %s) " %
                (_cluster.current_job['job_id'], integration_id))
        _lock_details = {
            'node_id': NS.node_context.node_id,
            'fqdn': NS.node_context.fqdn,
            'tags': NS.node_context.tags,
            'type': NS.type,
            'job_name': self.__class__.__name__,
            'job_id': self.job_id
        }
        _cluster.locked_by = _lock_details
        _cluster.status = "expanding"
        _cluster.current_job = {
            'job_id': self.job_id,
            'job_name': self.__class__.__name__,
            'status': 'in_progress'
        }
        _cluster.save()

        try:
            integration_id_index_key = \
                "indexes/tags/tendrl/integration/%s" % integration_id
            node_ids = etcd_utils.read(integration_id_index_key).value
            node_ids = json.loads(node_ids)
        except etcd.EtcdKeyNotFound:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id).load()
            _cluster.locked_by = {}
            _cluster.status = "expand_pending"
            _cluster.current_job = {
                'job_id': self.job_id,
                'job_name': self.__class__.__name__,
                'status': 'failed'
            }
            _cluster.save()
            raise FlowExecutionFailedError("Cluster with integration_id "
                                           "(%s) not found, cannot "
                                           "import" % integration_id)

        job_ids = []
        new_peers = []
        # Remove the current node from the list as it's already
        # participating in the cluster
        node_ids.remove(NS.node_context.node_id)
        for node_id in node_ids:
            _cnc = NS.tendrl.objects.ClusterNodeContext(node_id=node_id).load()
            if _cnc.is_managed not in [None, ""] \
                and _cnc.is_managed.lower() == "yes":
                continue

            params = {
                'TendrlContext.integration_id': integration_id,
                'Node[]': [node_id],
                'Cluster.volume_profiling_flag': _cluster.volume_profiling_flag
            }
            payload = {
                "tags": ["tendrl/node_%s" % node_id],
                "run": "tendrl.flows.ImportCluster",
                "status": "new",
                "parent": self.parameters['job_id'],
                "parameters": params,
                "type": "node"
            }
            _job_id = str(uuid.uuid4())
            NS.tendrl.objects.Job(job_id=_job_id,
                                  status="new",
                                  payload=payload).save()
            logger.log("info",
                       NS.publisher_id, {
                           "message":
                           "ImportCluster %s (jobID: %s) : "
                           "importing host %s" %
                           (_cluster.short_name, _job_id, node_id)
                       },
                       job_id=self.parameters['job_id'])
            job_ids.append(_job_id)
            new_peers.append(node_id)

        loop_count = 0
        # Wait for (no of nodes) * 6 minutes for import to complete
        wait_count = len(job_ids) * 36
        while True:
            child_jobs_failed = []
            if loop_count >= wait_count:
                logger.log(
                    "info",
                    NS.publisher_id, {
                        "message":
                        "Import jobs not yet complete "
                        "on all new nodes %s on cluster %s. Timing out. " %
                        (str(node_ids), _cluster.short_name)
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'])
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=integration_id).load()
                _cluster.locked_by = {}
                _cluster.status = "expand_pending"
                _cluster.current_job = {
                    'job_id': self.job_id,
                    'job_name': self.__class__.__name__,
                    'status': 'failed'
                }
                _cluster.save()
                raise FlowExecutionFailedError(
                    "Failed to expand cluster with integration_id "
                    "(%s)" % integration_id)

            time.sleep(10)
            finished = True
            for job_id in job_ids:
                job = NS.tendrl.objects.Job(job_id=job_id).load()
                if job.status not in ["finished", "failed"]:
                    finished = False
                elif job.status == "failed":
                    child_jobs_failed.append(job.job_id)
            if finished:
                break
            else:
                loop_count += 1
                continue
        if len(child_jobs_failed) > 0:
            _msg = "Child jobs failed are %s" % child_jobs_failed
            logger.log("error",
                       NS.publisher_id, {"message": _msg},
                       job_id=self.parameters['job_id'],
                       flow_id=self.parameters['flow_id'])
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id).load()
            _cluster.status = "expand_pending"
            _cluster.locked_by = {}
            _cluster.current_job = {
                'status': "failed",
                'job_name': self.__class__.__name__,
                'job_id': self.job_id
            }
            _cluster.save()
            raise FlowExecutionFailedError(
                "Failed to expand cluster with integration_id "
                "(%s)" % integration_id)
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id).load()
        _cluster.status = ""
        _cluster.locked_by = {}
        _cluster.current_job = {
            'status': "finished",
            'job_name': self.__class__.__name__,
            'job_id': self.job_id
        }
        _cluster.save()

        logger.log("info",
                   NS.publisher_id, {
                       "message":
                       "Newly detected nodes: %s added to the "
                       "cluster %s)" % (str(new_peers), _cluster.short_name),
                   },
                   job_id=self.parameters['job_id'],
                   flow_id=self.parameters['flow_id'])
        return True
Example #20
    def run(self):
        try:
            # Lock nodes
            flow_utils.acquire_node_lock(self.parameters)
            integration_id = self.parameters['TendrlContext.integration_id']
            if integration_id is None:
                raise FlowExecutionFailedError(
                    "TendrlContext.integration_id cannot be empty")

            supported_sds = NS.compiled_definitions.get_parsed_defs(
            )['namespace.tendrl']['supported_sds']
            sds_name = self.parameters["TendrlContext.sds_name"]
            if sds_name not in supported_sds:
                raise FlowExecutionFailedError("SDS (%s) not supported" %
                                               sds_name)

            ssh_job_ids = []
            ssh_job_ids = \
                flow_utils.gluster_create_ssh_setup_jobs(
                    self.parameters,
                    skip_current_node=True
                )

            while True:
                time.sleep(3)
                all_status = {}
                for job_id in ssh_job_ids:
                    job = NS.tendrl.objects.Job(job_id=job_id).load()
                    all_status[job_id] = job.status

                _failed = {
                    _jid: status
                    for _jid, status in all_status.iteritems()
                    if status == "failed"
                }
                if _failed:
                    raise FlowExecutionFailedError(
                        "SSH setup failed for jobs %s cluster %s" %
                        (str(_failed), integration_id))
                if all(
                    [status == "finished" for status in all_status.values()]):
                    logger.log("info",
                               NS.publisher_id, {
                                   "message":
                                   "SSH setup completed for all "
                                   "nodes in cluster %s" % integration_id
                               },
                               job_id=self.parameters['job_id'],
                               flow_id=self.parameters['flow_id'])

                    break

            # SSH setup jobs finished above, now install sds
            # bits and create cluster
            logger.log("info",
                       NS.publisher_id, {
                           "message":
                           "Expanding Gluster Storage"
                           " Cluster %s" % integration_id
                       },
                       job_id=self.parameters['job_id'],
                       flow_id=self.parameters['flow_id'])
            gluster_help.expand_gluster(self.parameters)
            logger.log(
                "info",
                NS.publisher_id, {
                    "message":
                    "SDS install/config completed on newly "
                    "expanded nodes, Please wait while "
                    "tendrl-node-agents detect sds details on the newly "
                    "expanded nodes %s" % self.parameters['Node[]']
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])

            # Wait till the detected cluster info is populated for the nodes
            while True:
                time.sleep(3)
                all_status = []
                detected_cluster = ""
                different_cluster_id = False
                dc = ""
                for node in self.parameters['Node[]']:
                    try:
                        dc = NS.tendrl.objects.DetectedCluster(
                            node_id=node).load()
                        if not detected_cluster:
                            detected_cluster = dc.detected_cluster_id
                        else:
                            if detected_cluster != dc.detected_cluster_id:
                                all_status.append(False)
                                different_cluster_id = True
                                break
                        all_status.append(True)
                    except etcd.EtcdKeyNotFound:
                        all_status.append(False)
                if different_cluster_id:
                    raise FlowExecutionFailedError(
                        "Seeing different detected cluster id in"
                        " different nodes. %s and %s" %
                        (detected_cluster, dc.detected_cluster_id))

                if all_status:
                    if all(all_status):
                        break

            # Create the params list for import cluster flow
            new_params = dict()
            new_params['Node[]'] = self.parameters['Node[]']
            new_params['TendrlContext.integration_id'] = integration_id

            # Get node context for one of the nodes from list
            dc = NS.tendrl.objects.DetectedCluster(
                node_id=self.parameters['Node[]'][0]).load()
            sds_pkg_name = dc.sds_pkg_name
            new_params['import_after_expand'] = True
            sds_pkg_version = dc.sds_pkg_version
            new_params['DetectedCluster.sds_pkg_name'] = \
                sds_pkg_name
            new_params['DetectedCluster.sds_pkg_version'] = \
                sds_pkg_version

            tags = []
            for node in self.parameters['Node[]']:
                tags.append("tendrl/node_%s" % node)
            payload = {
                "tags": tags,
                "run": "tendrl.flows.ImportCluster",
                "status": "new",
                "parameters": new_params,
                "parent": self.parameters['job_id'],
                "type": "node"
            }
            _job_id = str(uuid.uuid4())
            # release lock before import cluster
            flow_utils.release_node_lock(self.parameters)

            NS.tendrl.objects.Job(job_id=_job_id,
                                  status="new",
                                  payload=payload).save()
            logger.log(
                "info",
                NS.publisher_id, {
                    "message":
                    "Please wait while Tendrl imports ("
                    "job_id: %s) newly expanded "
                    "%s storage nodes in cluster %s" %
                    (_job_id, sds_pkg_name,
                     NS.tendrl.objects.Cluster(
                         integration_id=integration_id).load().short_name)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])
        except Exception as ex:
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": ex.message,
                                     "exception": ex
                                 }))
            # raising exception to mark job as failed
            raise ex
        finally:
            # always release the lock, even if an exception occurred
            flow_utils.release_node_lock(self.parameters)
Example #21
    def run(self):
        integration_id = self.parameters['TendrlContext.integration_id']
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id).load()
        if _cluster.is_managed == "no":
            if _cluster.current_job['job_name'] == self.__class__.__name__ \
                and _cluster.current_job['status'] == 'finished':
                raise FlowExecutionFailedError(
                    "Cluster is already in un-managed state")
        if _cluster.current_job['status'] == 'in_progress' and \
            (
                'job_id' in _cluster.locked_by and
                _cluster.locked_by['job_id'] != ""
            ) and (
                _cluster.status in ['importing', 'unmanaging', 'expanding']
            ):
            raise FlowExecutionFailedError(
                "Another job in progress for cluster."
                " Please wait till the job finishes "
                "(job_id: %s) (integration_id: %s) " %
                (_cluster.current_job['job_id'], _cluster.integration_id))

        _lock_details = {
            'node_id': NS.node_context.node_id,
            'fqdn': NS.node_context.fqdn,
            'tags': NS.node_context.tags,
            'type': NS.type,
            'job_name': self.__class__.__name__,
            'job_id': self.job_id
        }
        _cluster.locked_by = _lock_details
        _cluster.status = "unmanaging"
        _cluster.current_job = {
            'job_id': self.job_id,
            'job_name': self.__class__.__name__,
            'status': "in_progress"
        }
        _cluster.save()

        try:
            super(UnmanageCluster, self).run()
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id).load()
            _cluster.status = ""
            _cluster.is_managed = "no"
            _cluster.locked_by = {}
            _cluster.errors = []
            _cluster.current_job = {
                'status': "finished",
                'job_name': self.__class__.__name__,
                'job_id': self.job_id
            }
            _cluster.save()
        except (FlowExecutionFailedError, AtomExecutionFailedError,
                Exception) as ex:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id).load()
            _cluster.status = ""
            _cluster.locked_by = {}
            _cluster.current_job = {
                'status': "failed",
                'job_name': self.__class__.__name__,
                'job_id': self.job_id
            }
            _errors = []
            if hasattr(ex, 'message'):
                _errors = [ex.message]
            else:
                _errors = [str(ex)]
            if _errors:
                _cluster.errors = _errors
            _cluster.save()
            raise FlowExecutionFailedError(
                str(
                    traceback.format_exception(exc_type, exc_value,
                                               exc_traceback)))
Example #22
    def run(self):
        try:
            # Lock nodes
            create_cluster_utils.acquire_node_lock(self.parameters)
            integration_id = self.parameters['TendrlContext.integration_id']
            sds_name = self.parameters['DetectedCluster.sds_pkg_name']

            if not self.parameters.get('import_after_expand', False) and \
                not self.parameters.get('import_after_create', False):

                # check if gdeploy is already provisioned in this cluster;
                # if not, it has to be provisioned here
                if sds_name.find("gluster") > -1 and \
                    not self.parameters.get("gdeploy_provisioned", False) and \
                    not self._probe_and_mark_provisioner(
                        self.parameters["Node[]"], integration_id
                    ):
                    create_cluster_utils.install_gdeploy()
                    create_cluster_utils.install_python_gdeploy()
                    ssh_job_ids = create_cluster_utils.gluster_create_ssh_setup_jobs(
                        self.parameters)

                    while True:
                        gevent.sleep(3)
                        all_status = {}
                        for job_id in ssh_job_ids:
                            all_status[job_id] = NS._int.client.read(
                                "/queue/%s/status" % job_id).value

                        _failed = {
                            _jid: status
                            for _jid, status in all_status.iteritems()
                            if status == "failed"
                        }
                        if _failed:
                            raise AtomExecutionFailedError(
                                "SSH setup failed for jobs %s cluster %s" %
                                (str(_failed), integration_id))
                        if all([
                                status == "finished"
                                for status in all_status.values()
                        ]):
                            Event(
                                Message(
                                    job_id=self.parameters['job_id'],
                                    flow_id=self.parameters['flow_id'],
                                    priority="info",
                                    publisher=NS.publisher_id,
                                    payload={
                                        "message":
                                        "SSH setup completed for all nodes in cluster %s"
                                        % integration_id
                                    }))
                            # set this node as gluster provisioner
                            tags = ["provisioner/%s" % integration_id]
                            NS.node_context = NS.node_context.load()
                            tags += NS.node_context.tags
                            NS.node_context.tags = list(set(tags))
                            NS.node_context.save()

                            # set gdeploy_provisioned to true so that no
                            # other node tries to configure gdeploy
                            self.parameters['gdeploy_provisioned'] = True
                            break

            NS.tendrl_context = NS.tendrl_context.load()
            NS.tendrl_context.integration_id = integration_id
            _detected_cluster = NS.tendrl.objects.DetectedCluster().load()
            NS.tendrl_context.cluster_id = _detected_cluster.detected_cluster_id
            NS.tendrl_context.cluster_name = _detected_cluster.detected_cluster_name
            NS.tendrl_context.sds_name = _detected_cluster.sds_pkg_name
            NS.tendrl_context.sds_version = _detected_cluster.sds_pkg_version
            NS.tendrl_context.save()
            Event(
                Message(job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Registered Node %s with cluster %s" %
                            (NS.node_context.node_id,
                             NS.tendrl_context.integration_id)
                        }))

            node_list = self.parameters['Node[]']
            cluster_nodes = []
            if len(node_list) > 1:
                # This is the master node for this flow
                for node in node_list:
                    if NS.node_context.node_id != node:
                        new_params = self.parameters.copy()
                        new_params['Node[]'] = [node]
                        # create same flow for each node in node list except $this
                        payload = {
                            "tags": ["tendrl/node_%s" % node],
                            "run": "tendrl.flows.ImportCluster",
                            "status": "new",
                            "parameters": new_params,
                            "parent": self.parameters['job_id'],
                            "type": "node"
                        }
                        _job_id = str(uuid.uuid4())
                        cluster_nodes.append(_job_id)
                        Job(job_id=_job_id, status="new",
                            payload=payload).save()
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "Importing (job: %s) Node %s to cluster %s"
                                    % (_job_id, node, integration_id)
                                }))

            if "ceph" in sds_name.lower():
                node_context = NS.node_context.load()
                is_mon = False
                for tag in node_context.tags:
                    mon_tag = NS.compiled_definitions.get_parsed_defs(
                    )['namespace.tendrl']['tags']['ceph-mon']
                    if mon_tag in tag:
                        is_mon = True
                if is_mon:
                    # Check if the minimum required version of the
                    # underlying ceph cluster is met; if not, fail the
                    # import task
                    detected_cluster = NS.tendrl.objects.DetectedCluster(
                    ).load()
                    detected_cluster_ver = detected_cluster.sds_pkg_version.split(
                        '.')
                    maj_ver = detected_cluster_ver[0]
                    min_ver = detected_cluster_ver[1]
                    reqd_ceph_ver = NS.compiled_definitions.get_parsed_defs(
                    )['namespace.tendrl']['min_reqd_ceph_ver']
                    req_maj_ver, req_min_ver, req_rel = reqd_ceph_ver.split(
                        '.')
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Check: Minimum required version (%s.%s.%s) of Ceph Storage"
                                % (req_maj_ver, req_min_ver, req_rel)
                            }))
                    if int(maj_ver) < int(req_maj_ver) or \
                        int(min_ver) < int(req_min_ver):
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="error",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "Error: Minimum required version (%s.%s.%s) "
                                    "doesnt match that of detected Ceph Storage (%s.%s.%s)"
                                    % (req_maj_ver, req_min_ver, req_rel,
                                       maj_ver, min_ver, 0)
                                }))

                        raise FlowExecutionFailedError(
                            "Detected ceph version: %s"
                            " is lesser than required version: %s" %
                            (detected_cluster.sds_pkg_version, reqd_ceph_ver))
                    import_ceph(self.parameters)
            else:
                # Check if minimum required version of underlying gluster
                # cluster met. If not fail the import task
                detected_cluster = NS.tendrl.objects.DetectedCluster().load()
                detected_cluster_ver = detected_cluster.sds_pkg_version.split(
                    '.')
                maj_ver = detected_cluster_ver[0]
                min_ver = detected_cluster_ver[1]
                reqd_gluster_ver = NS.compiled_definitions.get_parsed_defs(
                )['namespace.tendrl']['min_reqd_gluster_ver']
                req_maj_ver, req_min_ver, req_rel = reqd_gluster_ver.split('.')
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Check: Minimum required version (%s.%s.%s) of Gluster Storage"
                            % (req_maj_ver, req_min_ver, req_rel)
                        }))
                if int(maj_ver) < int(req_maj_ver) or \
                        (int(maj_ver) == int(req_maj_ver) and
                         int(min_ver) < int(req_min_ver)):
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="error",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Error: Minimum required version (%s.%s.%s) "
                                "doesnt match that of detected Gluster Storage (%s.%s.%s)"
                                % (req_maj_ver, req_min_ver, req_rel, maj_ver,
                                   min_ver, 0)
                            }))

                    raise FlowExecutionFailedError(
                        "Detected gluster version: %s"
                        " is lesser than required version: %s" %
                        (detected_cluster.sds_pkg_version, reqd_gluster_ver))
                import_gluster(self.parameters)

            Event(
                Message(job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Waiting for participant nodes %s to be "
                            "imported %s" % (node_list, integration_id)
                        }))

            # An import is successful once every node in Node[] has
            # registered under /clusters/:integration_id/nodes/:node_id
            while True:
                _all_node_status = []
                gevent.sleep(3)
                for node_id in self.parameters['Node[]']:
                    _status = NS.tendrl.objects.ClusterNodeContext(node_id=node_id).exists() \
                        and NS.tendrl.objects.ClusterTendrlContext(
                            integration_id=integration_id
                        ).exists()
                    _all_node_status.append(_status)
                if _all_node_status:
                    if all(_all_node_status):
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "Import Cluster completed for all nodes "
                                    "in cluster %s" % integration_id
                                }))

                        break

            Event(
                Message(job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Sucessfully imported cluster %s" % integration_id
                        }))
        except Exception as ex:
            # For traceback
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": ex.message,
                                     "exception": ex
                                 }))
            # raising exception to mark job as failed
            raise ex
        finally:
            # release lock
            create_cluster_utils.release_node_lock(self.parameters)

        return True
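
# A minimal, self-contained sketch (not from the Tendrl sources) of the
# major/minor version gate used above. Python compares tuples element by
# element, so expressing the check as one tuple comparison avoids testing
# the major and minor parts independently. (Version strings with release
# suffixes would need the re-based cleanup Example #27 applies.)
def meets_min_version(detected_pkg_version, required_version):
    # "10.2.5" -> (10, 2); only major.minor take part in the gate above
    detected = tuple(int(p) for p in detected_pkg_version.split('.')[:2])
    required = tuple(int(p) for p in required_version.split('.')[:2])
    return detected >= required

assert meets_min_version("4.0.1", "3.2.0")      # newer major passes
assert not meets_min_version("3.1.9", "3.2.0")  # same major, older minor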
Example #23
0
    def run(self):
        action = self.parameters["Cluster.volume_profiling_flag"]
        if action not in VOL_PROFILE_ACTIONS:
            raise FlowExecutionFailedError(
                "Invalid value of Cluster.volume_profiling_flag "
                "(%s) while enabling/disabling volume profiling for "
                "cluster (%s). Valid values are enable/disable" %
                (
                    action,
                    NS.tendrl_context.integration_id
                )
            )

        _cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        _lock_details = {
            'node_id': NS.node_context.node_id,
            'tags': NS.node_context.tags,
            'type': NS.type,
            'job_name': self.__class__.__name__,
            'job_id': self.job_id
        }
        _cluster.locked_by = _lock_details
        _cluster.status = "set_volume_profiling"
        _cluster.current_job = {
            'job_id': self.job_id,
            'job_name': self.__class__.__name__,
            'status': 'in_progress'
        }
        _cluster.save()

        volumes = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id
        ).load_all() or []
        failed_vols = []
        for volume in volumes:
            out, err, rc = cmd_utils.Command(
                "gluster volume profile %s %s" %
                (volume.name, VOL_PROFILE_ACTIONS[action])
            ).run()
            if err != "" or rc != 0:
                logger.log(
                    "info",
                    NS.publisher_id,
                    {
                        "message": "%s profiling failed for volume: %s."
                        " Error: %s" %
                        (action, volume.name, err)
                    },
                    job_id=self.parameters["job_id"],
                    flow_id=self.parameters["flow_id"]
                )
                failed_vols.append(volume.name)
            else:
                if action == "enable":
                    volume.profiling_enabled = "yes"
                else:
                    volume.profiling_enabled = "no"
                volume.save()
        if failed_vols:
            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message": "%s profiling failed for "
                    "volumes: %s" % (action, str(failed_vols))
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters["flow_id"]
            )

        _cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        _cluster.status = ""
        _cluster.locked_by = {}
        _cluster.current_job = {
            'status': "finished",
            'job_name': self.__class__.__name__,
            'job_id': self.job_id
        }
        _cluster.volume_profiling_state = "%sd" % action
        _cluster.save()
        return True
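
# Hypothetical definition of the VOL_PROFILE_ACTIONS mapping the snippet
# above indexes into (the real constant lives elsewhere in the module).
# The underlying CLI is "gluster volume profile <VOLNAME> start|stop", so
# the flag values would map onto those sub-commands:
VOL_PROFILE_ACTIONS = {
    "enable": "start",    # gluster volume profile <vol> start
    "disable": "stop",    # gluster volume profile <vol> stop
}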
Example #24
0
    def run(self):
        # Execute the pre runs for the flow
        msg = "Processing pre-runs for flow: %s" % self.to_str
        Event(
            Message(priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": msg}))
        # Check for mandatory parameters
        if 'mandatory' in self._defs.get('inputs', {}):
            for item in self._defs['inputs']['mandatory']:
                if item not in self.parameters:
                    msg = "Mandatory parameter %s not provided" % item
                    Event(
                        Message(job_id=self.job_id,
                                flow_id=self.parameters['flow_id'],
                                priority="warning",
                                publisher=NS.publisher_id,
                                payload={"message": msg}))

                    raise FlowExecutionFailedError("Mandatory parameter %s "
                                                   "not provided" % item)

        if self._defs.get("pre_run") is not None:
            for atom_fqn in self._defs.get("pre_run"):
                msg = "Start pre-run : %s" % atom_fqn
                Event(
                    Message(priority="debug",
                            publisher=NS.publisher_id,
                            payload={"message": msg}))

                ret_val = self._execute_atom(atom_fqn)

                if not ret_val:
                    msg = "Failed pre-run: %s for flow: %s" % \
                          (atom_fqn, self._defs['help'])
                    Event(
                        Message(job_id=self.job_id,
                                flow_id=self.parameters['flow_id'],
                                priority="error",
                                publisher=NS.publisher_id,
                                payload={"message": msg}))
                    raise AtomExecutionFailedError(
                        "Error executing pre run function: %s for flow: %s" %
                        (atom_fqn, self._defs['help']))
                else:
                    msg = "Finished pre-run: %s for flow: %s" %\
                          (atom_fqn, self._defs['help'])
                    Event(
                        Message(priority="debug",
                                publisher=NS.publisher_id,
                                payload={"message": msg}))

        # Execute the atoms for the flow
        msg = "Processing atoms for flow: %s" % self._defs['help']
        Event(
            Message(priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": msg}))

        if self._defs.get("atoms") is not None:
            for atom_fqn in self._defs.get("atoms"):
                msg = "Start atom : %s" % atom_fqn
                Event(
                    Message(priority="debug",
                            publisher=NS.publisher_id,
                            payload={"message": msg}))

                ret_val = self._execute_atom(atom_fqn)

                if not ret_val:
                    msg = "Failed atom: %s on flow: %s" % \
                          (atom_fqn, self._defs['help'])
                    Event(
                        Message(job_id=self.job_id,
                                flow_id=self.parameters['flow_id'],
                                priority="error",
                                publisher=NS.publisher_id,
                                payload={"message": msg}))
                    raise AtomExecutionFailedError(
                        "Error executing atom: %s on flow: %s" %
                        (atom_fqn, self._defs['help']))
                else:
                    msg = 'Finished atom %s for flow: %s' %\
                          (atom_fqn, self._defs['help'])
                    Event(
                        Message(priority="debug",
                                publisher=NS.publisher_id,
                                payload={"message": msg}))

        # Execute the post runs for the flow
        msg = "Processing post-runs for flow: %s" % self._defs['help']
        Event(
            Message(priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": msg}))
        if self._defs.get("post_run") is not None:
            for atom_fqn in self._defs.get("post_run"):
                msg = "Start post-run : %s" % atom_fqn
                Event(
                    Message(priority="debug",
                            publisher=NS.publisher_id,
                            payload={"message": msg}))

                ret_val = self._execute_atom(atom_fqn)

                if not ret_val:
                    msg = "Failed post-run: %s for flow: %s" % \
                          (atom_fqn, self._defs['help'])
                    Event(
                        Message(job_id=self.job_id,
                                flow_id=self.parameters['flow_id'],
                                priority="error",
                                publisher=NS.publisher_id,
                                payload={"message": msg}))
                    raise AtomExecutionFailedError(
                        "Error executing post run function: %s" % atom_fqn)
                else:
                    msg = "Finished post-run: %s for flow: %s" %\
                          (atom_fqn, self._defs['help'])
                    Event(
                        Message(priority="debug",
                                publisher=NS.publisher_id,
                                payload={"message": msg}))
Example #25
0
    def run(self):
        if "Node[]" not in self.parameters:
            integration_id = self.parameters['TendrlContext.integration_id']

            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            if (_cluster.import_job_id is not None and
                    _cluster.import_job_id != "") or _cluster.import_status \
                    in ["in_progress", "done", "failed"]:
                raise FlowExecutionFailedError(
                    "Cluster already being imported by another Job, please "
                    "wait till "
                    "the job finishes (job_id: %s) (integration_id: %s) " %
                    (_cluster.import_job_id, _cluster.integration_id))

            _cluster.import_status = "in_progress"
            _cluster.import_job_id = self.job_id
            _cluster.save()

            try:
                integration_id_index_key = \
                    "indexes/tags/tendrl/integration/%s" % integration_id
                _node_ids = NS._int.client.read(integration_id_index_key).value
                self.parameters["Node[]"] = json.loads(_node_ids)
            except etcd.EtcdKeyNotFound:
                raise FlowExecutionFailedError("Cluster with "
                                               "integration_id "
                                               "(%s) not found, cannot "
                                               "import" % integration_id)
            else:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id).load()
                _cluster.enable_volume_profiling = self.parameters[
                    'Cluster.enable_volume_profiling']
                _cluster.save()

                # Try to claim "provisioner/:integration_id" tag
                try:
                    _tag = "provisioner/%s" % _cluster.integration_id
                    _index_key = "/indexes/tags/%s" % _tag
                    _node_id = json.dumps([NS.node_context.node_id])
                    NS._int.wclient.write(_index_key,
                                          _node_id,
                                          prevExist=False)
                    # TODO(shtripat) ceph-installer is auto-detected and
                    # the provisioner/$integration_id tag is set; the
                    # following is not required for ceph
                    current_tags = list(NS.node_context.tags)
                    new_tags = ['provisioner/%s' % integration_id]
                    new_tags += current_tags
                    new_tags = list(set(new_tags))
                    if new_tags != current_tags:
                        NS.node_context.tags = new_tags
                        NS.node_context.save()
                except etcd.EtcdAlreadyExist:
                    pass

        try:
            super(ImportCluster, self).run()
        except (FlowExecutionFailedError, AtomExecutionFailedError,
                Exception) as ex:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            _cluster.import_status = "failed"
            _errors = []
            if hasattr(ex, 'message'):
                _errors = [ex.message]
            else:
                _errors = [str(ex)]
            if _errors:
                _cluster.errors = _errors
            _cluster.save()
            raise ex
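
# Sketch of the claim-once idiom used above for the provisioner tag:
# writing the index key with prevExist=False makes etcd reject the write
# when any node has already claimed provisioner/<integration_id>, so
# exactly one node in the cluster wins the role.
import json

import etcd

def try_claim_provisioner(client, integration_id, node_id):
    key = "/indexes/tags/provisioner/%s" % integration_id
    try:
        client.write(key, json.dumps([node_id]), prevExist=False)
        return True   # this node is now the provisioner
    except etcd.EtcdAlreadyExist:
        return False  # another node claimed the tag first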
Example #26
0
    def run(self):
        ssh_key = self.parameters['ssh_key']
        ret_val, err = authorize_key.AuthorizeKey(ssh_key).run()
        if ret_val is not True or err != "":
            raise FlowExecutionFailedError(err)
        return True
Example #27
0
    def run(self):
        try:
            integration_id = self.parameters['TendrlContext.integration_id']

            # Lock nodes
            create_cluster_utils.acquire_node_lock(self.parameters)
            NS.tendrl_context = NS.tendrl_context.load()

            # TODO(team) when Tendrl supports create/expand/shrink cluster
            # setup passwordless ssh for all gluster nodes with given
            # integration_id (check
            # /indexes/tags/tendrl/integration/$integration_id for list of
            # nodes in cluster

            node_list = self.parameters['Node[]']
            cluster_nodes = []
            if len(node_list) > 1:
                # This is the master node for this flow
                for node in node_list:
                    if NS.node_context.node_id != node:
                        new_params = self.parameters.copy()
                        new_params['Node[]'] = [node]
                        # create same flow for each node in node list except
                        #  $this
                        payload = {"tags": ["tendrl/node_%s" % node],
                                   "run": "tendrl.flows.ImportCluster",
                                   "status": "new",
                                   "parameters": new_params,
                                   "parent": self.parameters['job_id'],
                                   "type": "node"
                                   }
                        _job_id = str(uuid.uuid4())
                        cluster_nodes.append(_job_id)
                        Job(job_id=_job_id,
                            status="new",
                            payload=payload).save()
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message": "Importing (job: %s) Node %s "
                                               "to cluster %s" %
                                    (_job_id, node, integration_id)
                                }
                            )
                        )
            # Check if minimum required version of underlying gluster
            # cluster met. If not fail the import task
            cluster_ver = \
                NS.tendrl_context.sds_version.split('.')
            maj_ver = cluster_ver[0]
            min_ver = re.findall(r'\d+', cluster_ver[1])[0]
            reqd_gluster_ver = NS.compiled_definitions.get_parsed_defs()[
                'namespace.tendrl'
            ]['min_reqd_gluster_ver']
            req_maj_ver, req_min_ver, req_rel = reqd_gluster_ver.split('.')
            Event(
                Message(
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Check: Minimum required version ("
                                   "%s.%s.%s) of Gluster Storage" %
                        (req_maj_ver, req_min_ver, req_rel)
                    }
                )
            )
            ver_check_failed = False
            if int(maj_ver) < int(req_maj_ver):
                ver_check_failed = True
            else:
                if int(maj_ver) == int(req_maj_ver) and \
                        int(min_ver) < int(req_min_ver):
                    ver_check_failed = True

            if ver_check_failed:
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Error: Minimum required version "
                                       "(%s.%s.%s) "
                            "doesnt match that of detected Gluster "
                                       "Storage (%s.%s.%s)" %
                            (req_maj_ver, req_min_ver, req_rel,
                             maj_ver, min_ver, 0)
                        }
                    )
                )

                raise FlowExecutionFailedError(
                    "Detected gluster version: %s"
                    " is lesser than required version: %s" %
                    (
                        NS.tendrl_context.sds_version,
                        reqd_gluster_ver
                    )
                )
            if not import_gluster(self.parameters):
                return False

            if len(node_list) > 1:
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Waiting for participant nodes %s to "
                                       "be "
                            "imported %s" % (node_list, integration_id)
                        }
                    )
                )
                loop_count = 0
                # Wait up to 6 minutes per peer node for the imports to complete
                wait_count = (len(node_list) - 1) * 36
                while True:
                    parent_job = Job(job_id=self.parameters['job_id']).load()
                    if loop_count >= wait_count:
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message": "Import jobs not yet complete "
                                    "on all nodes. Timing out. (%s, %s)" %
                                    (str(node_list), integration_id)
                                }
                            )
                        )
                        return False
                    time.sleep(10)
                    finished = True
                    for child_job_id in parent_job.children:
                        child_job = Job(job_id=child_job_id).load()
                        if child_job.status != "finished":
                            finished = False
                            break
                    if finished:
                        break
                    else:
                        loop_count += 1
                        continue

        except Exception as ex:
            # For traceback
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={
                        "message": ex.message,
                        "exception": ex
                    }
                )
            )
            # raising exception to mark job as failed
            raise ex
        finally:
            # release lock
            create_cluster_utils.release_node_lock(self.parameters)

        return True
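
# The polling budget above works out to 6 minutes per peer node:
# (len(node_list) - 1) child jobs * 36 iterations * 10-second sleeps.
# The same wait as a reusable helper, assuming the Job class from above:
import time

def wait_for_child_jobs(parent_job_id, n_children, interval=10,
                        tries_per_child=36):
    for _ in range(n_children * tries_per_child):
        parent = Job(job_id=parent_job_id).load()
        if all(Job(job_id=c).load().status == "finished"
               for c in parent.children):
            return True
        time.sleep(interval)
    return False  # timed out; the caller marks the flow as failed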
Example #28
0
def mock_invoke_flow(flow, job):
    raise FlowExecutionFailedError("Flow Execution failed")
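
# A sketch of how such a stub is exercised in a test (pytest assumed, and
# the names above assumed importable in the test module): the stub stands
# in for the real flow invocation so the failure path can be driven.
import pytest

def test_mock_invoke_flow_raises():
    with pytest.raises(FlowExecutionFailedError):
        mock_invoke_flow(flow=None, job=None)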
Example #29
0
def process_job(job):
    jid = job.key.split('/')[-1]
    job_status_key = "/queue/%s/status" % jid
    job_lock_key = "/queue/%s/locked_by" % jid
    NS.node_context = NS.node_context.load()
    # Check job not already locked by some agent
    try:
        _locked_by = etcd_utils.read(job_lock_key).value
        if _locked_by:
            return
    except etcd.EtcdKeyNotFound:
        pass

    # Check job not already "finished", or "processing"
    try:
        _status = etcd_utils.read(job_status_key).value
        if _status in ["finished", "processing"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _job_timeout_key = "/queue/%s/timeout" % jid
        _timeout = None
        _timeout = etcd_utils.read(_job_timeout_key).value
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
        _timeout == "yes":
        _job_valid_until_key = "/queue/%s/valid_until" % jid
        _valid_until = None
        try:
            _valid_until = etcd_utils.read(
                _job_valid_until_key).value
        except etcd.EtcdKeyNotFound:
            pass

        if _valid_until:
            _now_epoch = (time_utils.now() -
                          datetime.datetime(1970, 1,
                                            1).replace(
                              tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has been in "new" status for over 10 minutes;
                # mark it "failed" and set Job.errors to "Timed out"
                try:
                    etcd_utils.write(job_status_key,
                                     "failed",
                                     prevValue="new")
                except etcd.EtcdCompareFailed:
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    _msg = "Timed out (>10 min in 'new' state)"
                    job.errors = _msg
                    job.save()
                    if job.payload.get('parent') is None:
                        alert_utils.alert_job_status(
                            "failed",
                            "Job timed out (job_id: %s)" % jid,
                            integration_id=NS.tendrl_context.integration_id or
                            job.payload['parameters'].get(
                                'TendrlContext.integration_id'
                            ),
                            cluster_name=NS.tendrl_context.cluster_name or
                            job.payload['parameters'].get(
                                'TendrlContext.cluster_name'
                            )
                        )
                    return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)

            # noinspection PyTypeChecker
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            etcd_utils.write(_job_valid_until_key,
                             int(_now_plus_10_epoch))

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": _msg}
            )
            return

        job_status_key = "/queue/%s/status" % job.job_id
        job_lock_key = "/queue/%s/locked_by" % job.job_id
        try:
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             tags=NS.node_context.tags,
                             type=NS.type)
            etcd_utils.write(job_status_key, "processing",
                             prevValue="new")
            etcd_utils.write(job_lock_key,
                             json.dumps(lock_info))
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            the_flow = runnable_flow(parameters=job.payload[
                'parameters'], job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Processing Job %s" %
                            job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running Flow %s" %
                            job.payload['run']},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            the_flow.run()
            try:
                etcd_utils.write(job_status_key,
                                 "finished",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s):  Finished "
                            "Flow %s" % (
                                job.job_id,
                                job.payload['run'])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "Job finished successfully (job_id: %s)" % job.job_id,
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            _trace = traceback.format_exc()
            _msg = "Failure in Job %s Flow %s with error: " % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )

            try:
                etcd_utils.write(job_status_key,
                                 "failed",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
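
# Sketch of the optimistic-locking idiom process_job() is built on: etcd's
# compare-and-swap (the prevValue= argument) lets exactly one agent flip
# the job from "new" to "processing"; every other agent gets
# EtcdCompareFailed and walks away.
import etcd

client = etcd.Client(host="127.0.0.1", port=2379)  # illustrative endpoint
try:
    client.write("/queue/example-jid/status", "processing", prevValue="new")
    # this agent now owns the job and may run the flow
except etcd.EtcdCompareFailed:
    pass  # another agent claimed the job first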
Example #30
0
    def run(self):
        super(StopServices, self).run()
        services = self.parameters['Services[]']
        for service in services:
            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message":
                    "Stopping service %s on node %s" %
                    (service, NS.node_context.fqdn)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
            )
            srv = NS.tendrl.objects.Service(service=service)
            if not srv.running:
                if srv.error:
                    raise FlowExecutionFailedError(
                        "Unable to check status of service %s "
                        "on %s. Error: %s" %
                        (service, NS.node_context.node_id, srv.error))
                else:
                    logger.log(
                        "debug",
                        NS.publisher_id,
                        {
                            "message":
                            "%s not running on "
                            "%s" % (service, NS.node_context.fqdn)
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                    )
                continue

            _cmd_str = "systemctl stop %s" % service
            cmd = cmd_utils.Command(_cmd_str)
            _, err, _ = cmd.run()
            if err:
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {
                        "message":
                        "Could not stop %s"
                        " service on %s. Error: %s" %
                        (service, NS.node_context.fqdn, err)
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                )

            _cmd_str = "systemctl disable %s" % service
            cmd = cmd_utils.Command(_cmd_str)
            _, err, _ = cmd.run()
            if err:
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {
                        "message":
                        "Could not disable %s"
                        " service on %s. Error: %s" %
                        (service, NS.node_context.fqdn, err)
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                )

        return True
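
# For context, the two systemctl invocations above as a plain-subprocess
# sketch (cmd_utils.Command is presumably a thin wrapper over this kind of
# call); it mirrors the flow's log-and-continue error handling. Python 3.7+.
import subprocess

def stop_and_disable(unit):
    for verb in ("stop", "disable"):
        proc = subprocess.run(["systemctl", verb, unit],
                              capture_output=True, text=True)
        if proc.returncode != 0:
            # log and move on rather than aborting, as the flow above does
            print("systemctl %s %s failed: %s" % (verb, unit, proc.stderr))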