def add_cache(args):
    if not configuration().get_node_info().is_storage:
        print("-1")
        return

    job_manager = JobManager()
    params = '-disk_name {} -partitions {}'.format(args.disk_name,
                                                   args.partitions)

    # Getting all running jobs :
    for j in job_manager.get_running_job_list():
        if j.type in (JobType.DELETEOSD, JobType.ADDDISK, JobType.ADDJOURNAL,
                      JobType.DELETEJOURNAL, JobType.ADDCACHE, JobType.DELETECACHE):
            logger.info(
                "Cannot start add cache job for disk {}. There ara running jobs."
                .format(args.disk_name))
            print("-1")
            return

    print(job_manager.add_job(JobType.ADDCACHE, params))
    logger.info("Start add cache job for disk {}.".format(args.disk_name))
    sys.exit()
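
For context, a handler like add_cache above is normally dispatched from an argparse sub-command. A minimal sketch of such wiring follows; the sub-command and flag names here are assumptions for illustration, not taken from the source.

# Hypothetical CLI wiring for add_cache(args); names are illustrative only.
import argparse

parser = argparse.ArgumentParser(description='Node disk maintenance commands.')
subparsers = parser.add_subparsers()
cache_parser = subparsers.add_parser('add-cache', help='Start an add-cache job for a disk.')
cache_parser.add_argument('-disk_name', required=True)
cache_parser.add_argument('-partitions', required=True)
cache_parser.set_defaults(func=add_cache)

args = parser.parse_args()
args.func(args)  # dispatches to add_cache(args)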
Example #2
    def __get_disks(self):
        all_disks_dict = {}
        osd_disks = []
        journal_disks = []
        disk_list = []
        ceph_disk_list = ceph_disk_lib.get_disk_list()
        ph_disk_list = disk_util.get_disk_list()
        osd_dict = ceph_osd.ceph_osd_tree(configuration().get_node_info().name)
        try:
            if ceph_disk_list and len(ceph_disk_list) > 0:
                for disk in ceph_disk_list:
                    for ph_disk in ph_disk_list:
                        if ph_disk.name == disk.name:
                            ph_disk.usage = disk.usage
                            ph_disk.osd_id = disk.osd_id
                            ph_disk.osd_uuid = disk.osd_uuid
                            disk_list.append(ph_disk)
                            break

            for node_disk in disk_list:
                if node_disk.usage == DiskUsage.osd:
                    status = None
                    if osd_dict:
                        status = osd_dict.get(int(node_disk.osd_id), None)
                        if status is not None:
                            node_disk.status = status
                            if status == 1:
                                osd_disks.append(node_disk.name)

                elif node_disk.usage == DiskUsage.journal:
                    journal_disks.append(node_disk.name)
            all_disks_dict = {"OSDs": osd_disks, "Journals": journal_disks}
        except Exception as ex:
            logger.error("Cannot get node disks.")
            logger.exception(ex.message)
        return all_disks_dict
Example #3
    def add__node_to_hosts_file(self, remote=True):
        path = "/etc/hosts"

        cluster_conf = configuration()
        current_node = cluster_conf.get_node_info()

        try:
            with open(path, mode='a') as f:
                f.write("{ip}   {host}\n".format(ip=current_node.management_ip,
                                                 host=current_node.name))
                if remote:
                    f.write("127.0.0.1   localhost\n")
                    for node in cluster_conf.get_remote_nodes_config(
                            current_node.name):
                        f.write("{ip}   {host}\n".format(ip=node.management_ip,
                                                         host=node.name))

            if FileSyncManager().commit_file(path):
                return True

        except Exception as e:
            logger.error("Could not write hosts file.")

        return False
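
For illustration only (all addresses and node names below are invented), a call to the method above with remote=True on a node named node-01 with management IP 10.0.1.11 and two remote nodes would append a block like this to /etc/hosts:

10.0.1.11   node-01
127.0.0.1   localhost
10.0.1.12   node-02
10.0.1.13   node-03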
def __test_mons():
    sleeps = [15, 15, 10, 10, 5, 5]
    tries = 5
    mon_in_quorum = []
    mon_members = []

    cluster_conf = configuration()
    current_cluster_info = cluster_conf.get_cluster_info()

    for i in current_cluster_info.management_nodes:
        node_info = NodeInfo()
        node_info.load_json(json.dumps(i))
        mon_members.append(node_info.name)

    for host in mon_members:
        while tries:
            status = mon_status_check()
            has_reached_quorum = host in status.get('quorum_names', '')

            if not has_reached_quorum:
                tries -= 1
                sleep_seconds = sleeps.pop()
                logger.warning('Waiting %s seconds before retrying',
                               sleep_seconds)
                time.sleep(sleep_seconds)
            else:
                mon_in_quorum.append(host)
                break

    if mon_in_quorum == mon_members:
        logger.info("Ceph monitors are ready.")
        return True

    else:
        logger.info("Ceph monitors are not ready.")
        return False
Example #5
    def __get_ifaces(self):
        config = configuration()
        bonds = config.get_cluster_bonds()
        eths = []
        bond_names = []
        try:
            for bond in bonds:
                eths.extend(bond.interfaces.split(','))
                bond_names.append(bond.name)
            eths.extend(bond_names)
            cluster_info = config.get_cluster_info()
            if cluster_info.backend_1_eth_name not in bond_names and cluster_info.backend_1_eth_name not in eths:
                eths.append(cluster_info.backend_1_eth_name)
            if cluster_info.backend_2_eth_name not in bond_names and cluster_info.backend_2_eth_name not in eths:
                eths.append(cluster_info.backend_2_eth_name)
            if cluster_info.management_eth_name not in bond_names and cluster_info.management_eth_name not in eths:
                eths.append(cluster_info.management_eth_name)
        except Exception as ex:
            logger.error("Cannot get node ifaces.")
            logger.exception(ex.message)
        return eths


Example #6
    def update_auth_pools(self, user_name, pool_list):
        config = configuration()
        cluster_name = config.get_cluster_name()
        pool_string = ""
        if len(pool_list) > 0:
            for pool in pool_list:
                pool_string += "\'profile rbd pool=" + pool + "\',"
            if pool_string[-1] == ",":
                pool_string = pool_string[:-1]
        else:
            pool_string = "\'profile rbd\'"

        cmd = "ceph auth caps client.{} mgr 'allow r' mon 'profile rbd' osd {} --cluster {}".format(
            user_name, pool_string, cluster_name)
        ret, stdout, stderr = exec_command_ex(cmd)

        if ret != 0:
            if stderr:
                logger.error('failed to run cmd ' + cmd)
                return False

            return False

        return True
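
As a worked example of the string building above (values invented for illustration), calling update_auth_pools('mirror', ['rbd', 'backups']) on a cluster named 'ceph' runs:

ceph auth caps client.mirror mgr 'allow r' mon 'profile rbd' osd 'profile rbd pool=rbd','profile rbd pool=backups' --cluster ceph

With an empty pool list, the osd capability falls back to 'profile rbd'.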
def readImageMetaData(ioctx, image, pool):
    ret = None

    # Get which ceph user is using this function & get his keyring file path #
    ceph_auth = CephAuthenticator()

    config = configuration()
    cluster_name = config.get_cluster_name()

    try:
        cmd = "rbd info " + pool + "/" + str(
            image) + " " + ceph_auth.get_authentication_string(
            ) + " --cluster " + cluster_name + " | grep rbd_data"
        ret, stdout, stderr = exec_command_ex(cmd)

        if ret != 0:
            if stderr:
                logger.error("Cannot get image meta object from rbd header.")
                return None

        rbd_data = stdout.rstrip().strip()
        dot_indx = rbd_data.rfind(".")

        image_id = rbd_data[(dot_indx + 1):]

        rbd_header_object = "rbd_header." + image_id

        try:
            ret = ioctx.get_xattr(rbd_header_object, meta_key)
        except:
            ret = ioctx.get_xattr(rbd_header_object[:-1], meta_key)

    except:
        return None

    return ret
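
A hedged usage sketch for readImageMetaData, assuming the standard python-rados bindings; the pool and image names are invented, and meta_key is expected to be defined at module level in the source file:

# Sketch only: open an ioctx with python-rados and read the PetaSAN image metadata.
import rados

cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
try:
    ioctx = cluster.open_ioctx('rbd')  # pool name is an assumption
    meta = readImageMetaData(ioctx, 'image-00001', 'rbd')
    if meta is not None:
        print(meta)
    ioctx.close()
finally:
    cluster.shutdown()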
Example #8

from PetaSAN.core.cluster.configuration import configuration
from PetaSAN.core.common.cmd import *
from PetaSAN.core.entity.cluster import NodeInfo
import json
import subprocess
import argparse
from PetaSAN.core.common.log import logger
from PetaSAN.core.common.cmd import exec_command_ex, call_cmd

MAX_OPEN_FILES = 102400

parser = argparse.ArgumentParser(description='This is a script that will start up the configured consul client.')
join = ''

for node in configuration().get_remote_nodes_config(""):
    join = join + " -retry-join {} ".format(node.backend_1_ip)

logger.info("consul start up string {}".format(join))
str_start_command = "consul agent -config-dir /opt/petasan/config/etc/consul.d/client "

str_start_command = str(str_start_command) + join + ' >/dev/null 2>&1 &'

subprocess.Popen(str_start_command, shell=True)

# Increase max open files for Consul process :
# ============================================
pid_cmd = "ps aux | grep consul | grep agent"
ret, stdout, stderr = exec_command_ex(pid_cmd)

line_1 = stdout.splitlines()[0]
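
The example is cut off here. A plausible continuation, stated as an assumption rather than the original code, parses the consul agent PID from that ps output line and raises its open-files limit to MAX_OPEN_FILES:

# Assumed continuation: the second column of `ps aux` output is the PID.
pid = line_1.split()[1]
call_cmd("prlimit --pid {} --nofile={}:{}".format(pid, MAX_OPEN_FILES, MAX_OPEN_FILES))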
Example #9
#!/usr/bin/python
'''
 Copyright (C) 2019 Maged Mokhtar <mmokhtar <at> petasan.org>
 Copyright (C) 2019 PetaSAN www.petasan.org


 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU Affero General Public License
 as published by the Free Software Foundation

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU Affero General Public License for more details.
'''

from PetaSAN.core.cluster.configuration import configuration
from PetaSAN.core.config.api import ConfigAPI

cluster_info = configuration().get_cluster_info()
cluster_info.name = "ceph"
config = ConfigAPI()

with open(
        config.get_cluster_info_file_path(),
        'w',
) as f:
    f.write(cluster_info.write_json())
    def add_user(self, user_name, auth_pools):
        backup_nodes_list = []
        nodes_list = ManageNode().get_node_list()
        for node_info in nodes_list:
            if node_info.is_backup:
                backup_nodes_list.append(node_info.name)

        user_info = ConsulAPI().get_replication_user(user_name)
        if user_info and len(user_info.user_name) > 0:
            raise ReplicationException(
                ReplicationException.SYSTEM_USER_EXIST,
                'ThisSystemUserAlreadyExistsInNodes:{}'.format(
                    backup_nodes_list))

        user = Users()
        ceph_usr_stat = user.is_ceph_user_exist(user_name)
        if ceph_usr_stat:
            raise ReplicationException(ReplicationException.CEPH_USER_EXIST,
                                       'ThisCephUserAlreadyExists')

        cluster_name = configuration().get_cluster_name()
        ceph_keyring_path = '/etc/ceph/{}.client.{}.keyring'.format(
            cluster_name, user_name)

        replication_user = ReplicationUser()
        replication_user.user_name = user_name
        replication_user.auth_pools = auth_pools

        rsa_encrypt = RSAEncryption()
        rsa_pub_key = rsa_encrypt.get_key(rsa_encrypt.pub_key_path)

        user.add_ceph_user(user_name, auth_pools)

        ceph_keyring_value = self.get_file_content(ceph_keyring_path)
        enc_ceph_keyring = rsa_encrypt.encrypt_public(ceph_keyring_value,
                                                      rsa_pub_key)
        replication_user.ceph_keyring = enc_ceph_keyring

        pub_file = ConfigAPI().get_replication_user_pubkey_file_path()
        prv_file = ConfigAPI().get_replication_user_prvkey_file_path()
        rep_path = ConfigAPI().get_replication_tmp_file_path()

        if not os.path.exists(rep_path):
            os.mkdir(rep_path)

        user.generate_tmp_ssh_keys(pub_file, prv_file)

        pub_key_value = self.get_file_content(pub_file)
        replication_user.ssh_pub_key = pub_key_value

        prv_key_value = self.get_file_content(prv_file)
        enc_prv_key = rsa_encrypt.encrypt_public(prv_key_value, rsa_pub_key)
        replication_user.ssh_prv_key = enc_prv_key

        mng_file = ManageTmpFile()
        mng_file.delete_tmp_file(pub_file)
        mng_file.delete_tmp_file(prv_file)

        consul = ConsulAPI()
        consul.update_replication_user(replication_user)

        for node in backup_nodes_list:
            stat = self.sync_users(node)
            if not stat:
                logger.error("error sync users on the node {}".format(node))

import os
import sys
from PetaSAN.core.cluster.configuration import configuration
from PetaSAN.core.common.log import logger
from PetaSAN.core.config.api import ConfigAPI
from PetaSAN.core.common.cmd import call_cmd

cluster_name = configuration().get_cluster_name()
ceph_mon_keyring = ConfigAPI().get_ceph_mon_keyring(cluster_name)
ceph_client_admin_keyring = ConfigAPI().get_ceph_keyring_path(cluster_name)

try:

    cluster_conf = configuration()
    current_node_info = cluster_conf.get_node_info()
    current_node_name = current_node_info.name
    current_cluster_info = cluster_conf.get_cluster_info()
    config_api = ConfigAPI()

    os.makedirs("/var/lib/ceph/mon/{}-{}".format(cluster_name,
                                                 current_node_name))

    os.makedirs("/tmp/{}".format(current_node_name))
def add_job():
    """
    Called when opening the 'Add new replication job' form page.
    Args: None
    Returns: renders the template 'admin/replication/add_replication_job.html'
    """
    if request.method == 'GET' or request.method == 'POST':
        try:
            backup_nodes = []
            destination_clusters_list = []
            manage_node = ManageNode()
            nodes = manage_node.get_node_list()
            cluster_name = configuration().get_cluster_name(custom_name=True)
            manage_destination_cluster = ManageDestinationCluster()
            destination_clusters_dict = manage_destination_cluster.get_replication_dest_clusters(
            )
            for dest_cluster in destination_clusters_dict:
                destination_clusters_list.append(dest_cluster)
            destination_clusters_list.sort()
            for node in nodes:
                if node.is_backup:
                    backup_nodes.append(node)
            backup_nodes.sort()

            if list_err in session:
                result = session["err"]
                session.pop("err")
                return render_template(
                    '/admin/replication/add_replication_job.html',
                    backup_nodes=backup_nodes,
                    cluster_name=cluster_name,
                    destination_clusters_list=destination_clusters_list,
                    err=result)

            elif list_success in session:
                result = session["success"]
                session.pop("success")
                return render_template(
                    '/admin/replication/add_replication_job.html',
                    backup_nodes=backup_nodes,
                    cluster_name=cluster_name,
                    destination_clusters_list=destination_clusters_list,
                    success=result)

            elif list_warning in session:
                result = session["warning"]
                session.pop("warning")
                return render_template(
                    '/admin/replication/add_replication_job.html',
                    backup_nodes=backup_nodes,
                    cluster_name=cluster_name,
                    destination_clusters_list=destination_clusters_list,
                    warning=result)

            else:
                return render_template(
                    '/admin/replication/add_replication_job.html',
                    backup_nodes=backup_nodes,
                    cluster_name=cluster_name,
                    destination_clusters_list=destination_clusters_list)

        except CephException as e:
            if e.id == CephException.CONNECTION_TIMEOUT:
                session['err'] = "ui_admin_ceph_time_out"
            elif e.id == CephException.GENERAL_EXCEPTION:
                session['err'] = "ui_admin_ceph_general_exception"
            logger.error(e)
            return redirect(url_for('replication_controller.job_list'))

        except Exception as e:
            session['err'] = "ui_admin_add_job_error"
            logger.error(e)
            return redirect(url_for('replication_controller.job_list'))
Example #13

from PetaSAN.core.cluster.configuration import configuration
import os
from PetaSAN.core.common.log import logger
from PetaSAN.core.config.api import ConfigAPI

cluster_name = configuration().get_cluster_name()
node_info = configuration().get_node_info()
node_name = node_info.name
nodes = configuration().get_management_nodes_config()
collected_path = ConfigAPI().get_collect_state_dir() + node_name

if not os.path.exists("{}".format(ConfigAPI().get_collect_state_dir())):
    os.system("mkdir {}".format(ConfigAPI().get_collect_state_dir()))
if os.path.exists(collected_path):
    os.system("rm -rf {}".format(collected_path))
if os.path.exists("{}.tar".format(collected_path)):
    os.system("rm -rf {}.tar".format(collected_path))
os.mkdir("{}".format(collected_path))

try:
    for node in nodes:
Example #14
    def __acquire_path(self, path, consul_kv):
        if self.__ignored_acquire_paths.get(path):
            logger.info("Ignore forced path {}".format(path))
            return
        logger.debug("Start acquire path {} by node session {}.".format(
            path, self.__session))
        consul_api = ConsulAPI()
        ceph_api = CephAPI()
        lio_api = LioAPI()
        network_api = NetworkAPI()
        config = configuration()
        try:
            disk_id, path_index = str(path).split("/")
            pool = self._get_pool_by_disk(disk_id)
            if not pool:
                logger.error('Could not find pool for disk ' + disk_id)
                return
            image_name = self.__image_name_prefix + disk_id
            logger.debug(
                "Start read image meta for acquire path {}.".format(path))
            all_image_meta = ceph_api.read_image_metadata(image_name, pool)
            petasan_meta = all_image_meta.get(
                self.__app_conf.get_image_meta_key())
            disk_meta = DiskMeta()
            disk_meta.load_json(petasan_meta)
            logger.debug(
                "End read image meta for acquire path {}.".format(path))

            logger.debug("Try to acquire path {}.".format(path))
            node_name = config.get_node_name()
            result = consul_api.lock_disk_path(
                self.__app_conf.get_consul_disks_path() + path, self.__session,
                node_name, str(consul_kv.CreateIndex))
            if not result:
                logger.info("Could not lock path {} with session {}.".format(
                    path, self.__session))
            elif result:
                if consul_kv.Value != None and len(str(
                        consul_kv.Value)) > 0 and node_name != str(
                            consul_kv.Value):
                    logger.info("The path {} was locked by {}.".format(
                        path, str(consul_kv.Value)))
                    logger.debug("Node {} will kill node {}.".format(
                        config.get_node_name(), str(consul_kv.Value)))
                    self.__fencing(str(consul_kv.Value))

                # we locked it
                if disk_meta.paths:
                    # if lio has the image name in its backstore already, do not perform rbd mapping
                    if image_name not in self.__backstore:
                        status = ceph_api.map_iamge(image_name, pool)
                    else:
                        status = Status.done
                    if Status.done == status:
                        # Get path info from metadata
                        path_obj = disk_meta.get_paths()[int(path_index) - 1]
                        # add path ips to our network interfaces
                        network_api.add_ip(path_obj.ip, path_obj.subnet_mask,
                                           path_obj.eth, path_obj.vlan_id)
                        #update neighbors arp table
                        network_api.update_neighbors_arp(
                            path_obj.ip, path_obj.eth)
                        # add new target in lio if not there already
                        if not lio_api.is_backstore_image_found(image_name):
                            # Give ceph map image time to complete its job
                            sleep(3)
                            # Add rbd backstores and target
                            status = lio_api.add_target(
                                disk_meta, disk_meta.pool)
                            """
                            wwn = self.calculate_disk_wwn(disk_meta)
                            status = lio_api.add_target(disk_meta, wwn, disk_meta.pool)
                            """
                        if Status.done == status:
                            # enable the path we locked to true
                            self.__last_acquire_succeeded = True
                            lio_api.enable_path(disk_meta.iqn, path_index,
                                                True)
                            logger.info("Path %s acquired successfully" % path)

                            if self.__acquire_warning_counter > 2:
                                logger.info(
                                    "PetaSAN finally succeeded to acquire path after retrying {} times."
                                    .format(self.__acquire_warning_counter))
                                self.__acquire_warning_counter = 0
                            path_assignment_info = self.__force_acquire_paths.get(
                                path)

                            if path_assignment_info:
                                MangePathAssignment().update_path(
                                    path_obj.ip, ReassignPathStatus.succeeded)
                        else:
                            path_assignment_info = self.__force_acquire_paths.get(
                                path)
                            if path_assignment_info:
                                logger.info(
                                    "Acquired forced path {}".format(path))
                                MangePathAssignment().update_path(
                                    path_obj.ip, ReassignPathStatus.failed)
                            self.__last_acquire_succeeded = False
                            self.__acquire_warning_counter += 1
                            if self.__acquire_warning_counter > 2:
                                logger.warning(
                                    "PetaSAN failed to acquire path after {} times."
                                    .format(self.__acquire_warning_counter))
                            logger.error("Error could not acquire path %s" %
                                         path)

                    else:
                        self.__unlock_consul_path(path)

        except Exception as e:
            logger.info("---------------------------------")
            logger.error(str(e.message) + "\n")
            logger.exception(e)

            if str(e.message).find("invalid session") > -1:
                logger.error("Session is invalid")
                try:
                    logger.info("Trying to create new session id")
                    self.__session = ConsulAPI().get_new_session_ID(
                        self.__session_name, self.__node_info.name)
                    logger.info("New session id is {}".format(self.__session))
                    logger.info("Cleaning all mapped disks from old session")
                    self.__clean()
                except Exception as ex:
                    logger.exception(ex)
            logger.exception("Could not acquire path %s" % path)
            raise e
        logger.debug("End acquire path {}.".format(path))
        return

from PetaSAN.core.common.enums import BondMode
import sys
from PetaSAN.core.cluster.configuration import configuration
from PetaSAN.core.cluster.network import Network
from PetaSAN.core.common.cmd import *

logger.debug("Loading network configurations.")
network = Network()

config = configuration()
node = config.get_node_info()
cluster = config.get_cluster_info()

node_management_eth_name = network.get_node_management_interface()
node_management_vlan_id = network.get_node_management_vlan_id()
node_management_eth_ip = network.get_node_management_ip()
node_management_eth_netmask = network.get_node_management_netmask()

gateway = Network().get_def_gateway()
bonds = config.get_cluster_bonds()
jumbo_eths = []
if hasattr(configuration().get_cluster_info(), "jumbo_frames"):
    jumbo_eths = config.get_cluster_info().jumbo_frames

class Service3:
    __cluster_info = configuration().get_cluster_info()
    __node_info = configuration().get_node_info()
    __app_conf = ConfigAPI()
    __session_name = ConfigAPI().get_iscsi_service_session_name()
    __paths_local = set()
    __session = '0'
    __paths_per_disk_local = dict()
    __paths_per_session = dict()
    __total_cluster_paths = 0
    __iqn_tpgs = dict()
    __local_ips = set()
    __backstore = set()
    __current_lock_index = None
    __image_name_prefix = ""
    __cluster_info = configuration().get_cluster_info()
    __node_info = configuration().get_node_info()
    __exception_sleep_time = 0
    __acquire_warning_counter = 0
    __last_acquire_succeeded = True
    __paths_consul_unlocked_firstborn = dict()
    __paths_consul_unlocked_siblings = dict()
    __paths_consul_locked_node = dict()
    __disk_consul_stopped = set()

    is_service_running = False

    def __init__(self):
        if Service3.is_service_running:
            logger.error("The service is already running.")
            raise Exception("The service is already running.")
        Service3.is_service_running = True

    def __del__(self):
        Service3.is_service_running = False

    def __do_process(self):
        self.__paths_local = set()
        self.__paths_per_disk_local = dict()
        self.__paths_per_session = dict()
        self.__iqn_tpgs = dict()
        self.__local_ips = set()
        self.__backstore = set()
        self.__paths_consul_unlocked_firstborn = dict()
        self.__paths_consul_unlocked_siblings = dict()
        self.__paths_consul_locked_node = dict()
        self.__disk_consul_stopped = set()

        self.__read_resources_local()
        self.__read_resources_consul()

        state_change = False

    def __read_resources_local(self):
        logger.debug("Start read local resources.")
        lio_api = LioAPI()
        try:
            self.__backstore = lio_api.get_backstore_image_names()
            self.__iqn_tpgs = lio_api.get_iqns_with_enabled_tpgs()
            for iqn, tpgs in self.__iqn_tpgs.iteritems():
                disk_id = str(iqn).split(":")[1]
                for tpg_index, ips in tpgs.iteritems():
                    self.__paths_local.add("/".join([disk_id, str(tpg_index)]))
                    if ips and len(ips) > 0:
                        for ip in ips:
                            self.__local_ips.add(ip)
        except Exception as e:
            logger.error("Could not read consul resources.")
            raise e
        logger.debug("End read local resources.")

    def __read_resources_consul(self):
        logger.debug("Start read resources consul.")
        self.__paths_per_session = {}
        self.__total_cluster_paths = 0
        unlock_kvs = set()
        self.__paths_consul_locked_node = dict()
        try:
            disk_kvs = ConsulAPI().get_disk_kvs()
            for kv in disk_kvs:
                key = str(kv.Key).replace(
                    self.__app_conf.get_consul_disks_path(), "")
                disk_id = str(key).split('/')[0]
                if disk_id in self.__disk_consul_stopped:
                    continue
                if kv.Value == "disk":
                    disk_id = str(key).split('/')[0]
                    self.__paths_per_disk_local[disk_id] = 0
                    if str(kv.Flags) == "1":
                        self.__disk_consul_stopped.add(disk_id)
                    continue
                # Count paths in the cluster.
                self.__total_cluster_paths += 1
                if hasattr(kv, "Session"):
                    disk_id = str(key).split('/')[0]
                    disks = self.__paths_consul_locked_node.get(
                        kv.Session, dict())
                    paths = disks.get(disk_id, 0)
                    disks[disk_id] = paths + 1
                    self.__paths_consul_locked_node[kv.Session] = disks
                    # The count of paths for each session
                    if self.__paths_per_session.has_key(kv.Session):
                        count = self.__paths_per_session.get(kv.Session)
                        self.__paths_per_session[kv.Session] = count + 1
                    else:
                        self.__paths_per_session[kv.Session] = 1
                    if kv.Session == self.__session:
                        self.__paths_consul_locked_node.add(key)
                        disk_paths_count = self.__paths_per_disk_local.get(
                            disk_id, 0) + 1
                        self.__paths_per_disk_local[disk_id] = disk_paths_count
                # unlocked paths
                elif not hasattr(kv, "Session"):
                    unlock_kvs.add(kv)
            # Filter unlocked paths
            for kv in unlock_kvs:
                key = str(kv.Key).replace(
                    self.__app_conf.get_consul_disks_path(), "")
                disk_id = str(key).split('/')[0]
                if self.__paths_per_disk_local.get(disk_id, 0) > 0:
                    self.__paths_consul_unlocked_siblings[key] = kv.CreateIndex
                else:
                    self.__paths_consul_unlocked_firstborn[
                        key] = kv.CreateIndex
        except Exception as e:
            logger.error("Could not read consul resources.")
            logger.exception(e)
            raise e
        logger.debug("End read resources consul.")

    def read(self):
        self.__read_resources_consul()
        nodes = ConsulAPI().get_sessions_dict(self.__session_name)
        print "########### First born #########"
        print
        print(self.__paths_consul_unlocked_firstborn)
def get_ceph_volumes_info():
    ceph_volumes_disks = {}
    cluster_fsid = ''
    vg_name = ""
    partitions_uuid = {}
    try:
        cluster_fsid = ceph_disk.get_fsid(configuration().get_cluster_name())
        partitions_uuid = ceph_disk.get_partitions_uuid()
    except Exception as e:
        logger.error(e)
        pass
    cmd = 'ceph-volume --log-path ' + CEPH_VOLUME_LOG_PATH + ' lvm list --format json'
    ret, stdout, stderr = exec_command_ex(cmd)
    if ret != 0:
        if stderr:
            logger.error(stderr)
    if len(stdout) > 0:
        ceph_volumes_info = json.loads(stdout)
        for osd_id, osd_info in ceph_volumes_info.iteritems():
            try:
                ceph_volume_disk_info = CephVolumeInfo()
                fsid = ''
                osd_name = ''
                for element in osd_info:
                    if element['type'] == 'block' or element['type'] == 'data':
                        fsid = element['tags']['ceph.cluster_fsid']
                        if len(fsid) > 0 and fsid != cluster_fsid:
                            continue
                        # if not ['tags']['ceph.cluster_fsid'] or element['tags']['ceph.cluster_fsid'] != cluster_fsid:
                        ceph_volume_disk_info.osd_id = osd_id
                        ceph_volume_disk_info.osd_uuid = element['tags'][
                            'ceph.osd_fsid']
                        if len(element['devices']) > 0:
                            for device in element['devices']:
                                part_name = ceph_disk.get_dev_name(device)
                                osd_name = ceph_disk.get_device_name(part_name)
                                ceph_volume_disk_info.devices.append(osd_name)

                        # if no devices (physical disks) exist,
                        # get them from the get_physical_disks function by volume group name

                        else:
                            vg_name = element['vg_name']
                            lv_name = element['lv_name']
                            ceph_volume_disk_info.lv_name = lv_name
                            ceph_volume_disk_info.vg_name = vg_name
                            physical_list = lvm_lib.get_physical_disks(vg_name)
                            main_devices = list(physical_list["main"])
                            writecache_devices = list(
                                physical_list["writecache"])
                            cache_devices = list(physical_list["dmcache"])

                            if len(main_devices) > 0:
                                for main_dev in main_devices:
                                    main_part_name = ceph_disk.get_dev_name(
                                        main_dev)
                                    main_dev_name = ceph_disk.get_device_name(
                                        main_part_name)
                                    ceph_volume_disk_info.devices.append(
                                        main_dev_name)
                            if len(writecache_devices) > 0:
                                for wcache in writecache_devices:
                                    wcache_partition_name = ceph_disk.get_dev_name(
                                        wcache)
                                    ceph_volume_disk_info.linked_cache_part_num.append(
                                        ceph_disk.get_partition_num(
                                            wcache_partition_name))
                                    ceph_volume_disk_info.linked_cache.append(
                                        wcache_partition_name)
                            elif len(cache_devices) > 0:
                                for cache in cache_devices:
                                    cache_partition_name = ceph_disk.get_dev_name(
                                        cache)
                                    ceph_volume_disk_info.linked_cache_part_num.append(
                                        ceph_disk.get_partition_num(
                                            cache_partition_name))
                                    ceph_volume_disk_info.linked_cache.append(
                                        cache_partition_name)

                        journal_path = ""

                        # if 'ceph.journal_device' in element['tags']:
                        #     journal_path = element['tags']['ceph.journal_device']
                        # if 'ceph.db_device' in element['tags']:
                        #     journal_path = element['tags']['ceph.db_device']
                        uuid = ""

                        # for filestore :
                        if 'ceph.journal_uuid' in element['tags']:
                            uuid = element['tags']['ceph.journal_uuid']

                        # for bluestore :
                        if 'ceph.db_uuid' in element['tags']:
                            uuid = element['tags']['ceph.db_uuid']
                        if len(uuid) > 0 and uuid in partitions_uuid:
                            journal_path = partitions_uuid[uuid]
                        if len(journal_path) > 0:
                            try:
                                if ceph_disk.is_partition(journal_path):
                                    journal_name = get_disk_by_partition(
                                        journal_path)
                                    journal_partition_name = ceph_disk.get_dev_name(
                                        journal_path)
                                    ceph_volume_disk_info.linked_journal_part_num = ceph_disk.get_partition_num(
                                        journal_partition_name)
                                    if len(osd_name
                                           ) > 0 and osd_name in journal_name:
                                        continue
                                    ceph_volume_disk_info.linked_journal = journal_name
                            except Exception as ex:
                                logger.error(ex)
                                continue
            except Exception as e:
                logger.exception(e)
                continue

            for device in ceph_volume_disk_info.devices:
                ceph_volumes_disks.update({device: ceph_volume_disk_info})

    return ceph_volumes_disks
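
A short usage sketch for the function above (illustrative only): it returns a dict keyed by device name, so the OSD id and backing devices discovered by ceph-volume on this node can be listed with:

# Sketch: print the OSD id and backing devices for each discovered data device.
for dev_name, vol_info in get_ceph_volumes_info().items():
    print("{} -> osd.{} (devices: {})".format(
        dev_name, vol_info.osd_id, ",".join(vol_info.devices)))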
Example #18
        self.__dict__ = json.loads(j)


def set_cluster_interface(bonds=[]):
    if bonds is None or len(bonds) == 0:
        return
    config = configuration()
    cluster_info = config.get_cluster_info()

    cluster_info.bonds = bonds
    config.set_cluster_network_info(cluster_info)
    logger.info("Updated cluster bonds to 1.3 successfully.")


try:
    old_bonds = configuration().get_cluster_bonds()
    new_bonds = []
    if len(old_bonds) > 0:
        for ob in old_bonds:
            if not hasattr(ob, "primary_eth_name"):
                sys.exit(0)
                break

            bond = Bond()
            bond.is_jumbo_frames = ob.is_jumbo_frames
            bond.mode = ob.mode
            bond.name = ob.name
            bond.interfaces = ob.primary_eth_name + "," + ob.other_eths_names
            bond.primary_interface = ob.primary_eth_name
            new_bonds.append(bond)
Example #19
subprocess.call('rm -rf /opt/petasan/config/etc/hosts',shell=True)
subprocess.call('touch /opt/petasan/config/etc/hosts',shell=True)
subprocess.call('ln -s /opt/petasan/config/etc/hosts /etc/hosts',shell=True)

subprocess.call('rm -rf /etc/systemd/system/[email protected]/',shell=True)
subprocess.call('rm -rf /etc/systemd/system/[email protected]/',shell=True)
subprocess.call('rm -rf /etc/systemd/system/[email protected]/',shell=True)
os.makedirs("/etc/systemd/system/[email protected]/")
os.makedirs("/etc/systemd/system/[email protected]/")
os.makedirs("/etc/systemd/system/[email protected]/")


content ="[Service]\n\
Environment=CLUSTER={}"
with open("/etc/systemd/system/[email protected]/override.conf", 'w', ) as f:
    f.write(content.format(configuration().get_cluster_name()))
with open("/etc/systemd/system/[email protected]/override.conf", 'w', ) as f:
    f.write(content.format(configuration().get_cluster_name()))
with open("/etc/systemd/system/[email protected]/override.conf", 'w', ) as f:
    f.write(content.format(configuration().get_cluster_name()))


logger.info('End cleaning config files')

#===============================================
logger.info("Starting ceph services")
if not call_cmd("systemctl start ceph.target"):
    logger.error("Can not start ceph services.")

sys.exit(0)
Example #20
def update_role(args):
    logger.info("Update roles.")
    config_api = ConfigAPI()
    try:
        cluster_config = configuration()
        node_info = cluster_config.get_node_info()
        is_dirty = False

        if str(args.is_storage) == '-1' and str(args.is_iscsi) == '-1':
            return
        if str(args.is_storage) == "1":
            if not node_info.is_storage:
                logger.info("Update roles 1.")
                node_info.is_storage = True
                cluster_config.update_node_info(node_info)
                is_dirty = True
                logger.info("Update node storage role to true")

        else:
            # ToDO
            pass

        if str(args.is_backup) == "1":
            if not node_info.is_backup:
                node_info.is_backup = True
                cluster_config.update_node_info(node_info)
                is_dirty = True
                logger.info("Update node backup role to true")
        else:
            # ToDO
            pass

        if str(args.is_iscsi) == "1":
            if not node_info.is_iscsi:
                node_info.is_iscsi = True
                cluster_config.update_node_info(node_info)
                is_dirty = True
                logger.info("Update node iscsi role to true")

                path = config_api.get_service_files_path()
                logger.info("Starting PetaSAN service")
                cmd = "python {}{} >/dev/null 2>&1 &".format(
                    path, config_api.get_petasan_service())
                exec_command(cmd)
        else:
            # ToDO
            pass

        if is_dirty:
            consul_base_api = BaseAPI()

            consul_base_api.write_value(
                config_api.get_consul_nodes_path() +
                cluster_config.get_node_name(),
                cluster_config.get_node_info().write_json())
            print 1

    except Exception as ex:
        logger.exception(ex.message)
        print -1

    return
def run(args):
    value = int(args.value)
    cluster_name = configuration().get_cluster_name()

    if value == 1:
        print('backfill_speed choice : very slow')
        ret1, stdout1, stderr1 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_max_backfills 1' --cluster " +
            cluster_name)
        ret2, stdout2, stderr2 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_recovery_max_active 1' --cluster "
            + cluster_name)
        ret3, stdout3, stderr3 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_recovery_sleep 2' --cluster " +
            cluster_name)
        print('Done changing backfill speed to very slow ...')

    elif value == 2:
        print('backfill_speed choice : slow')
        ret1, stdout1, stderr1 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_max_backfills 1' --cluster " +
            cluster_name)
        ret2, stdout2, stderr2 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_recovery_max_active 1' --cluster "
            + cluster_name)
        ret3, stdout3, stderr3 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_recovery_sleep 0.7' --cluster "
            + cluster_name)
        print('Done changing backfill speed to slow ...')

    elif value == 3:
        print('backfill_speed choice : medium')
        ret1, stdout1, stderr1 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_max_backfills 1' --cluster " +
            cluster_name)
        ret2, stdout2, stderr2 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_recovery_max_active 1' --cluster "
            + cluster_name)
        ret3, stdout3, stderr3 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_recovery_sleep 0' --cluster " +
            cluster_name)
        print('Done changing backfill speed to medium ...')

    elif value == 4:
        print('backfill_speed choice : fast')
        ret1, stdout1, stderr1 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_max_backfills 5' --cluster " +
            cluster_name)
        ret2, stdout2, stderr2 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_recovery_max_active 3' --cluster "
            + cluster_name)
        ret3, stdout3, stderr3 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_recovery_sleep 0' --cluster " +
            cluster_name)
        print('Done changing backfill speed to fast ...')

    elif value == 5:
        print('backfill_speed choice : very fast')
        ret1, stdout1, stderr1 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_max_backfills 7' --cluster " +
            cluster_name)
        ret2, stdout2, stderr2 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_recovery_max_active 7' --cluster "
            + cluster_name)
        ret3, stdout3, stderr3 = exec_command_ex(
            "ceph tell osd.* injectargs '--osd_recovery_sleep 0' --cluster " +
            cluster_name)
        print('Done changing backfill speed to very fast ...')

    else:
        print("error : failed to set speed value : No such value {}".format(
            value))

    return
def get_ceph_disk_list():
    disk_info_list = []

    # read fsid for our cluster from config file
    fsid = None
    try:
        fsid = ceph_disk.get_fsid(configuration().get_cluster_name())
    except Exception as e:
        pass

    journal_linked_osds = {}

    counter = 0

    while True:
        try:
            ceph_disk_list_devs = ceph_disk.list_devices()
            break
        except Exception as e:
            if counter == 120:
                return disk_info_list
            counter += 1
            logger.error(e)
            time.sleep(1)

    for device in ceph_disk_list_devs:

        no_of_partitions = 0
        no_of_availabe_partitions = 0

        path = device['path']
        name = ceph_disk.get_dev_name(path)

        # check for disk block devices
        if not name.startswith('sd') and not name.startswith(
                'xvd') and not name.startswith('nvme'):
            continue

        di = DiskInfo()
        disk_info_list.append(di)
        di.name = name
        di.usage = DiskUsage.no

        # check if disk is not partitioned
        if 'partitions' not in device:
            continue

        old_osd = False
        # first check for OSD partitions
        for partition in device['partitions']:
            if partition['ptype'] == ptype_osd and 'whoami' in partition:
                if fsid and partition['ceph_fsid'] == fsid:
                    di.usage = DiskUsage.osd
                    di.osd_id = partition['whoami']
                    di.osd_uuid = partition['uuid']

                    if 'journal_dev' in partition:
                        journal = partition['journal_dev']
                        journal_disk = get_disk_by_partition(journal)
                        if journal_disk != name:
                            di.linked_journal = journal_disk
                            if journal_disk not in journal_linked_osds:
                                journal_linked_osds[journal_disk] = []
                            journal_linked_osds[journal_disk].append(di.name)

                    if 'block.db_dev' in partition:
                        journal = partition['block.db_dev']
                        journal_disk = get_disk_by_partition(journal)
                        if journal_disk != name:
                            di.linked_journal = journal_disk
                            if journal_disk not in journal_linked_osds:
                                journal_linked_osds[journal_disk] = []
                            journal_linked_osds[journal_disk].append(di.name)

                    # do not check further partitions
                    break
                else:
                    old_osd = True

        if di.usage == DiskUsage.osd:
            # go to next disk
            continue

        # check for  journal disk
        if not old_osd:
            no_of_partitions = len(device['partitions'])
            for partition in device['partitions']:
                if partition['ptype'] == ptype_journal or partition[
                        'ptype'] == ptype_blockdb or partition[
                            'ptype'] == journal_avail_ptype:
                    di.usage = DiskUsage.journal

                    if partition['ptype'] == journal_avail_ptype:
                        no_of_availabe_partitions += 1
                    """
                    if 'journal_for' in partition:
                        journal_for = partition['journal_for']
                        journal_for_disk = get_disk_by_partition(journal_for)
                        di.linked_osds.append(journal_for_disk)
                    """
                # check for cache partition
                if partition['ptype'] == cache_used_ptype or partition[
                        'ptype'] == cache_avail_ptype:
                    di.usage = DiskUsage.cache

                    if partition['ptype'] == cache_avail_ptype:
                        no_of_availabe_partitions += 1

        if di.usage == DiskUsage.journal or di.usage == DiskUsage.cache:
            if di.usage == DiskUsage.cache and no_of_partitions > 0:
                di.no_of_partitions = no_of_partitions
                di.no_available_partitions = no_of_availabe_partitions
            # go to next disk
            continue

        # check for mounted partitions
        for partition in device['partitions']:
            if 'mount' in partition:
                mount_path = partition['mount']
                if mount_path is not None and 0 < len(mount_path):
                    di.usage = DiskUsage.mounted
                    # check for system disk
                    if mount_path == '/':
                        di.usage = DiskUsage.system
                        break

    for di in disk_info_list:
        if di.usage == DiskUsage.journal and di.name in journal_linked_osds:
            di.linked_osds = journal_linked_osds[di.name]

    return disk_info_list
def get_full_disk_list(pid=None):
    __output_split_text = "##petasan##"
    disk_list = []
    ceph_disk_list = get_disk_list()
    ph_disk_list = disk_util.get_disk_list()
    osd_dict = None

    try:
        osd_dict = ceph_osd.ceph_osd_tree(configuration().get_node_info().name)
    except Exception as e:
        logger.error(e.message)
    missing_disk_list = []
    # Set osd id and usage

    if ceph_disk_list and len(ceph_disk_list) > 0:
        for disk in ceph_disk_list:
            for ph_disk in ph_disk_list:
                if ph_disk.name == disk.name:
                    ph_disk.usage = disk.usage
                    ph_disk.osd_id = disk.osd_id
                    ph_disk.osd_uuid = disk.osd_uuid
                    ph_disk.linked_journal = disk.linked_journal
                    ph_disk.linked_osds = disk.linked_osds
                    ph_disk.linked_cache = disk.linked_cache
                    ph_disk.linked_cache_part_num = disk.linked_cache_part_num
                    ph_disk.vg_name = disk.vg_name
                    ph_disk.lv_name = disk.lv_name
                    ph_disk.linked_journal_part_num = disk.linked_journal_part_num
                    ph_disk.no_of_partitions = disk.no_of_partitions
                    ph_disk.no_available_partitions = disk.no_available_partitions
                    disk_list.append(ph_disk)
                    break
    else:
        disk_list.extend(ph_disk_list)

    health_test = Smart().get_overall_health()
    for disk in disk_list:
        if disk.name in health_test:
            disk.smart_test = health_test[disk.name]

    # get all running jobs
    job_manager = JobManager()
    job_list = job_manager.get_running_job_list()

    # Set disk osd status
    for node_disk in disk_list:
        # Set osd status [up, down]
        if node_disk.usage == DiskUsage.osd:
            status = None
            if osd_dict and node_disk.osd_id is not None:
                status = osd_dict.get(int(node_disk.osd_id), None)
            if str(ceph_osd.get_osd_id(node_disk.osd_uuid)) == "-1":
                node_disk.status = OsdStatus.no_status
                node_disk.usage = DiskUsage.mounted
                node_disk.osd_id = -1
            elif status is not None:
                node_disk.status = status
            else:
                node_disk.status = OsdStatus.no_status

        disk_name_parameter = "-disk_name {}".format(node_disk.name)
        disk_id_parameter = "-id {}".format(node_disk.osd_id)

        # loop on running job list
        for j in job_list:
            # Set osd status [deleting , adding]
            if j.type == JobType.ADDDISK and str(j.params).find(
                    str(disk_name_parameter)) > -1:
                node_disk.status = OsdStatus.adding
            elif j.type == JobType.ADDJOURNAL and str(j.params).find(
                    str(disk_name_parameter)) > -1:
                node_disk.status = OsdStatus.adding_journal
            elif j.type == JobType.ADDCACHE and str(j.params).find(
                    str(disk_name_parameter)) > -1:
                node_disk.status = OsdStatus.adding_cache
            elif j.type == JobType.DELETEOSD and (
                    str(j.params).find(str(disk_name_parameter)) > -1
                    or str(j.params).find(str(disk_id_parameter)) > -1):
                node_disk.status = OsdStatus.deleting
            elif j.type == JobType.DELETEJOURNAL and str(j.params).find(
                    str(disk_name_parameter)) > -1:
                node_disk.status = OsdStatus.deleting
            elif j.type == JobType.DELETECACHE and str(j.params).find(
                    str(disk_name_parameter)) > -1:
                node_disk.status = OsdStatus.deleting

            # Check if the job completed and has error to return it
            elif pid and j.id == int(pid):
                job_output = job_manager.get_job_output(j)
                if job_output is None:
                    continue
                job_output = str(job_output).strip()
                if job_output != "":
                    # We expect our custom messages to appear after __output_split_text.
                    out_arr = job_output.split(__output_split_text)
                    if len(out_arr) > 1:
                        node_disk.error_message = out_arr[1]
                        job_manager.remove_job(j.id)

    if not osd_dict or len(osd_dict.items()) == 0:
        return disk_list
    # If an osd is found in the ceph tree but it has no corresponding disk in the list.
    for osd_id, osd_status in osd_dict.items():
        is_missing = True
        for disk in disk_list:
            if str(disk.osd_id) == str(osd_id):
                is_missing = False
                break
        if is_missing:
            disk = DiskInfo()
            disk.osd_id = osd_id
            disk.status = osd_status
            disk.usage = DiskUsage.osd
            missing_disk_list.append(disk)
    disk_list.extend(missing_disk_list)

    return disk_list
Example #24
class Service:
    __cluster_info = configuration().get_cluster_info()
    __node_info = configuration().get_node_info()
    __app_conf = ConfigAPI()
    __session_name = ConfigAPI().get_iscsi_service_session_name()
    __paths_local = set()
    __session = '0'
    __paths_per_disk_local = dict()
    __paths_per_session = dict()
    __total_cluster_paths = 0
    __iqn_tpgs = dict()
    __local_ips = set()
    __backstore = set()
    __current_lock_index = None
    __image_name_prefix = ""
    __cluster_info = configuration().get_cluster_info()
    __node_info = configuration().get_node_info()
    __exception_retry_timeout = 0
    __failure_timeout = timedelta(minutes=5) + datetime.utcnow()
    __acquire_warning_counter = 0
    __last_acquire_succeeded = True
    __paths_consul_unlocked_firstborn = dict()
    __paths_consul_unlocked_siblings = dict()
    __paths_consul_locked_node = set()
    __disk_consul_stopped = set()
    __ignored_acquire_paths = dict()
    __force_acquire_paths = dict()

    is_service_running = False

    def __init__(self):
        if Service.is_service_running:
            logger.error("The service is already running.")
            raise Exception("The service is already running.")
        Service.is_service_running = True

    def start(self):
        self.__image_name_prefix = self.__app_conf.get_image_name_prefix()
        # Handle the case where the cluster has just started
        if self.__node_info.is_management:
            clean_thread = threading.Thread(target=self.handle_cluster_startup)
            clean_thread.start()

        logger.info("Service is starting.")

        keep_resources_flag_path = ConfigAPI().get_keep_resources_flag_path()
        keep_resources_flag = False
        clean = True

        # If the flag file exists, set keep_resources_flag to True and remove the file
        if os.path.exists(keep_resources_flag_path):
            keep_resources_flag = True
            os.remove(keep_resources_flag_path)

        # If no upgrade is needed, get a new session
        if not keep_resources_flag:
            try:
                self.__session = ConsulAPI().get_new_session_ID(
                    self.__session_name, self.__node_info.name)
            except Exception as e:
                logger.error(e)
                self.__session = "0"

            if not self.__session or self.__session is None:
                self.__session = "0"

        # If an upgrade is needed, reuse the current session to keep the consul resources
        else:
            keep_resources_flag = False
            try:
                sessions = ConsulAPI().get_sessions_dict(
                    'iSCSITarget', self.__node_info.name)
                if sessions is not None and len(sessions) == 1:
                    consul_session = sessions.values()[0]
                    self.__session = consul_session.ID
                    clean = False
                else:
                    self.__session = "0"

            except Exception as ex:
                logger.error("Could not get Consul sessions")
                logger.exception(ex)
                self.__session = "0"

            if clean:
                self.__clean()

        while True:
            try:
                if self.__session == "0":
                    self.__session = ConsulAPI().get_new_session_ID(
                        self.__session_name, self.__node_info.name)

                consul_api = ConsulAPI()
                self.__current_lock_index = consul_api.current_index()
                if not self.__current_lock_index:
                    sleep(1)
                    continue
                self.__process()
                old_index = self.__current_lock_index
                self.__current_lock_index = consul_api.watch(
                    self.__current_lock_index)
                if old_index != self.__current_lock_index:
                    # Give a chance to get all changes that occurred at the same time in consul.
                    sleep(2)

                self.__exception_retry_timeout = 0
                self.__failure_timeout = timedelta(
                    minutes=self.__app_conf.get_failure_timeout_duration_min(
                    )) + datetime.utcnow()
            except (ConnectionError, RetryConsulException) as ex:
                logger.error("Error on consul connection.")
                logger.exception(ex)
                self.__exception_retry_timeout += 5
            except Exception as ex:
                logger.error("Error during process.")
                logger.exception(ex)
                self.__exception_retry_timeout += 1

            sleep(self.__exception_retry_timeout)
            if self.__exception_retry_timeout > 10:
                logger.warning(
                    "PetaSAN could not complete process, there are too many exceptions."
                )
                self.__exception_retry_timeout = 1
            sleep(self.__exception_retry_timeout)

            # Clean all installed configurations if the service did not complete successfully for 5 minutes.
            if self.__failure_timeout < datetime.utcnow():
                logger.warning(
                    "There are too many exceptions. Service will clean this node."
                )
                self.__clean()
                self.__session = "0"
                self.__failure_timeout = timedelta(
                    minutes=self.__app_conf.get_failure_timeout_duration_min(
                    )) + datetime.utcnow()

    def __process(self):
        logger.debug("Start process, node session id is {}.".format(
            self.__session))
        self.__last_acquire_succeeded = True
        self.__ignored_acquire_paths = dict()
        while self.__do_process() != True:
            pass
        logger.debug("End process.")

    def __do_process(self):
        self.__paths_local = set()
        self.__paths_per_disk_local = dict()
        self.__paths_per_session = dict()
        self.__iqn_tpgs = dict()
        self.__local_ips = set()
        self.__backstore = set()
        self.__paths_consul_unlocked_firstborn = dict()
        self.__paths_consul_unlocked_siblings = dict()
        self.__paths_consul_locked_node = set()
        self.__disk_consul_stopped = set()
        self.__force_acquire_paths = dict()

        self.__read_resources_local()
        self.__read_resources_consul()

        state_change = False

        # ====== Step 1: delete any local paths not locked by us in consul ======
        for path in self.__paths_local:
            if path not in self.__paths_consul_locked_node:
                state_change = True
                self.__clean_local_path(path)

        if state_change:
            logger.info(
                "PetaSAN cleaned local paths not locked by this node in consul."
            )
            return False  # refresh and reprocess

        # ====== Step 2: remove any consul locks we have but not configured locally  ======
        for path in self.__paths_consul_locked_node:
            if path not in self.__paths_local:
                state_change = True
                self.__unlock_consul_path(path)

        if state_change:
            logger.info(
                "PetaSAN unlocked any consul locks not configured in this node."
            )
            return False  # refresh and reprocess

        # ====== Step 3: handle stopped disks  ======
        for disk in self.__disk_consul_stopped:
            self.__stop_disk(disk)

        # ====== Step 4: Clean any unused iqns ======
        if self.__clean_unused_iqns():
            logger.info("PetaSAN cleaned iqns.")
            return False  # refresh and reprocess

        # ====== Step 5: Clean any unused rbd backstores ======
        if self.__clean_unused_rbd_backstore():
            logger.info("PetaSAN Cleaned rbd backstores.")
            return False  # refresh and reprocess

        # ====== Step 6: Clean any unused ips ======
        self.__clean_unused_ips()

        # ====== Step 7: Clean any unused mapped rbd images ======
        self.__clean_unused_rbd_images()

        # ====== Step 8: try to acquire unlocked  paths  ======
        if len(self.__force_acquire_paths) > 0:
            path, value = self.__force_acquire_paths.items()[0]
            if path:
                self.__acquire_path(str(path), value)
                return False

        if len(self.__paths_consul_unlocked_firstborn) > 0:
            path = random.sample(self.__paths_consul_unlocked_firstborn, 1)[0]
            self.__wait_before_lock(path)
            self.__acquire_path(
                str(path), self.__paths_consul_unlocked_firstborn.get(path))
            return False

        if len(self.__paths_consul_unlocked_siblings) > 0:
            path = random.sample(self.__paths_consul_unlocked_siblings, 1)[0]
            self.__wait_before_lock(path)
            self.__acquire_path(
                str(path), self.__paths_consul_unlocked_siblings.get(path))
            return False

        return True

    def __read_resources_local(self):
        logger.debug("Start read local resources.")
        lio_api = LioAPI()
        try:
            self.__backstore = lio_api.get_backstore_image_names()
            self.__iqn_tpgs = lio_api.get_iqns_with_enabled_tpgs()
            for iqn, tpgs in self.__iqn_tpgs.iteritems():
                disk_id = str(iqn).split(":")[1]
                for tpg_index, ips in tpgs.iteritems():
                    self.__paths_local.add("/".join([disk_id, str(tpg_index)]))
                    if ips and len(ips) > 0:
                        for ip in ips:
                            self.__local_ips.add(ip)
        except Exception as e:
            logger.error("Could not read consul resources.")
            raise e
        logger.debug("End read local resources.")

    def __read_resources_consul(self):
        logger.debug("Start read resources consul.")
        self.__paths_per_session = {}
        self.__total_cluster_paths = 0
        unlock_kvs = set()
        consul_api = ConsulAPI()
        try:
            disk_kvs = consul_api.get_disk_kvs()
            for kv in disk_kvs:
                key = str(kv.Key).replace(
                    self.__app_conf.get_consul_disks_path(), "")
                disk_id = str(key).split('/')[0]
                if disk_id in self.__disk_consul_stopped:
                    continue
                if kv.Value == "disk":
                    disk_id = str(key).split('/')[0]
                    self.__paths_per_disk_local[disk_id] = 0
                    if str(kv.Flags) == "1":
                        self.__disk_consul_stopped.add(disk_id)
                    continue
                # Count paths in the cluster.
                self.__total_cluster_paths += 1

                if hasattr(kv, "Session"):
                    # locked paths
                    if kv.Session == self.__session:
                        self.__paths_consul_locked_node.add(key)
                        disk_paths_count = self.__paths_per_disk_local.get(
                            disk_id, 0) + 1
                        self.__paths_per_disk_local[disk_id] = disk_paths_count
                    # Total count of paths for each session
                    if self.__paths_per_session.has_key(kv.Session):
                        count = self.__paths_per_session.get(kv.Session)
                        self.__paths_per_session[kv.Session] = count + 1
                    else:
                        self.__paths_per_session[kv.Session] = 1
                # unlocked paths
                elif not hasattr(kv, "Session"):
                    unlock_kvs.add(kv)
            # Filter unlocked paths
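            # Unlocked paths are split into two buckets below:
            #   - "siblings": this node already holds at least one locked path
            #     of the same disk, so acquiring another keeps them together.
            #   - "firstborn": this node holds no path for that disk yet.
            # __do_process tries firstborn paths before siblings, and
            # __wait_before_lock adds an extra delay per local sibling path.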
            reassignments = None
            if len(unlock_kvs) > 0:
                reassignments = MangePathAssignment().get_forced_paths()
            for kv in unlock_kvs:
                key = str(kv.Key).replace(
                    self.__app_conf.get_consul_disks_path(), "")
                if reassignments:
                    path_assignment_info = reassignments.get(key)
                    if path_assignment_info and path_assignment_info.target_node == self.__node_info.name:
                        self.__force_acquire_paths[key] = kv
                        continue
                    else:
                        self.__ignored_acquire_paths[key] = kv
                        continue

                disk_id = str(key).split('/')[0]
                if self.__paths_per_disk_local.get(disk_id, 0) > 0:
                    self.__paths_consul_unlocked_siblings[key] = kv
                else:
                    self.__paths_consul_unlocked_firstborn[key] = kv
        except Exception as e:
            logger.error("Could not read consul resources.")
            logger.exception(e)
            raise e
        logger.debug("End read resources consul.")

    def __clean_local_path(self, path):
        disk_id, path_index = str(path).split("/")
        logger.debug("Start clean disk path {}.".format(path))
        image_name = self.__image_name_prefix + str(disk_id)
        ceph_api = CephAPI()
        lio_api = LioAPI()
        network_api = NetworkAPI()

        try:

            # Get iqn.
            logger.debug("Start get disk meta to clean path {}.".format(path))
            # iqn = ceph_api.get_disk_meta(disk_id, pool).iqn
            iqn = self._get_iqn_by_disk(disk_id)
            logger.debug("End get disk meta to clean path {}.".format(path))
            # Get tpgs for iqn.
            tpgs = self.__iqn_tpgs.get(iqn, None)
            if not iqn or not tpgs or len(tpgs) == 0:
                logger.info("Could not find ips for %s " % image_name)
            # Remove the assigned ips from our interfaces
            elif tpgs and len(tpgs) > 0:
                # Get assigned ips for each path.
                for tpg, ips in tpgs.iteritems():
                    if tpg == path_index:
                        for ip in ips:
                            logger.debug(
                                "Delete ip {} to clean path {}.".format(
                                    ip, path))
                            if not network_api.delete_ip(
                                    ip, self.__cluster_info.iscsi_1_eth_name):
                                network_api.delete_ip(
                                    ip, self.__cluster_info.iscsi_2_eth_name)

                        lio_api.disable_path(iqn, path_index)
                        logger.info("Cleaned disk path {}.".format(path))
                        break
        except Exception as e:
            logger.error("Could not clean disk path for %s" % image_name)
            raise e
        logger.debug("End clean disk path {}.".format(path))
        return

    # If all tpgs related to an iqn are disabled, the system will remove the iqn.
    def __clean_unused_iqns(self):
        status = False
        lio_api = LioAPI()
        for iqn in lio_api.get_unused_iqns():
            disk_id = str(iqn).split(":")[1]
            image_name = self.__image_name_prefix + str(disk_id)
            lio_api.delete_target(image_name, iqn)
            CephAPI().unmap_image(image_name)
            status = True
            logger.debug("Clean unused iqn {}.".format(iqn))
        return status

    def __clean_unused_rbd_backstore(self):
        status = False
        iqns = self.__iqn_tpgs.keys()
        for rbd_backstore in self.__backstore:
            rbd_backstore_disk_id = str(rbd_backstore).replace(
                self.__image_name_prefix, "")
            is_used = False
            for iqn in iqns:
                disk_id = str(iqn).split(":")[1]
                if disk_id == rbd_backstore_disk_id:
                    is_used = True
                    break
            if not is_used:
                LioAPI().delete_backstore_image(rbd_backstore)
                logger.debug(
                    "Clean unused lio backstore {}.".format(rbd_backstore))
                status = True
        return status

    def __clean_unused_ips(self):
        ips = Network().get_all_configured_ips()
        for ip, eth_name in ips.iteritems():
            ip, netmask = str(ip).split("/")
            if ip not in self.__local_ips and ip != self.__node_info.backend_1_ip and \
                            ip != self.__node_info.backend_2_ip and ip != self.__node_info.management_ip:
                NetworkAPI().delete_ip(ip, eth_name, netmask)
                logger.debug("Clean unused ip {} on interface {}.".format(
                    ip, eth_name))

    def __clean_unused_rbd_images(self):
        ceph_api = CephAPI()
        rbd_images = ceph_api.get_mapped_images()
        if rbd_images is None:
            return
        for image, mapped_count in rbd_images.iteritems():
            if image not in self.__backstore:
                if int(mapped_count) > 0:
                    for i in range(0, int(mapped_count)):
                        ceph_api.unmap_image(image)
                        logger.debug("Unmapped unused image {}.".format(image))

    def __unlock_consul_path(self, path):
        try:
            logger.debug("Unlock {} path locked by session {}.".format(
                path, self.__session))
            consul_api = ConsulAPI()
            consul_api.release_disk_path(
                self.__app_conf.get_consul_disks_path() + path, self.__session,
                None)
            logger.info("Unlock path %s" % path)
        except Exception as e:
            logger.error("Could not unlock path %s" % path)
            raise e

    def __stop_disk(self, disk_id):
        consul_api = ConsulAPI()
        ceph_api = CephAPI()
        lio_api = LioAPI()
        network_api = NetworkAPI()
        logger.info("Stopping disk %s" % disk_id)
        image_name = self.__image_name_prefix + str(disk_id)

        try:
            # Get iqn.
            #iqn = ceph_api.get_disk_meta(disk_id, pool).iqn
            iqn = self._get_iqn_by_disk(disk_id)
            # Get tpgs for iqn.
            tpgs = self.__iqn_tpgs.get(iqn, None)
            if not iqn or not tpgs or len(tpgs) == 0:
                logger.error("Could not find ips for %s " % image_name)
            # Remove the assigned ips from our interfaces
            elif tpgs and len(tpgs) > 0:
                # Get assigned ips for each path.
                for tpg, ips in tpgs.iteritems():
                    for ip in ips:
                        if not network_api.delete_ip(
                                ip, self.__cluster_info.iscsi_1_eth_name):
                            network_api.delete_ip(
                                ip, self.__cluster_info.iscsi_2_eth_name)

            lio_api.delete_target(image_name, iqn)
            ceph_api.unmap_image(image_name)
            sleep(2)
            pool = self._get_pool_by_disk(disk_id)
            if not pool:
                logger.error('Could not find pool for disk ' + disk_id)
                return
            if not ceph_api.is_image_busy(image_name, pool):
                consul_api.delete_disk(
                    self.__app_conf.get_consul_disks_path() + disk_id, None,
                    True)
                logger.info(
                    "PetaSAN removed key of stopped disk {} from consul.".
                    format(disk_id))
        except Exception as e:
            logger.info("Could not stop  disk %s" % disk_id)
        return

    def __acquire_path(self, path, consul_kv):
        if self.__ignored_acquire_paths.get(path):
            logger.info("Ignore forced path {}".format(path))
            return
        logger.debug("Start acquire path {} by node session {}.".format(
            path, self.__session))
        consul_api = ConsulAPI()
        ceph_api = CephAPI()
        lio_api = LioAPI()
        network_api = NetworkAPI()
        config = configuration()
        try:
            disk_id, path_index = str(path).split("/")
            pool = self._get_pool_by_disk(disk_id)
            if not pool:
                logger.error('Could not find pool for disk ' + disk_id)
                return
            image_name = self.__image_name_prefix + disk_id
            logger.debug(
                "Start read image meta for acquire path {}.".format(path))
            all_image_meta = ceph_api.read_image_metadata(image_name, pool)
            petasan_meta = all_image_meta.get(
                self.__app_conf.get_image_meta_key())
            disk_meta = DiskMeta()
            disk_meta.load_json(petasan_meta)
            logger.debug(
                "End read image meta for acquire path {}.".format(path))

            logger.debug("Try to acquire path {}.".format(path))
            node_name = config.get_node_name()
            result = consul_api.lock_disk_path(
                self.__app_conf.get_consul_disks_path() + path, self.__session,
                node_name, str(consul_kv.CreateIndex))
            if not result:
                logger.info("Could not lock path {} with session {}.".format(
                    path, self.__session))
            elif result:
                if consul_kv.Value != None and len(str(
                        consul_kv.Value)) > 0 and node_name != str(
                            consul_kv.Value):
                    logger.info("The path {} was locked by {}.".format(
                        path, str(consul_kv.Value)))
                    logger.debug("Node {} will kill node {}.".format(
                        config.get_node_name(), str(consul_kv.Value)))
                    self.__fencing(str(consul_kv.Value))

                # we locked it
                if disk_meta.paths:
                    # if lio has the image name in its backstore already, do not perform rbd mapping
                    if image_name not in self.__backstore:
                        status = ceph_api.map_iamge(image_name, pool)
                    else:
                        status = Status.done
                    if Status.done == status:
                        # Get path info from metadata
                        path_obj = disk_meta.get_paths()[int(path_index) - 1]
                        # add path ips to our network interfaces
                        network_api.add_ip(path_obj.ip, path_obj.subnet_mask,
                                           path_obj.eth, path_obj.vlan_id)
                        #update neighbors arp table
                        network_api.update_neighbors_arp(
                            path_obj.ip, path_obj.eth)
                        # add new target in lio if not there already
                        if not lio_api.is_backstore_image_found(image_name):
                            # Give the ceph image mapping time to complete its job
                            sleep(3)
                            # Add rbd backstores and target
                            status = lio_api.add_target(
                                disk_meta, disk_meta.pool)
                            """
                            wwn = self.calculate_disk_wwn(disk_meta)
                            status = lio_api.add_target(disk_meta, wwn, disk_meta.pool)
                            """
                        if Status.done == status:
                            # enable the path we locked
                            self.__last_acquire_succeeded = True
                            lio_api.enable_path(disk_meta.iqn, path_index,
                                                True)
                            logger.info("Path %s acquired successfully" % path)

                            if self.__acquire_warning_counter > 2:
                                logger.info(
                                    "PetaSAN finally succeeded to acquire path after retrying {} times."
                                    .format(self.__acquire_warning_counter))
                                self.__acquire_warning_counter = 0
                            path_assignment_info = self.__force_acquire_paths.get(
                                path)

                            if path_assignment_info:
                                MangePathAssignment().update_path(
                                    path_obj.ip, ReassignPathStatus.succeeded)
                        else:
                            path_assignment_info = self.__force_acquire_paths.get(
                                path)
                            if path_assignment_info:
                                logger.info(
                                    "Failed to acquire forced path {}".format(path))
                                MangePathAssignment().update_path(
                                    path_obj.ip, ReassignPathStatus.failed)
                            self.__last_acquire_succeeded = False
                            self.__acquire_warning_counter += 1
                            if self.__acquire_warning_counter > 2:
                                logger.warning(
                                    "PetaSAN failed to acquire path after {} times."
                                    .format(self.__acquire_warning_counter))
                            logger.error("Error could not acquire path %s" %
                                         path)

                    else:
                        self.__unlock_consul_path(path)

        except Exception as e:
            logger.info("---------------------------------")
            logger.error(str(e.message) + "\n")
            logger.exception(e)

            if str(e.message).find("invalid session") > -1:
                logger.error("Session is invalid")
                try:
                    logger.info("Trying to create new session id")
                    self.__session = ConsulAPI().get_new_session_ID(
                        self.__session_name, self.__node_info.name)
                    logger.info("New session id is {}".format(self.__session))
                    logger.info("Cleaning all mapped disks from old session")
                    self.__clean()
                except Exception as ex:
                    logger.exception(ex)
            logger.exception("Could not acquire path %s" % path)
            raise e
        logger.debug("End acquire path {}.".format(path))
        return

    def __clean(self):
        logger.info("Cleaning unused configurations. ")
        logger.info("Cleaning all mapped disks")
        ceph_api = CephAPI()
        lio_api = LioAPI()
        network_api = NetworkAPI()
        # Get tpgs of each iqn
        for iqn, tpgs in lio_api.get_iqns_with_tpgs().iteritems():
            try:
                disk_id = str(iqn).split(":")[1]
                # Get assigned ips for each tpg
                for tpg, ips in tpgs.iteritems():
                    if ips and len(ips) > 0:
                        for ip in ips:
                            # 1- Remove ip from network interface.
                            if not network_api.delete_ip(
                                    ip, self.__cluster_info.iscsi_1_eth_name):
                                network_api.delete_ip(
                                    ip, self.__cluster_info.iscsi_2_eth_name)

                # 2- Delete the iqn, delete the image from the rbd backstore and unmap the image.
                image_name = self.__image_name_prefix + str(disk_id)
                lio_api.delete_target(image_name, iqn)
                ceph_api.unmap_image(image_name)

            except Exception as e:
                logger.error("Error cleaning all mapped disks, disk %s " %
                             image_name)
                logger.exception(e.message)
        # 3- From backstore
        for image_name in lio_api.get_backstore_image_names():
            try:
                lio_api.delete_backstore_image(image_name)
                ceph_api.unmap_image(image_name)
            except Exception as e:
                logger.error("Error cleaning all mapped disks, disk %s " %
                             image_name)

        logger.info("Cleaning unused rbd images.")
        try:
            self.__clean_unused_rbd_images()
        except:
            logger.error("Error cleaning unused rbd images.")

        logger.info("Cleaning unused ips.")
        try:
            self.__local_ips = set()
            self.__clean_unused_ips()
        except:
            logger.error("Cleaning unused ips.")

    def __wait_before_lock(self, path=None):

        wait_time = 0
        if path:
            disk_id, path_index = str(path).split("/")
            # 1- Calc wait time if the path has siblings on this node.
            wait_time = int(self.__app_conf.get_siblings_paths_delay()) * int(
                self.__paths_per_disk_local.get(disk_id, 0))

        logger.debug("Wait time for siblings is {}.".format(wait_time))
        total_nodes = len(ConsulAPI().get_consul_members())
        # 2- Calc average paths per node.
        average_node_paths = float(
            self.__total_cluster_paths) / float(total_nodes)
        # Calc the percent of local paths according to average paths.
        percent = float(self.__paths_per_session.get(self.__session,
                                                     0)) / average_node_paths
        # 3- Calc total wait time
        if self.__last_acquire_succeeded:
            wait_time += int(
                self.__app_conf.get_average_delay_before_lock()) * percent
        else:
            logger.debug("Skipping wait time for average delay.")
        logger.debug(
            "Wait time depending on average and siblings is {}.".format(
                math.ceil(wait_time)))
        sleep(math.ceil(wait_time))
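
    # Worked example of the back-off above (illustrative numbers, not taken
    # from the source): with get_siblings_paths_delay() = 5 and 2 paths of the
    # same disk already locked locally, the sibling wait is 5 * 2 = 10s. With
    # 3 consul members and 12 total cluster paths, the average per node is 4;
    # if this session already holds 4 paths the load percent is 4 / 4 = 1.0,
    # and with get_average_delay_before_lock() = 3 the final wait is
    # ceil(10 + 3 * 1.0) = 13 seconds.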

    def __wait_after_lock(self):
        pass

    def __fencing(self, node_name):
        maintenance = ManageMaintenance()
        if maintenance.get_maintenance_config(
        ).fencing == MaintenanceConfigState.off:
            logger.warning(
                "Fencing action will not fire the admin stopped it,the cluster is in maintenance mode."
            )
            return

        node_list = ConsulAPI().get_node_list()
        for node in node_list:

            if str(node.name) == node_name:
                if Network().ping(node.backend_2_ip):
                    logger.info("This node will stop node {}/{}.".format(
                        node_name, node.backend_2_ip))
                    ssh().call_command(node.backend_2_ip, " poweroff ", 5)
                    break
                elif Network().ping(node.management_ip):
                    logger.info("This node will stop node {}/{}.".format(
                        node_name, node.management_ip))
                    ssh().call_command(node.management_ip, " poweroff ", 5)
                    break
                elif Network().ping(node.backend_1_ip):
                    logger.info("This node will stop node {}/{}.".format(
                        node_name, node.backend_1_ip))
                    ssh().call_command(node.backend_1_ip, " poweroff ", 5)
                    break

    def handle_cluster_startup(self):
        i = 0
        consul_api = ConsulAPI()
        logger.debug("Check cluster startup.")
        while True:
            try:

                current_node_name = self.__node_info.name
                result = consul_api.set_leader_startup_time(
                    current_node_name, str(i))
                if i == 0 and not result:
                    sleep(2)
                    continue
                elif result:
                    # value returned, consul is up and running
                    sleep(2)
                    number_of_started_nodes = 0
                    for kv in consul_api.get_leaders_startup_times():
                        node_name = str(kv.Key).replace(
                            ConfigAPI().get_consul_leaders_path(), "")
                        if node_name != current_node_name:
                            if int(kv.Value) == 0:
                                number_of_started_nodes += 1

                    logger.debug("Number of started nodes = {}.".format(
                        number_of_started_nodes))
                    # Another management node is just starting
                    if i == 0 and number_of_started_nodes > 0:
                        logger.info(
                            "Cluster is just starting, system will delete all active disk resources"
                        )
                        consul_api.delete_disk(
                            ConfigAPI().get_consul_disks_path(), recurse=True)
                i += 1
                sleep(58)

            except Exception as ex:
                logger.debug("Start up error")
                logger.exception(ex)
                # maybe other management nodes are starting, give them a chance to start
                if i == 0:
                    sleep(2)
                else:
                    i += 1
                    sleep(58)

    def _get_pool_by_disk(self, disk_id):
        consul_api = ConsulAPI()
        ceph_api = CephAPI()
        pool = consul_api.get_disk_pool(disk_id)
        if pool:
            logger.info('Found pool:{} for disk:{} via consul'.format(
                pool, disk_id))
            return pool
        pool = ceph_api.get_pool_bydisk(disk_id)
        if pool:
            logger.info('Found pool:{} for disk:{} via ceph'.format(
                pool, disk_id))
            return pool

        logger.error('Could not find pool for disk ' + disk_id)
        return None

    def _get_iqn_by_disk(self, disk_id):
        for iqn in self.__iqn_tpgs:
            if disk_id == str(iqn).split(":")[1]:
                return iqn
        return None
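

# A minimal sketch of how the Service class above might be started; the
# __main__ guard is an illustration, not part of the original code. The
# constructor raises if another instance is already running and start()
# loops forever, so this would typically run under a process supervisor.
if __name__ == '__main__':
    Service().start()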
def prepare_osd(device_name, journal=None):
    config = configuration()
    storage_engine = config.get_cluster_info().storage_engine
    if storage_engine == "filestore":
        return prepare_filestore(device_name, journal)
    return prepare_bluestore(device_name, journal)
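
# Illustrative usage (the device names here are assumptions): prepare an OSD
# on a given disk with an optional journal/db device; the actual engine is
# chosen above from the cluster's storage_engine setting.
# prepare_osd("sdb", journal="sdc")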

import sys
from PetaSAN.core.ceph.deploy.build import mon_status_check
from PetaSAN.core.cluster.configuration import configuration
from PetaSAN.core.common.log import logger

cluster_conf = configuration()
quorum_size = "0"
if cluster_conf.are_all_mgt_nodes_in_cluster_config():
    try:
        quorum_size = len(mon_status_check().get('quorum'))
        sys.stdout.write(str(quorum_size))

        if int(quorum_size) == 3:
            sys.exit(0)  #  healthy cluster

    except Exception as ex:
        logger.error("Cluster not running")
        sys.stdout.write("-1")
        sys.exit(-1)

else:
def clear_disk(args):
    disk_id = args.disk_id
    image_name = "image-" + disk_id

    try:
        # Get which ceph user is using this function & get his keyring file path #
        # ---------------------------------------------------------------------- #
        ceph_auth = CephAuthenticator()

        config = configuration()
        cluster_name = config.get_cluster_name()

        # Get disk metadata :
        # -------------------
        ceph_api = CephAPI()
        disk_metadata = ceph_api.get_diskmeta(disk_id)

        # Get pool name :
        # ---------------
        pool_name = disk_metadata.pool
        data_pool = ""

        # Check if disk has been created on replicated pool or erasure pool :
        # -------------------------------------------------------------------
        if len(disk_metadata.data_pool) > 0:
            data_pool = disk_metadata.data_pool

        tmp_image_name = "tmp_disk_" + disk_metadata.id

        # (1.) Check if a previous tmp image for this disk still exists :
        # ===================================================================
        images_list = ceph_api.get_all_images(pool_name)

        for image in images_list:
            if tmp_image_name in image:
                # Delete image #
                cmd = "rbd rm {}/{} {} --cluster {}".format(
                    pool_name, image, ceph_auth.get_authentication_string(),
                    cluster_name)
                if not call_cmd(cmd):
                    print(
                        "Error : clear_disk.py script : cannot remove tmp image ,\ncmd : "
                        + cmd)
                    sys.exit(-1)

        print(
            "Stage 1 :\n\tCheck if a previous tmp image for this disk still exists > (Completed)"
        )
        logger.info(
            "Stage 1 :\n\tCheck if a previous tmp image for this disk still exists > (Completed)"
        )

        # (2.) Stop old disk :
        # ====================
        consul_api = ConsulAPI()
        kv = consul_api.find_disk(disk_id)
        if kv is not None:
            manage_disk = ManageDisk()
            status = manage_disk.stop(disk_id)

            if status != Status.done:
                print('Error : Cannot stop disk , id = ' + disk_id)
                sys.exit(-1)

            print("Stage 2 :\n\tStop old disk > (Completed)")
            logger.info("Stage 2 :\n\tStop old disk > (Completed)")
            time.sleep(3)

            # (3.) Check if old disk is stopped or not :
            # ==========================================
            if len(data_pool) > 0:
                pool_type = "erasure"
                _confirm_disk_stopped(data_pool, disk_id, pool_type)
            else:
                pool_type = "replicated"
                _confirm_disk_stopped(pool_name, disk_id, pool_type)

            print(
                "Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)"
            )
            logger.info(
                "Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)"
            )

        else:
            print("Stage 2 :\n\tStop old disk > (Completed)")
            logger.info("Stage 2 :\n\tStop old disk > (Completed)")

            print(
                "Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)"
            )
            logger.info(
                "Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)"
            )
            print('\tclear_disk.py script : disk {} is already stopped'.format(
                disk_id))

        # (4.) Create a tmp image (not PetaSAN image) :
        # =============================================
        # Generate a random value between 1 and 99999 #
        random_no = str(random.randint(1, 100000))
        tmp_image_name = tmp_image_name + "_" + str(random_no)
        image_size = disk_metadata.size * 1024

        if len(data_pool) > 0:
            cmd = "rbd create {}/{} --size {} --data-pool {} {} --cluster {}".format(
                pool_name, tmp_image_name, image_size, data_pool,
                ceph_auth.get_authentication_string(), cluster_name)
        else:
            cmd = "rbd create {}/{} --size {} {} --cluster {}".format(
                pool_name, tmp_image_name, image_size,
                ceph_auth.get_authentication_string(), cluster_name)

        if not call_cmd(cmd):
            print(
                "Error : clear_disk.py script : cannot create new tmp image ,\ncmd : "
                + cmd)
            sys.exit(-1)

        print("Stage 4 :\n\tCreate a tmp image called ( " + tmp_image_name +
              " ) > (Completed)")
        logger.info("Stage 4 :\n\tCreate a tmp image called ( " +
                    tmp_image_name + " ) > (Completed)")

        # (5.) Run script to copy "old disk" metadata to new "tmp_disk" :
        # ===============================================================
        metadata_script_file = ConfigAPI().get_disk_meta_script_path()

        # Function : read_disks_metadata :
        parser_key_1 = "read"
        arg_1 = "--image"
        arg_2 = "--pool"

        # Function : set_disk_metadata :
        parser_key_2 = "write"
        arg_3 = "--file"

        cmd = metadata_script_file + " " + parser_key_1 + " " + arg_1 + " " + image_name + " " + arg_2 + " " + pool_name +\
              " | " + metadata_script_file + " " + parser_key_2 + " " + arg_1 + " " + tmp_image_name + " " + arg_2 + " " + pool_name

        if not call_cmd(cmd):
            print(
                "Error : clear_disk.py script : cannot copy metadata from old disk to new tmp image ,\ncmd : "
                + cmd)
            sys.exit(-1)

        print(
            "Stage 5 :\n\tRun script to copy 'old disk' metadata to new 'tmp_disk' > (Completed)"
        )
        logger.info(
            "Stage 5 :\n\tRun script to copy 'old disk' metadata to new 'tmp_disk' > (Completed)"
        )

        time.sleep(3)

        # (6.) Remove metadata of old disk :
        # ===========================================================
        old_image_name = str(ceph_api.conf_api.get_image_name_prefix() +
                             disk_metadata.id)
        confirm = ceph_api.remove_disk_metadata(old_image_name,
                                                disk_metadata.pool)

        if not confirm:
            print(
                "Error : clear_disk.py script : cannot remove metadata of old disk"
            )
            # sys.exit(-1)

        print("Stage 6 :\n\tRemove metadata of old disk > (Completed)")
        logger.info("Stage 6 :\n\tRemove metadata of old disk > (Completed)")

        # (7.) Rename the old disk image to "deleted-" + disk_id + random_no :
        # ======================================================================
        new_image_name = "deleted-" + disk_metadata.id + "-" + random_no
        cmd = "rbd mv {}/{} {} {} --cluster {}".format(
            pool_name, image_name, new_image_name,
            ceph_auth.get_authentication_string(), cluster_name)
        if not call_cmd(cmd):
            print(
                "Error : clear_disk.py script : cannot rename old image from {} to {} ,\ncmd : {}"
                .format(image_name, new_image_name, cmd))
            sys.exit(-1)

        print("Stage 7 :\n\tRename old disk image name with ( " +
              new_image_name + " ) > (Completed)")
        logger.info("Stage 7 :\n\tRename old disk image name with ( " +
                    new_image_name + " ) > (Completed)")

        time.sleep(5)

        # (8.) Rename "tmp_disk" with old disk image name :
        # =================================================
        cmd = "rbd mv {}/{} {} {} --cluster {}".format(
            pool_name, tmp_image_name, image_name,
            ceph_auth.get_authentication_string(), cluster_name)
        if not call_cmd(cmd):
            print(
                "Error : clear_disk.py script : cannot rename \"tmp_disk\" from {} to {} ,\ncmd : {}"
                .format(tmp_image_name, image_name, cmd))
            sys.exit(-1)

        print(
            "Stage 8 :\n\tRename 'tmp_disk' with old disk image name > (Completed)"
        )
        logger.info(
            "Stage 8 :\n\tRename 'tmp_disk' with old disk image name > (Completed)"
        )

        time.sleep(5)

        jm = JobManager()
        id = jm.add_job(JobType.DELETE_DISK, new_image_name + ' ' + pool_name)

        print("Stage 9 :\n\tStart a job to remove old disk image , job id = " +
              str(id))
        logger.info(
            "Stage 9 :\n\tStart a job to remove old disk image , job id = " +
            str(id))

        sys.exit(0)

    except PoolException as e:
        print("Error : PoolException , {}".format(e.message))
        logger.error("Clear Disk Error : PoolException , {}".format(e.message))
        sys.exit(-1)

    except DiskListException as e:
        print("Error : DiskListException , {}".format(e.message))
        logger.error("Clear Disk Error : DiskListException , {}".format(
            e.message))
        sys.exit(-1)

    except CephException as e:
        if e.id == CephException.GENERAL_EXCEPTION:
            print("Error : CephException , {}".format(e.message))
        logger.error("Clear Disk Error : CephException , {}".format(e.message))
        sys.exit(-1)

    except MetadataException as e:
        print("Error : MetadataException , {}".format(e.message))
        logger.error("Clear Disk Error : MetadataException , {}".format(
            e.message))
        sys.exit(-1)

    except Exception as e:
        print("Error : Exception , {}".format(e.message))
        logger.error("Clear Disk Error : Exception , {}".format(e.message))
        sys.exit(-1)
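

# A minimal sketch of how clear_disk() above might be invoked from the command
# line; the parser and the "--disk_id" flag name are assumptions for
# illustration, only args.disk_id is taken from the function body above.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Clear a PetaSAN disk and recreate its image.')
    parser.add_argument('--disk_id', required=True, help='id of the disk to clear')
    clear_disk(parser.parse_args())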

from PetaSAN.core.cluster.configuration import configuration
from PetaSAN.core.common.cmd import *
from PetaSAN.core.entity.cluster import NodeInfo
import json
from PetaSAN.core.config.api import ConfigAPI

config = configuration()
node = config.get_node_info()
cluster = config.get_cluster_info()

conf = configuration()
cluster_info = conf.get_cluster_info()

first_cluster_node = cluster_info.management_nodes[0]
first_node_info = NodeInfo()
first_node_info.load_json(json.dumps(first_cluster_node))

second_cluster_node = cluster_info.management_nodes[1]
second_node_info = NodeInfo()
second_node_info.load_json(json.dumps(second_cluster_node))

call_cmd('python' + ConfigAPI().get_consul_start_up_script_path() +
 def __init__(self):
     self.node_name = configuration().get_node_name()
     self.max_limit = self.MAX_SEND_COUNT
     self.metrics_list = []
Example #30
0

def __get_pre_config_disks():
    disks = PreConfigStorageDisks()

    try:
        with open(ConfigAPI().get_node_pre_config_disks(), 'r') as f:
            data = json.load(f)
            disks.load_json(json.dumps(data))
            return disks
    except:
        return disks


# print subprocess.call("ceph-disk prepare --cluster ceph --zap-disk --fs-type xfs /dev/sdj /dev/sdh",shell=True)
cluster_name = configuration().get_cluster_name()
status = StatusReport()

status.success = False

try:
    cm = CacheManager()
    node_name = configuration().get_node_info().name
    storage_engine = configuration().get_cluster_info().storage_engine
    if configuration().get_node_info().is_storage:
        disks = __get_pre_config_disks()

        if len(disks.journals) > 0:
            for d in disks.journals:
                ceph_disk_lib.clean_disk(d)
                add_journal(d)