Example #1

import os
import sys
from PetaSAN.core.cluster.configuration import configuration
from PetaSAN.core.common.log import logger
from PetaSAN.core.config.api import ConfigAPI
from PetaSAN.core.common.cmd import call_cmd

cluster_name = configuration().get_cluster_info().name
ceph_mon_keyring = ConfigAPI().get_ceph_mon_keyring(cluster_name)
ceph_client_admin_keyring = ConfigAPI().get_ceph_keyring_path(cluster_name)

try:

    cluster_conf = configuration()
    current_node_info = cluster_conf.get_node_info()
    current_node_name = current_node_info.name
    current_cluster_info = cluster_conf.get_cluster_info()
    config_api = ConfigAPI()

    os.makedirs("/var/lib/ceph/mon/{}-{}".format(cluster_name,
                                                 current_node_name))

    os.makedirs("/tmp/{}".format(current_node_name))

from itertools import starmap
import os
from PetaSAN.core.config.api import ConfigAPI

is_read = False
old_msgs = dict()
new_msgs = dict()
new_megs_dis = dict()
list = []
message_path = ConfigAPI().get_messages_file_path()
message_path_description = message_path.replace(".txt", "_description.txt")


def read():
    global is_read
    print "file is reading now."
    if not os.path.exists(message_path) or not os.path.exists(
            message_path_description):
        raise Exception("Messages files not exists.")

    with open(message_path_description, 'r') as f:
        for line in f.read().splitlines():
            if len(line.strip()) == 0:
                continue
            line_description = line.split("#")
def startup_services(building_stage=False, cluster_complete=False):
    path = ConfigAPI().get_service_files_path()

    if not building_stage and cluster_complete:
        logger.info("Start settings IPs")
        call_cmd('python ' + ConfigAPI().get_node_start_ips_script_path())

        call_cmd('systemctl start ntp')
        call_cmd('systemctl start petasan-mount-sharedfs')
        NTPConf().force_ntp_sync()
        JobManager().remove_jobs_since(0)

        if cluster_config.get_node_info().is_management:
            call_cmd('python ' + ConfigAPI().get_consul_start_up_script_path())
            call_cmd('systemctl start glusterfs-server')
            call_cmd('systemctl start petasan-cluster-leader')
        else:
            call_cmd('python ' +
                     ConfigAPI().get_consul_client_start_up_script_path())

        logger.info("Starting cluster file sync service")
        call_cmd('systemctl start petasan-file-sync')

        call_cmd('/opt/petasan/scripts/load_iscsi_mods.sh')
        if cluster_config.get_node_info().is_iscsi:
            logger.info("Starting iSCSI Service")
            call_cmd('systemctl start petasan-iscsi')

        if cluster_config.get_node_info().is_management:
            logger.info("Starting Cluster Management application")
            call_cmd('systemctl start petasan-admin')
            # create Ceph manager if not already created
            #exec_command('python /opt/petasan/scripts/create_mgr.py 60 >/dev/null 2>&1 &')

        logger.info("Starting Node Stats Service")
        call_cmd('systemctl start petasan-node-stats')

        logger.info("Starting OSDs")
        call_cmd('systemctl restart petasan-start-osds')

        if cluster_config.get_node_info().is_backup:
            logger.info('Starting sync replication node service')
            call_cmd('systemctl restart petasan-sync-replication-node')

    elif building_stage:

        call_cmd('systemctl start petasan-mount-sharedfs')
        if cluster_config.get_node_info().is_management:
            call_cmd('systemctl start petasan-cluster-leader')

        logger.info("Starting cluster file sync service")
        call_cmd('systemctl start petasan-file-sync')

        call_cmd('/opt/petasan/scripts/load_iscsi_mods.sh')
        if cluster_config.get_node_info().is_iscsi:
            logger.info("Starting PetaSAN service")
            call_cmd('systemctl start petasan-iscsi')
            sleep(2)

        if cluster_config.get_node_info().is_management:
            logger.info("Starting Cluster Management application")
            call_cmd('systemctl start petasan-admin')

        logger.info("Starting Node Stats Service")
        call_cmd('systemctl start petasan-node-stats')

        logger.info("Starting OSDs")
        call_cmd('systemctl restart petasan-start-osds')

    elif not building_stage and not cluster_complete:
        logger.info("Start settings IPs")
        call_cmd('python ' + ConfigAPI().get_node_start_ips_script_path())
Example #4
    def replace(self, ip, password):
        config = configuration()
        ssh_obj = ssh()
        config_api = ConfigAPI()
        logger.info("Starting replace.")
        if os.path.exists(config_api.get_cluster_info_file_path()):
            os.remove(config_api.get_cluster_info_file_path())

        if ssh_obj.copy_public_key_from_host(ip, password):
            logger.info("Successfully copied public keys.")
            if ssh_obj.copy_private_key_from_host(ip, password):
                ssh_obj.create_authorized_key_file()
                logger.info("Successfully copied private keys.")

        else:
            raise SSHKeyException("Error copying keys")

        out, err = ssh_obj.exec_command(
            ip,
            "python {}".format(config_api.get_cluster_status_for_join_path()))
        out = int(out)
        if out == -1:
            raise ReplaceException("core_deploy_replace_mon_not_healthy_err")
        elif out == 0:
            raise ReplaceException(
                "core_deploy_replace_cluster_in_progress_err")
        elif out == 1:
            raise ReplaceException(
                "core_deploy_replace_two_management_node_down_err")
        elif out == 3:
            raise ReplaceException("core_deploy_replace_cluster_running_err")

        if not os.listdir(
                os.path.dirname(config_api.get_cluster_info_file_path())):
            os.makedirs(
                os.path.dirname(config_api.get_cluster_info_file_path()))

        logger.info("Starting to copy config file")
        if not ssh_obj.copy_file_from_host(
                ip, config_api.get_cluster_info_file_path()):
            raise Exception("Error copying  config file")

        logger.info("Successfully copied config file.")
        cluster_name = config.get_cluster_name(True)
        logger.info("Successfully joined to cluster {}".format(cluster_name))

        wrong_name = True
        wrong_ip = True
        for node_info in config.get_management_nodes_config():
            if node_info.name == config.get_node_name(
            ) or node_info.management_ip == Network().get_node_management_ip():
                if node_info.name == config.get_node_name():
                    wrong_name = False

                if node_info.management_ip == Network().get_node_management_ip(
                ):
                    wrong_ip = False

                if not wrong_name and not wrong_ip:
                    config.set_node_info(node_info, True)
                    open(config_api.get_replace_file_path(), 'w+').close()
                break

        if wrong_name and wrong_ip:
            os.remove(config_api.get_cluster_info_file_path())
            raise ReplaceException("core_deploy_replace_node_do_not_match_err")
        elif wrong_name:
            os.remove(config_api.get_cluster_info_file_path())
            raise ReplaceException(
                "core_deploy_replace_node_do_not_match_name_err")
        elif wrong_ip:
            os.remove(config_api.get_cluster_info_file_path())
            raise ReplaceException(
                "core_deploy_replace_node_do_not_match_ip_err")

        config.set_password(password)
        logger.info("password set successfully.")
        self.__copy_current_tunings(ip)
        return cluster_name
Example #5
    def process(self):
        logger.info("Start process reassignments paths.")
        max_retry = 100
        current_reassignments = self.get_current_reassignment()
        config = configuration()
        assignment_script_path = ConfigAPI().get_assignment_script_path()
        if current_reassignments is None:
            return
        for ip, path_assignment_info in current_reassignments.iteritems():
            logger.info("process path {} and its status is {}".format(
                ip, path_assignment_info.status))
            if path_assignment_info.status == ReassignPathStatus.pending:
                logger.info(
                    "Move action,try clean disk {} path {} remotely on node {}."
                    .format(path_assignment_info.disk_name,
                            path_assignment_info.disk_id,
                            path_assignment_info.node))

                status = False
                try:

                    cmd = "python {} path_host -ip {} -disk_id {}".format(
                        assignment_script_path, path_assignment_info.ip,
                        path_assignment_info.disk_id)
                    out, err = ssh().exec_command(path_assignment_info.node,
                                                  cmd)
                    logger.info(cmd)
                    # self.clean_source_node(path_assignment_info.ip,path_assignment_info.disk_id)
                except Exception as ex:
                    logger.exception(ex.message)
                    out = ""

                if str(out).strip() == "0":
                    logger.info("Move action passed")
                    status = True

                current_path_assignment_info = None
                if status:
                    for i in xrange(0, max_retry):
                        logger.debug(
                            "Wait to update status of path {}.".format(
                                path_assignment_info.ip))
                        sleep(0.25)
                        reassignments = self.get_current_reassignment()
                        if reassignments:
                            current_path_assignment_info = reassignments.get(
                                path_assignment_info.ip)
                            if current_path_assignment_info and current_path_assignment_info.status == ReassignPathStatus.moving:
                                continue
                            else:
                                logger.info(
                                    "Process completed for path {} with status {}."
                                    .format(
                                        current_path_assignment_info.ip,
                                        current_path_assignment_info.status))
                                break
                    if current_path_assignment_info and current_path_assignment_info.status == ReassignPathStatus.moving:
                        self.update_path(current_path_assignment_info,
                                         ReassignPathStatus.failed)
                        logger.info(
                            "Move action,failed ,disk {} path {}.".format(
                                path_assignment_info.disk_name,
                                path_assignment_info.disk_id,
                                path_assignment_info.node))

                else:
                    self.update_path(path_assignment_info,
                                     ReassignPathStatus.failed)
                    logger.info(
                        "Move action ,failed to clean disk {} path {} remotely on node ."
                        .format(path_assignment_info.disk_name,
                                path_assignment_info.disk_id,
                                path_assignment_info.node))
        sleep(10)  # wait for display status to user if needed
        logger.info("Process completed.")
        self.remove_assignment()
        ConsulAPI().drop_all_node_sessions(
            self.__app_conf.get_consul_assignment_path(),
            config.get_node_name())
def __start_leader_locally():
    PetaSAN.core.common.cmd.call_cmd(
        'python ' + ConfigAPI().get_consul_start_up_script_path())
    return
def build_monitors():
    cluster_name = configuration().get_cluster_name()
    ceph_mon_keyring = ConfigAPI().get_ceph_mon_keyring(cluster_name)
    ceph_client_admin_keyring = ConfigAPI().get_ceph_keyring_path(cluster_name)
    status = StatusReport()

    try:
        _fsid = uuid.uuid4()

        content = "[global]\n\
fsid = {fsid}\n\
mon_host = {mon_host}\n\
\n\
public_network = {public_network}\n\
cluster_network = {cluster_network}\n\
\n"

        cluster_config = configuration()
        current_node_info = cluster_config.get_node_info()

        current_node_name = current_node_info.name
        current_cluster_info = cluster_config.get_cluster_info()

        config_api = ConfigAPI()
        mon_hosts_backend_ip = []
        remote_mons_management_ips = []

        for i in current_cluster_info.management_nodes:
            node_info = NodeInfo()
            node_info.load_json(json.dumps(i))
            mon_hosts_backend_ip.append(node_info.backend_1_ip)
            if current_node_name != node_info.name:
                remote_mons_management_ips.append(node_info.management_ip)

        if not os.path.exists(config_api.get_cluster_ceph_dir_path()):
            os.makedirs(os.path.dirname(
                config_api.get_cluster_ceph_dir_path()))

        with open(
                config_api.get_cluster_ceph_dir_path() +
                "{}.conf".format(cluster_name),
                'w',
        ) as f:
            f.write(
                content.format(
                    fsid=_fsid,
                    public_network=str(
                        current_cluster_info.backend_1_base_ip) + "/" +
                    __get_net_size(str(current_cluster_info.backend_1_mask)),
                    cluster_network=str(
                        current_cluster_info.backend_2_base_ip) + "/" +
                    __get_net_size(str(current_cluster_info.backend_2_mask)),
                    mon_initial=cluster_config.get_node_name(),
                    mon_host=cluster_config.get_node_info().backend_1_ip +
                    ',' + ','.join(mon_hosts_backend_ip)) +
                cluster_config.get_ceph_tunings() + "\n")

        if not call_cmd(
                "ceph-authtool --create-keyring /tmp/{} --gen-key -n mon. --cap mon 'allow *'"
                .format(ceph_mon_keyring)):
            logger.error(
                "ceph-authtool --create-keyring for mon returned error")
            status.success = False

        # elif not call_cmd("".join(["ceph-authtool --create-keyring {}".format(ceph_client_admin_keyring),
        #                    " --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow'"])) :
        # Nautilius remove --set-uid=0

        elif not call_cmd("".join([
                "ceph-authtool --create-keyring {}".format(
                    ceph_client_admin_keyring),
                " --gen-key -n client.admin --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow'"
        ])):
            logger.error(
                "ceph-authtool --create-keyring for admin returned error")
            status.success = False

        elif not call_cmd("ceph-authtool /tmp/{} --import-keyring {}".format(
                ceph_mon_keyring, ceph_client_admin_keyring)):
            logger.error("ceph-authtool --import-keyring returned error")
            status.success = False

        elif not call_cmd(
                "monmaptool --create --add {} {} --fsid {} /tmp/monmap".format(
                    cluster_config.get_node_name(),
                    cluster_config.get_node_info().backend_1_ip, _fsid)):
            logger.error("monmaptool --create --add returned error")
            status.success = False

        if not os.path.exists("/var/lib/ceph/mon/{}-{}".format(
                cluster_name, current_node_name)):
            os.makedirs("/var/lib/ceph/mon/{}-{}".format(
                cluster_name, current_node_name))

        if not status.success or not call_cmd(
                "ceph-mon --cluster {} --mkfs -i {} --monmap /tmp/monmap --keyring /tmp/{}"
                .format(cluster_name, current_node_name, ceph_mon_keyring)):
            logger.error("ceph-mon --mkfs --add returned error")
            status.success = False

        open(
            "/var/lib/ceph/mon/{}-{}/done".format(cluster_name,
                                                  current_node_name),
            'w+').close()
        open(
            "/var/lib/ceph/mon/{}-{}/systemd".format(cluster_name,
                                                     current_node_name),
            'w+').close()

        call_cmd("chown -R ceph:ceph /var/lib/ceph/mon")

        call_cmd("systemctl enable ceph.target ")
        call_cmd("systemctl enable ceph-mon.target ")
        call_cmd("systemctl enable ceph-mon@{} ".format(current_node_name))
        if not status.success or not call_cmd(
                "systemctl start ceph-mon@{}  ".format(current_node_name)):
            status.success = False

        if not status.success:
            status.failed_tasks.append(
                "Create ceph mon on {} returned error.".format(
                    current_node_name))
            return status

        logger.info("First monitor started successfully")

        # create local manager :
        call_cmd('/opt/petasan/scripts/create_mgr.py')

        logger.info("Starting to deploy remote monitors")

        # call_cmd("ceph-create-keys --cluster {} -i {}  ".format(cluster_name,current_node_name))
        # Nautilius copy bootstrap-osd ourselves
        if not os.path.exists("/var/lib/ceph/bootstrap-osd/"):
            os.makedirs("/var/lib/ceph/bootstrap-osd/")
            call_cmd(
                'ceph auth get client.bootstrap-osd > /var/lib/ceph/bootstrap-osd/ceph.keyring'
            )

        for remote_mon in remote_mons_management_ips:
            ssh_obj = ssh()
            if not ssh_obj.copy_file_to_host(
                    remote_mon, "{}".format(ceph_client_admin_keyring)):
                logger.error("Cannot copy {} to {}".format(
                    ceph_client_admin_keyring, remote_mon))
                status.success = False
            elif not ssh_obj.copy_file_to_host(
                    remote_mon, "/etc/ceph/{}.conf".format(cluster_name)):
                logger.error("Cannot copy ceph.conf to {}".format(remote_mon))
                status.success = False
            elif not ssh_obj.call_command(
                    remote_mon, " python {} ".format(
                        config_api.get_node_create_mon_script_path())):
                logger.error("Cannot create monitor on remote node {}".format(
                    remote_mon))
                status.success = False

            # Nautilius copy bootstrap-osd ourselves :
            elif not ssh_obj.call_command(
                    remote_mon, 'mkdir -p /var/lib/ceph/bootstrap-osd'):
                logger.error(
                    "Cannot create bootstrap-osd dir on remote node {}".format(
                        remote_mon))
                status.success = False
            elif not ssh_obj.copy_file_to_host(
                    remote_mon, '/var/lib/ceph/bootstrap-osd/ceph.keyring'):
                logger.error("Cannot copy bootstrap-osd keyring to {}".format(
                    remote_mon))
                status.success = False

            if not status.success:
                status.failed_tasks.append(
                    "core_cluster_deploy_monitor_create_err" + "%" +
                    remote_mon)
                return status
        if not __test_mons():
            status.success = False
            status.failed_tasks.append("core_cluster_deploy_monitors_down_err")
            return status

        # Nautilius enable msgr2 :
        call_cmd('ceph mon enable-msgr2')

    except Exception as ex:
        status.success = False
        logger.exception(ex.message)
        status.failed_tasks.append(
            "core_cluster_deploy_monitor_exception_occurred" + "%" +
            current_node_name)
        return status

    status.success = True
    return status

from PetaSAN.core.cluster.configuration import configuration
import os
from PetaSAN.core.common.log import logger
from PetaSAN.core.config.api import ConfigAPI



cluster_name = configuration().get_cluster_name()
node_info = configuration().get_node_info()
node_name = node_info.name
nodes = configuration().get_management_nodes_config()
collected_path = ConfigAPI().get_collect_state_dir()+node_name


if not os.path.exists("{}".format(ConfigAPI().get_collect_state_dir())):
    os.system("mkdir {}".format(ConfigAPI().get_collect_state_dir()))
if os.path.exists(collected_path):
    os.system("rm -rf {}".format(collected_path))
if os.path.exists("{}.tar".format(collected_path)):
    os.system("rm -rf {}.tar".format(collected_path))
os.mkdir("{}".format(collected_path))


try:
    for node in nodes:
        if node.name == node_info.name:
            continue
    def get_disk_list(self):
        file_path = ConfigAPI().get_manage_node_disk_script()
        cmd = "python {} disk-list -pid 1".format(file_path)
        out, err = exec_command(cmd)
        data = json.loads(out)
        return data
Example #10
def new():
    conf = configuration()
    current_node_name = conf.get_node_info().name
    clu = conf.get_cluster_info()

    logger.info('Creating new cluster named %s', clu.name)
    cfg = CephConf()
    cfg.add_section('global')

    fsid = uuid.uuid4()
    cfg.set('global', 'fsid', str(fsid))


    # if networks were passed in, lets set them in the
    # global section

    cfg.set('global', 'public network', str(clu.backend_1_base_ip)+"/"+get_net_size(str(clu.backend_1_mask)))

    cfg.set('global', 'cluster network', str(clu.backend_2_base_ip)+"/"+get_net_size(str(clu.backend_2_mask)))

    mon_initial_members = []
    mon_host = []



    config_api = ConfigAPI()
    for i in clu.management_nodes:
        node_info=NodeInfo()
        node_info.load_json(json.dumps(i))
        mon_initial_members.append(node_info.name)
        mon_host.append(node_info.backend_1_ip)

    cfg.set('global', 'mon initial members', ', '.join(mon_initial_members))
    # no spaces here, see http://tracker.newdream.net/issues/3145
    cfg.set('global', 'mon host', ','.join(mon_host))

    # override undesirable defaults, needed until bobtail

    # http://tracker.ceph.com/issues/6788
    cfg.set('global', 'auth cluster required', 'cephx')
    cfg.set('global', 'auth service required', 'cephx')
    cfg.set('global', 'auth client required', 'cephx')

    cfg.set('global', 'mon clock drift allowed', '.300')
    cfg.set('global', 'osd pool default size', '2')
    cfg.set('global', 'max open files', '131072')

    # http://tracker.newdream.net/issues/3138
    cfg.set('global', 'filestore xattr use omap', 'true')

    path = '{name}.conf'.format(
        name=clu.name,
        )

    new_mon_keyring(clu.name)

    logger.info('Writing initial config to %s...', path)
    tmp = '%s.tmp' % path
    with file(tmp, 'w') as f:
        cfg.write(f)
    try:
        os.rename(tmp, path)
    except OSError as e:
        raise
Example #11
    def update_neighbors_arp(self, ip, eth):
        eth_name = self.get_eth_name(ip)
        if eth_name is not None and "." in eth_name:
            eth = eth_name
        call_cmd("python " + ConfigAPI().get_arping_script_path() +
                 " -ip {} -eth {} &".format(ip, eth))
    def add_osd(self,
                node_name,
                disk_name,
                journal=None,
                cache=None,
                cache_type="disabled"):
        """
        :param node_name:
        :param disk_name:
        :param journal:
        :param cache:
        :param cache_type:
        :return: the pid number; the pid is used to track any error message that occurs.
        If it returns -1, this means: core_manage_node_add_osd_err
        """
        # The journal value will be:
        #   -   None : if no journal exists,
        #   -   disk_name : if the user selected a journal, or
        #   -   auto : if the user did not select a journal

        ssh_obj = ssh()
        cmd = ""
        # ---------------------------------------------------------------------- #
        if journal:
            if journal != "auto":
                if not self.is_journal_space_avail(node_name,
                                                   str(journal).lower()):
                    raise DiskException(
                        DiskException.JOURNAL_NO_SPACE,
                        'There is no disk space for a new OSD with journal.')

            if not self.has_valid_journal(node_name):
                raise DiskException(
                    DiskException.JOURNALS_NO_SPACE,
                    'There is no disk space for a new OSD with all existing journals.'
                )
        # ---------------------------------------------------------------------- #
        if cache:
            if cache != 'auto':
                if not self.is_cache_partition_avail(node_name,
                                                     str(cache).lower()):
                    raise DiskException(
                        DiskException.CACHE_NO_SPACE,
                        'There is no disk space for a new OSD with cache.')

            if not self.has_valid_cache(node_name):
                raise DiskException(
                    DiskException.CACHE_NO_SPACE,
                    'There is no disk space for a new OSD with all existing caches.'
                )
        # ---------------------------------------------------------------------- #

        # Adding OSD with Journal & Cache :
        # =================================
        if journal and cache and cache_type != "disabled":
            cmd = "python {} -disk_name {} -journal {} -cache {} -cache_type {}".format(
                ConfigAPI().get_admin_add_osd_job_script(), disk_name,
                str(journal).lower(),
                str(cache).lower(), str(cache_type))

        # Adding OSD with Journal :
        # =========================
        elif journal:
            cmd = "python {} -disk_name {} -journal {}".format(
                ConfigAPI().get_admin_add_osd_job_script(), disk_name,
                str(journal).lower())

        # Adding OSD with Cache :
        # =======================
        elif cache and cache_type != "disabled":
            cmd = "python {} -disk_name {} -cache {} -cache_type {}".format(
                ConfigAPI().get_admin_add_osd_job_script(), disk_name,
                str(cache).lower(), str(cache_type))

        # Adding OSD :
        # ============
        elif journal is None and cache is None:
            cmd = "python {} -disk_name {}".format(
                ConfigAPI().get_admin_add_osd_job_script(), disk_name)

        # stdout, stderr = exec_command(cmd)
        stdout, stderr = ssh_obj.exec_command(node_name, cmd)
        logger.info("Start add osd job {} ".format(stdout))

        return stdout
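
A minimal usage sketch for the add_osd method above. The enclosing class is not shown in this snippet, so ManageNode below is only a placeholder name, and the node and disk names are made-up values; the sketch just illustrates how the returned pid is checked against the -1 error convention described in the docstring.

from PetaSAN.core.common.log import logger

# Hypothetical usage; ManageNode stands in for whatever PetaSAN class defines add_osd.
manager = ManageNode()
pid = manager.add_osd(node_name="ps-node-01", disk_name="sdb", journal="auto")
if str(pid).strip() == "-1":
    # per the docstring, -1 maps to core_manage_node_add_osd_err
    logger.error("add_osd job could not be started")
else:
    logger.info("add_osd job started, tracking pid {}".format(pid))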

from PetaSAN.core.consul.ps_consul import RetryConsulException
from datetime import date, datetime
from requests import ConnectionError
from time import sleep
from consul import ConsulException
from PetaSAN.core.common.log import logger

from uuid import uuid1, uuid4
from flask.sessions import SessionInterface, SessionMixin
from PetaSAN.core.consul.base import BaseAPI
from PetaSAN.core.config.api import ConfigAPI
consul_session_key = ConfigAPI().get_consul_session_path()


class ConsulSession(SessionMixin):
    """Server-side session implementation.
    """
    def __init__(self, sid, *args, **kwargs):
        self.sid = sid
        self.get_all_sessions()
        self.permanent = True
        pass

    def __getitem__(self, key):
        self.get_all_sessions()
        self.__dict__[key]
        return self.__dict__[key]
Example #14
class JobType:
    ADDDISK = "adddisk"
    DELETEOSD = "deleteosd"
    CLIENTSTRESS = "client_stress"
    STORAGELOAD = "storage_load"
    BENCHMANAGER = "bench_manager"
    ADDJOURNAL = "addjournal"
    DELETEJOURNAL = "deletejournal"
    DELETE_POOL = 'delete_pool'
    DELETE_DISK = 'delete_disk'
    ADDCACHE = "addcache"
    DELETECACHE = "deletecache"
    TEST = "test"


job_scripts = {JobType.ADDDISK: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "add-osd"),
               JobType.DELETEOSD: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "delete-osd"),
               JobType.CLIENTSTRESS: " {} {}".format(ConfigAPI().get_benchmark_script_path(), "client"),
               JobType.STORAGELOAD: " {} {}".format(ConfigAPI().get_benchmark_script_path(), "storage"),
               JobType.BENCHMANAGER: " {} {}".format(ConfigAPI().get_benchmark_script_path(), "manager"),
               JobType.ADDJOURNAL: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "add-journal"),
               JobType.DELETEJOURNAL: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "delete-journal"),
               JobType.DELETE_POOL: ConfigAPI().get_delete_pool_scipt(),
               JobType.DELETE_DISK: ConfigAPI().get_delete_disk_scipt(),
               JobType.ADDCACHE: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "add-cache"),
               JobType.DELETECACHE: "{} {}".format(ConfigAPI().get_admin_manage_node_script(), "delete-cache"),

               JobType.TEST: '/opt/petasan/scripts/test.sh -arg1 -arg2 '}
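
A small illustrative sketch of how the job_scripts mapping might be turned into a runnable command line. The build_job_command helper and the parameter string below are hypothetical (this snippet does not show how PetaSAN's JobManager actually consumes the mapping); only the dictionary lookup is taken from the code above.

# Hypothetical helper, for illustration only: look up the script registered for a
# job type and append caller-supplied parameters.
def build_job_command(job_type, params=""):
    script = job_scripts[job_type]      # e.g. "<manage_node_script> add-osd"
    return "python {} {}".format(script, params).strip()


# Example call with made-up parameters:
cmd = build_job_command(JobType.ADDDISK, "-disk_name sdb -journal auto")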


class Job(object):
def clean_ceph_local():
    config_api = ConfigAPI()
    call_cmd(" python {} ".format(config_api.get_node_clean_script_path()))
class CrushMap:

    CRUSH_SAVE_PATH = ConfigAPI().get_crush_save_path()

    def __init__(self):

        self.types = [-1 for i in range(20)]
        self.buckets = []
        self.rules = {}
        self.device_class = {}
        self.device_weight = {}

        self.lines_tunables = []
        self.lines_devices = []
        self.lines_types = []
        self.lines_buckets = []
        self.lines_rules = []

    def _decode_types(self):
        max_index = 0
        for line in self.lines_types:
            if line.endswith('type'):
                continue
            tokens = line.split()
            self.types[int(tokens[1])] = tokens[2]
            if int(tokens[1]) > max_index:
                max_index = int(tokens[1])
        self.types = self.types[:max_index + 1]

    def _decode_buckets(self):
        bucket = None
        for line in self.lines_buckets:
            if line.endswith('{'):
                bucket = {}
                bucket['items'] = []
                bucket['class_ids'] = {}
                tokens = line.split()
                type = tokens[0]
                type_id = self.types.index(type)
                bucket['type_id'] = type_id
                bucket['name'] = tokens[1]

            elif line.startswith('alg'):
                tokens = line.split()
                bucket['alg'] = tokens[1]

            elif line.startswith('hash'):
                tokens = line.split()
                bucket['hash'] = int(tokens[1])

            elif line.startswith('item'):
                tokens = line.split()

                if len(tokens) == 4 and tokens[1].startswith('osd.'):
                    # osd, append class and weight info
                    self.device_weight[tokens[1]] = tokens[3]

                    if self.device_class.has_key(tokens[1]):
                        item = tokens[1] + '#' + self.device_class[
                            tokens[1]] + '#' + self.device_weight[tokens[1]]
                    else:
                        item = tokens[
                            1] + '#' + 'class not defined' + '#' + self.device_weight[
                                tokens[1]]

                    bucket['items'].append(item)
                else:
                    bucket['items'].append(tokens[1])

            elif line.startswith('id'):
                tokens = line.split()
                id = int(tokens[1])
                if tokens[2] == 'class':
                    bucket['class_ids'][tokens[3]] = id
                else:
                    bucket['id'] = id

                bucket['hash'] = int(tokens[1])

            elif line.startswith('}'):
                self.buckets.append(bucket)

        #print(self.buckets)

    def _decode_rules(self):

        name = None
        body = None
        for line in self.lines_rules:
            if line.startswith('rule'):
                tokens = line.split()
                name = tokens[1]
                body = '{\n'
                continue

            if line.startswith('}'):
                body += '}'
                self.rules[name] = body
                continue

            body += line + '\n'

        #print self.rules

    def _decode_device_class(self):
        for line in self.lines_devices:
            if not line.startswith('device'):
                continue
            tokens = line.split()

            if len(tokens) != 5:
                continue

            if not tokens[2].startswith('osd') or not tokens[3].startswith(
                    'class'):
                continue
            self.device_class[tokens[2]] = tokens[4]

    def _get_rule_ids(self):
        ids = []
        for rule in self.rules:
            body = self.rules[rule]
            id = self._get_rule_id(body)
            if id:
                ids.append(id)
        return ids

    def _get_rule_id(self, body):
        lines = body.splitlines()
        for line in lines:
            if line.startswith('id'):
                tokens = line.split()
                if len(tokens) < 2:
                    continue
                return tokens[1]
        return None

    def _get_rule_class(self, body):
        lines = body.splitlines()
        for line in lines:
            line = line.strip()
            if line.startswith('#'):
                continue
            if 'step' in line and 'take' in line and 'class' in line:
                tokens = line.split()
                index = tokens.index('class')
                if index + 1 < len(tokens):
                    return tokens[index + 1]
        return None

    def _encode_buckets(self):
        self.lines_buckets = []
        bucket_names = set()

        for bucket in self.buckets:

            # duplicate name check
            if bucket['name'] in bucket_names:
                logger.error('Crush duplicate bucket name:' + bucket['name'])
                raise CrushException(CrushException.DUPLICATE_BUCKET_NAME,
                                     'Duplicate bucket name')
            bucket_names.add(bucket['name'])

            type = self.types[bucket['type_id']]
            self.lines_buckets.append(type + ' ' + bucket['name'] + ' {')
            self.lines_buckets.append('id ' + str(bucket['id']))

            if bucket.has_key('class_ids'):
                for c in bucket['class_ids']:
                    self.lines_buckets.append('id ' +
                                              str(bucket['class_ids'][c]) +
                                              ' class ' + c)

            self.lines_buckets.append('alg ' + bucket['alg'])
            self.lines_buckets.append('hash ' + str(bucket['hash']))

            if 'items' in bucket:
                for item in bucket['items']:
                    if item.startswith('osd'):
                        # osd
                        tokens = item.split('#')
                        osd_item = 'item ' + tokens[0]
                        if tokens[0] in self.device_weight:
                            osd_item += ' weight ' + self.device_weight[
                                tokens[0]]
                        self.lines_buckets.append(osd_item)
                    else:
                        self.lines_buckets.append('item ' + item)

            self.lines_buckets.append('}')

    def _encode_rules(self):
        self.lines_rules = []
        for name in self.rules:
            self.lines_rules.append('rule ' + name + ' ')
            body = self.rules[name]
            self.lines_rules += body.splitlines()

    def _get_rand_string(self, n):
        return ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(n))

    def _read_file_lines(self, backup=False):
        # Get which ceph user is using this function & get his keyring file path #
        ceph_auth = CephAuthenticator()

        call_cmd('mkdir -p ' + self.CRUSH_SAVE_PATH)
        cluster_name = configuration().get_cluster_name()

        rand = self._get_rand_string(6)
        bin_file = self.CRUSH_SAVE_PATH + 'crushmap-tmp-' + rand + '.bin'
        txt_file = self.CRUSH_SAVE_PATH + 'crushmap-tmp-' + rand + '.txt'

        cmd = 'ceph osd getcrushmap -o ' + bin_file + ' ' + ceph_auth.get_authentication_string(
        ) + ' --cluster ' + cluster_name
        ret, stdout, stderr = exec_command_ex(cmd)
        if ret != 0:
            if stderr and ('Connection timed out' in stderr
                           or 'error connecting' in stderr):
                logger.error('Error in Ceph Connection cmd:' + cmd)
                raise CephException(CephException.CONNECTION_TIMEOUT,
                                    'Connection Timeout Error')

            logger.error('General error in Ceph cmd:' + cmd + ' error:' +
                         stderr)
            raise CephException(CephException.GENERAL_EXCEPTION,
                                'General Ceph Error')

        cmd = 'crushtool -d ' + bin_file + ' -o ' + txt_file
        if not call_cmd(cmd):
            raise CrushException(CrushException.DECOMPILE,
                                 'Crush Decompile Error')

        with open(txt_file, 'r') as f:
            lines = f.readlines()
        lines = [line.strip() for line in lines]

        section = 'start'
        # for section tags see src/crush/CrushCompiler.cc decompile

        for line in lines:
            if len(line) == 0:
                continue
            if line.startswith('# begin crush map'):
                section = 'tunables'
                continue
            elif line.startswith('# devices'):
                section = 'devices'
                continue
            elif line.startswith('# types'):
                section = 'types'
                continue
            elif line.startswith('# buckets'):
                section = 'buckets'
                continue
            elif line.startswith('# rules'):
                section = 'rules'
                continue

            elif line.startswith('# choose_args'):
                section = 'end'
                break
            elif line.startswith('# end crush map'):
                section = 'end'
                break

            if section == 'tunables':
                self.lines_tunables.append(line)
            elif section == 'devices':
                self.lines_devices.append(line)
            elif section == 'types':
                self.lines_types.append(line)
            elif section == 'buckets':
                self.lines_buckets.append(line)
            elif section == 'rules':
                self.lines_rules.append(line)

        if backup:
            self._backup(txt_file)

        call_cmd('rm ' + txt_file)
        call_cmd('rm ' + bin_file)

    def _backup(self, crush_file):

        stamp = datetime.now().strftime('%Y%m%d-%H:%M:%S')
        backup_name = 'crushmap-' + stamp + '.txt'
        backup_path = self.CRUSH_SAVE_PATH + backup_name
        # backup on filesystem
        cmd = 'cp ' + crush_file + ' ' + backup_path
        call_cmd(cmd)
        # backup to consul
        cmd = 'consul kv put PetaSAN/crush/' + backup_name + ' @' + backup_path
        call_cmd(cmd)

    def _get_backup_file_name(self):
        t = datetime.now().strftime('%Y%m%d-%H:%M:%S')
        return self.CRUSH_SAVE_PATH + 'crushmap-' + t + '.txt'

    def _write_file_lines(self):

        rand = self._get_rand_string(6)
        bin_file = self.CRUSH_SAVE_PATH + 'crushmap-tmp-' + rand + '.bin'
        txt_file = self.CRUSH_SAVE_PATH + 'crushmap-tmp-' + rand + '.txt'

        with open(txt_file, 'w') as f:

            for line in self.lines_tunables:
                f.writelines(line + '\n')
            f.writelines('\n')
            for line in self.lines_devices:
                f.writelines(line + '\n')
            f.writelines('\n')
            for line in self.lines_types:
                f.writelines(line + '\n')
            f.writelines('\n')
            for line in self.lines_buckets:
                f.writelines(line + '\n')
            f.writelines('\n')
            for line in self.lines_rules:
                f.writelines(line + '\n')
            f.writelines('\n')

        cmd = 'crushtool -c ' + txt_file + ' -o ' + bin_file
        if not call_cmd(cmd):
            raise CrushException(CrushException.COMPILE, 'Crush Compile Error')

        cluster_name = configuration().get_cluster_name()
        cmd = 'ceph osd setcrushmap -i ' + bin_file + ' --cluster ' + cluster_name
        ret, stdout, stderr = exec_command_ex(cmd)
        if ret != 0:
            if stderr and ('Connection timed out' in stderr
                           or 'error connecting' in stderr):
                logger.error('Error in Ceph Connection cmd:' + cmd)
                raise CephException(CephException.CONNECTION_TIMEOUT,
                                    'Connection Timeout Error')

            logger.error('General error in Ceph cmd:' + cmd + ' error:' +
                         stderr)
            raise CephException(CephException.GENERAL_EXCEPTION,
                                'General Ceph Error')

        call_cmd('rm ' + txt_file)
        call_cmd('rm ' + bin_file)

    def read(self, backup=False):

        self._read_file_lines(backup)
        self._decode_device_class()
        self._decode_types()
        self._decode_buckets()
        self._decode_rules()

    def write(self):
        self._encode_rules()
        self._encode_buckets()
        self._write_file_lines()

    def get_bucket_types(self):
        return self.types

    def get_buckets(self):
        return self.buckets

    def set_buckets(self, buckets):
        self.buckets = buckets

    def get_rules(self):
        return self.rules

    def add_rule(self, name, body):
        if self.rules.has_key(name):
            logger.error('add rule error, rule name ' + name +
                         ' already exists')
            raise CrushException(CrushException.DUPLICATE_RULE_NAME,
                                 'Duplicate rule name')

        ids = self._get_rule_ids()
        id = self._get_rule_id(body)
        if id in ids:
            logger.error('add rule error, rule id ' + id + ' already exists')
            raise CrushException(CrushException.DUPLICATE_RULE_ID,
                                 'Duplicate rule id')

        dev_class = self._get_rule_class(body)
        if dev_class:
            if dev_class not in self.device_class.values():
                logger.error('add rule error, device class ' + dev_class +
                             ' does not exist')
                raise CrushException(CrushException.DEVICE_TYPE_NOT_EXISTS,
                                     'Device type does not exist')

        self.rules[name] = body

    def update_rule(self, name, body):
        if not self.rules.has_key(name):
            logger.error('edit rule error, rule name ' + name + ' not found')
            raise CrushException(CrushException.RULE_NOT_FOUND,
                                 'Rule not found')

        id_old = self._get_rule_id(self.rules[name])
        id_new = self._get_rule_id(body)

        if id_new != id_old:
            ids = self._get_rule_ids()
            if id_new in ids:
                logger.error('update rule error, rule id ' + id_new +
                             ' already exists')
                raise CrushException(CrushException.DUPLICATE_RULE_ID,
                                     'Duplicate rule id')

        self.rules[name] = body

    # def get_next_rule_id(self):
    #     next = 0
    #     ids = self._get_rule_ids()
    #     for id in ids :
    #         if next < int(id) :
    #             next = int(id)
    #     return str(next+1)

    def get_next_rule_id(self):
        ids = self._get_rule_ids()
        next_id = 0
        ids_set = set(ids)
        while True:
            if str(next_id) not in ids_set:
                return str(next_id)
            next_id += 1

    def delete_rule(self, name):
        if not self.rules.has_key(name):
            logger.error('delete rule error, rule name ' + name + ' not found')
            raise CrushException(CrushException.RULE_NOT_FOUND,
                                 'Rule not found')
        del self.rules[name]
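
A short usage sketch for the CrushMap class above, based only on the methods shown here (read, get_next_rule_id, add_rule, write). The rule name and rule body are made-up values and are not validated against crushtool in this sketch.

# Hedged usage sketch: read the live crush map, add a simple replicated rule,
# then compile and inject the updated map back into the cluster.
crush = CrushMap()
crush.read(backup=True)          # decompiles the current map and keeps a backup

rule_body = ('{\n'
             'id ' + crush.get_next_rule_id() + '\n'
             'type replicated\n'
             'step take default\n'
             'step chooseleaf firstn 0 type host\n'
             'step emit\n'
             '}')
crush.add_rule('by_host_example', rule_body)
crush.write()                    # crushtool -c ... then ceph osd setcrushmap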
        parser.add_argument('-id',
                            help='Disk id such as 00006.',
                            required=True,
                            type=str)
        parser.add_argument('-ip',
                            help='IP address of path.',
                            required=True,
                            type=str)

        args = parser.parse_args()

        return args


__app_conf = ConfigAPI()
__node_info = configuration().get_node_info()
__ceph_api = CephAPI()
__disk_id = ''
__ip = ''
__network = NetworkAPI()
__consul_api = ConsulAPI()
__session = None


def main_catch(func, args):
    try:
        func(args)

    except Exception as e:
        logger.error(e.message)
Example #18
    def run(self):

        cmd = "python {} server &".format(
            ConfigAPI().get_assignment_script_path())
        call_cmd(cmd)
#!/usr/bin/python
'''
 Copyright (C) 2019 Maged Mokhtar <mmokhtar <at> petasan.org>
 Copyright (C) 2019 PetaSAN www.petasan.org


 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU Affero General Public License
 as published by the Free Software Foundation

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU Affero General Public License for more details.
'''

from PetaSAN.core.common.cmd import kill_by_file_name
from PetaSAN.core.config.api import ConfigAPI

kill_by_file_name(ConfigAPI().get_sync_file_service())
kill_by_file_name(ConfigAPI().get_petasan_service())
kill_by_file_name(ConfigAPI().get_management_service())
Example #20
class MangePathAssignment(object):
    def __init__(self):
        self.__app_conf = ConfigAPI()
        self.__context = AssignmentContext()
        self.__session_dict = ConsulAPI().get_sessions_dict(
            ConfigAPI().get_iscsi_service_session_name())
        self.__node_session_dict = dict()
        pass

    def get_assignments_stats(self):
        return self._filter_assignments_stats()

    def search_by_disk_name(self, disk_name):
        return self._filter_assignments_stats(filter_type=1,
                                              filter_text=disk_name)

    def search_by_ip(self, ip):
        return self._filter_assignments_stats(filter_type=2, filter_text=ip)

    def _filter_assignments_stats(self,
                                  filter_type=0,
                                  filter_text=None,
                                  set_session=False):

        __disk_consul_stopped = set()
        running_paths = dict()
        ceph_api = CephAPI()
        consul_api = ConsulAPI()
        disk_kvs = consul_api.get_disk_kvs()

        # Step 1 get all running paths.
        for consul_kv_obj in disk_kvs:
            path_key = str(consul_kv_obj.Key).replace(
                self.__app_conf.get_consul_disks_path(), "")
            disk_id = str(path_key).split('/')[0]
            if disk_id in __disk_consul_stopped:
                continue
            if consul_kv_obj.Value == "disk":
                disk_id = str(path_key).split('/')[0]

                # Step 2 avoid stopping disks
                if str(consul_kv_obj.Flags) == "1":
                    __disk_consul_stopped.add(disk_id)
                continue

            running_paths[path_key] = consul_kv_obj

        if len(running_paths) == 0:
            return AssignmentStats()

        # Step 3 get all images metadata
        images = ceph_api.get_disks_meta()

        assignment_stats = AssignmentStats()

        # Step 4 get current reassignments
        current_running_assignments = self.get_current_reassignment()
        if current_running_assignments is not None:
            assignment_stats.is_reassign_busy = True
            filter_type = 0  # stop any filtering and get all data if a reassignment is running

        # Step 5 fill paths assignment info
        for path_key, consul_kv_obj in running_paths.iteritems():
            disk_id = str(path_key).split('/')[0]
            disk = next((img for img in images if img.id == disk_id), None)
            if disk is None:
                continue
            disk_path = Path()

            path_index = int(str(path_key).split(disk_id + "/")[1])
            path_str = disk.paths[path_index - 1]
            disk_path.load_json(json.dumps(path_str))

            path_assignment_info = PathAssignmentInfo()
            path_assignment_info.interface = disk_path.eth
            if disk_path.vlan_id:
                path_assignment_info.interface = disk_path.eth + "." + disk_path.vlan_id
            path_assignment_info.ip = disk_path.ip
            path_assignment_info.disk_name = disk.disk_name
            path_assignment_info.disk_id = disk_id
            path_assignment_info.index = path_index
            current_path = None
            if current_running_assignments is not None:
                current_path = current_running_assignments.get(disk_path.ip)
            if hasattr(consul_kv_obj,
                       "Session") and self.__session_dict.has_key(
                           consul_kv_obj.Session):
                # Fill status and node name for started paths
                path_assignment_info.node = self.__session_dict.get(
                    consul_kv_obj.Session).Node

                if current_running_assignments is not None:

                    if current_path is not None and current_path.status != -1:
                        path_assignment_info.status = current_path.status
                        path_assignment_info.target_node = current_path.target_node
                        if set_session:
                            # session refers to the node that locked this path assignment. This property helps to know
                            # the status of the path and which node will handle it.
                            path_assignment_info.session = current_path.session
            elif current_path:
                path_assignment_info.node = current_path.node
                path_assignment_info.target_node = current_path.target_node
                path_assignment_info.status = current_path.status
                if set_session:
                    path_assignment_info.session = current_path.session

            # Step 6 search or get all
            if filter_type == 1 and filter_text is not None and len(
                    str(filter_text).strip()) > 0:  # by disk name
                if filter_text.strip().lower(
                ) in path_assignment_info.disk_name.lower():
                    assignment_stats.paths.append(path_assignment_info)
            elif filter_type == 2 and filter_text is not None and len(
                    str(filter_text).strip()) > 0:  # by ip
                if filter_text.strip() == path_assignment_info.ip.strip():
                    assignment_stats.paths.append(path_assignment_info)
                    break
            else:
                assignment_stats.paths.append(path_assignment_info)

            # Step 7 set all online nodes
        assignment_stats.nodes = self._get_nodes()

        return assignment_stats

    def get_current_reassignment(self):
        paths = ConsulAPI().get_assignments()
        if paths is not None:
            for ip, path_assignment_info in paths.iteritems():
                if not hasattr(path_assignment_info, "session"):
                    logger.info("Path {} not locked by node.".format(
                        path_assignment_info.ip))
                if not hasattr(
                        path_assignment_info,
                        "session") and path_assignment_info.status not in [
                            ReassignPathStatus.succeeded,
                            ReassignPathStatus.failed
                        ]:
                    path_assignment_info.status = ReassignPathStatus.failed
        return paths

    def set_new_assignments(self, paths_assignment_info):
        logger.info("Set new assignment.")
        if self.get_current_reassignment() is not None:
            raise Exception("There is already running assignment.")

        config_api = ConfigAPI()
        consul_api = ConsulAPI()
        logger.info("Delete old assignments.")
        consul_api.delete_assignments()
        session = consul_api.get_new_session_ID(
            config_api.get_assignment_session_name(),
            configuration().get_node_name(), True)
        if consul_api.lock_key(config_api.get_consul_assignment_path(),
                               session, "root"):
            logger.info("Lock assignment root.")
            for path_assignment_info in paths_assignment_info:
                path_assignment_info.status = ReassignPathStatus.pending
                consul_api.set_path_assignment(
                    path_assignment_info,
                    self._get_node_session(path_assignment_info.target_node))
                logger.info(
                    "New assignment for {} ,disk {}, from node {}  and to node {} with status {}"
                    .format(path_assignment_info.ip,
                            path_assignment_info.disk_id,
                            path_assignment_info.node,
                            path_assignment_info.target_node,
                            path_assignment_info.status))
        else:
            logger.error("Can't lock paths assignment key.")
            raise Exception("Can't lock paths assignment key.")

    def run(self):

        cmd = "python {} server &".format(
            ConfigAPI().get_assignment_script_path())
        call_cmd(cmd)

    def _get_nodes(self):
        consul_api = ConsulAPI()
        # Get all PetaSAN nodes[management or storage].
        node_list = consul_api.get_node_list()
        # Get online nodes from consul.
        consul_members = consul_api.get_consul_members()
        petasan_node_list = []
        for i in node_list:
            if not i.is_iscsi:
                continue
            if i.name in consul_members:
                petasan_node_list.append(i.name)

        return petasan_node_list

    def remove_assignment(self):
        consul_api = ConsulAPI()
        if consul_api.get_assignments() is not None:
            consul_api.delete_assignments()

    def auto(self, type=1):
        logger.info("User start auto reassignment paths.")
        assignments_stats = self.get_assignments_stats()
        if assignments_stats.is_reassign_busy:
            logger.error("There is already reassignment running.")
            raise Exception("There is already reassignment running.")

        ConsulAPI().drop_all_node_sessions(
            self.__app_conf.get_consul_assignment_path(),
            configuration().get_node_name())
        sleep(3)

        assignments_stats.paths = [
            path for path in assignments_stats.paths
            if len(path.node.strip()) > 0 and path.status == -1
        ]
        self.__context.paths = assignments_stats.paths
        self.__context.nodes = assignments_stats.nodes
        for plugin in self._get_new_plugins_instances(auto_plugins):
            if plugin.is_enable() and plugin.get_plugin_id() == type:
                paths_assignments = plugin.get_new_assignments()
                if len(paths_assignments) == 0:
                    logger.info("There is no node under average.")
                    return
                self.set_new_assignments(paths_assignments)
                break
        self.run()

    def manual(self, paths_assignment_info, assign_to="auto"):

        assignments_stats = self.get_assignments_stats()
        if assignments_stats.is_reassign_busy:
            logger.error("There is already reassignment running.")
            raise Exception("There is already reassignment running.")
        ConsulAPI().drop_all_node_sessions(
            self.__app_conf.get_consul_assignment_path(),
            configuration().get_node_name())
        sleep(3)  # Wait to be sure the session dropped
        if assign_to == "auto":

            logger.info(
                "User started auto reassignment for the selected paths.")
            assignments_stats.paths = [
                path for path in assignments_stats.paths
                if len(path.node.strip()) > 0 and path.status == -1
            ]
            self.__context.paths = assignments_stats.paths
            self.__context.nodes = assignments_stats.nodes
            self.__context.user_input_paths = paths_assignment_info
            for plugin in self._get_new_plugins_instances(auto_plugins):
                if plugin.is_enable() and plugin.get_plugin_id() == 1:
                    paths_assignments = plugin.get_new_assignments()
                    self.set_new_assignments(paths_assignments)
                    logger.info(
                        "User started auto reassignment for the selected paths."
                    )
                    self.run()
                    break
        else:

            for path_assignment_info in paths_assignment_info:
                path_assignment_info.target_node = assign_to
                path_assignment_info.status = ReassignPathStatus.pending
            logger.info(
                "User started manual reassignment for the selected paths.")
            self.set_new_assignments(paths_assignment_info)

            self.run()

    def process(self):
        logger.info("Start process reassignments paths.")
        max_retry = 100
        current_reassignments = self.get_current_reassignment()
        config = configuration()
        assignment_script_path = ConfigAPI().get_assignment_script_path()
        if current_reassignments is None:
            return
        for ip, path_assignment_info in current_reassignments.iteritems():
            logger.info("process path {} and its status is {}".format(
                ip, path_assignment_info.status))
            if path_assignment_info.status == ReassignPathStatus.pending:
                logger.info(
                    "Move action,try clean disk {} path {} remotely on node {}."
                    .format(path_assignment_info.disk_name,
                            path_assignment_info.disk_id,
                            path_assignment_info.node))

                status = False
                try:

                    cmd = "python {} path_host -ip {} -disk_id {}".format(
                        assignment_script_path, path_assignment_info.ip,
                        path_assignment_info.disk_id)
                    out, err = ssh().exec_command(path_assignment_info.node,
                                                  cmd)
                    logger.info(cmd)
                    # self.clean_source_node(path_assignment_info.ip,path_assignment_info.disk_id)
                except Exception as ex:
                    logger.exception(ex.message)
                    out = ""

                if str(out).strip() == "0":
                    logger.info("Move action passed")
                    status = True

                current_path_assignment_info = None
                if status:
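                    # Poll consul until the path leaves the 'moving' state or the retry budget
                    # (max_retry * 0.25s, about 25 seconds) is exhausted.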
                    for i in xrange(0, max_retry):
                        logger.debug(
                            "Wait to update status of path {}.".format(
                                path_assignment_info.ip))
                        sleep(0.25)
                        reassignments = self.get_current_reassignment()
                        if reassignments:
                            current_path_assignment_info = reassignments.get(
                                path_assignment_info.ip)
                            if current_path_assignment_info and current_path_assignment_info.status == ReassignPathStatus.moving:
                                continue
                            else:
                                logger.info(
                                    "Process completed for path {} with status {}."
                                    .format(
                                        current_path_assignment_info.ip,
                                        current_path_assignment_info.status))
                                break
                    if current_path_assignment_info and current_path_assignment_info.status == ReassignPathStatus.moving:
                        # The path never left the 'moving' state within the retry window; mark it failed.
                        self.update_path(current_path_assignment_info.ip,
                                         ReassignPathStatus.failed)
                        logger.info(
                            "Move action: failed, disk {} path {} on node {}.".format(
                                path_assignment_info.disk_name,
                                path_assignment_info.disk_id,
                                path_assignment_info.node))

                else:
                    self.update_path(path_assignment_info.ip,
                                     ReassignPathStatus.failed)
                    logger.info(
                        "Move action: failed to clean disk {} path {} remotely on node {}."
                        .format(path_assignment_info.disk_name,
                                path_assignment_info.disk_id,
                                path_assignment_info.node))
        sleep(10)  # wait so the status can be displayed to the user if needed
        logger.info("Process completed.")
        self.remove_assignment()
        ConsulAPI().drop_all_node_sessions(
            self.__app_conf.get_consul_assignment_path(),
            config.get_node_name())

    def _clean_iscsi_config(self, disk_id, path_index, iqn):

        logger.debug("Move action ,start clean disk {} path {}.".format(
            disk_id, path_index))

        lio_api = LioAPI()

        try:

            # Get tpgs for iqn.
            tpgs = lio_api.get_iqns_with_enabled_tpgs().get(iqn, None)
            if not iqn or not tpgs or len(tpgs) == 0:
                logger.info("Move action: could not find IPs for %s " %
                            disk_id)
            # Remove the assigned ips from our interfaces
            else:
                # Get assigned ips for each path.
                for tpg, ips in tpgs.iteritems():
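                    # LIO TPG numbers are 1-based, while path_index is 0-based.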
                    if tpg == str(path_index + 1):
                        lio_api.disable_path(iqn, tpg)
                        logger.info(
                            "Move action,cleaned disk {} path {}.".format(
                                disk_id, path_index))
                        break
        except Exception as e:
            logger.error("Move action: could not clean disk path for %s: %s" %
                         (disk_id, e))
            return False
        logger.debug("Move action end clean disk {} path {}.".format(
            disk_id, path_index))
        return True

    def clean_source_node(self, ip, disk_id):
        if not self.update_path(ip, ReassignPathStatus.moving):
            return False

        # pool = CephAPI().get_pool_bydisk(disk_id)
        pool = self._get_pool_by_disk(disk_id)
        if not pool:
            logger.error('Could not find pool for disk ' + disk_id)
            return False

        disk = CephAPI().get_disk_meta(disk_id, pool)
        paths_list = disk.paths
        disk_path = None
        path_index = -1

        for i in xrange(0, len(paths_list)):
            path_str = paths_list[i]
            path = Path()
            path.load_json(json.dumps(path_str))
            if path.ip == ip:
                disk_path = path
                path_index = i
                break
        if disk_path:
            self._clean_iscsi_config(disk_id, path_index, disk.iqn)
            network = Network()
            NetworkAPI().delete_ip(disk_path.ip, disk_path.eth,
                                   disk_path.subnet_mask)
            if network.is_ip_configured(ip):
                logger.error(
                    "Move action: cannot clean network config for disk {} path {}."
                    .format(disk_id, path_index))
                self.update_path(ip, ReassignPathStatus.failed)
                return False
            logger.info(
                "Move action: cleaned network config for disk {} path {}.".
                format(disk_id, path_index))
            key = self.__app_conf.get_consul_disks_path(
            ) + disk_id + "/" + str(path_index + 1)
            consul_api = ConsulAPI()
            session = self._get_node_session(configuration().get_node_name())
            if ConsulAPI().is_path_locked_by_session(key, session):
                consul_api.release_disk_path(key, session, None)
                logger.info("Move action,release disk {} path {}.".format(
                    disk_id, path_index + 1))
        else:
            self.update_path(ip, ReassignPathStatus.failed)
            return False

        return True

    def update_path(self, ip, status):
        logger.info("Updating path  {} status to {} ".format(ip, status))
        current_reassignments = self.get_current_reassignment()
        if current_reassignments:
            path_assignment_info = current_reassignments.get(ip)
            if path_assignment_info:
                path_assignment_info.status = status
                if ConsulAPI().update_path_assignment(path_assignment_info):
                    logger.info("Path  {} status updated to {} ".format(
                        ip, status))
                    return True
        logger.info("Path  {} status failed to update status to {} ".format(
            ip, status))
        return False

    def _get_new_plugins_instances(self, modules):

        plugins = []
        for cls in modules:
            try:
                # import plugins module
                mod_obj = __import__(cls)
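                # __import__ on a dotted name returns the top-level package,
                # so walk the attribute chain to reach the target module.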
                for i in str(cls).split(".")[1:]:
                    mod_obj = getattr(mod_obj, i)
                # Find all plugins in module and create instances
                for mod_prop in dir(mod_obj):
                    # Ignore private
                    if not str(mod_prop).startswith("__"):
                        attr = getattr(mod_obj, mod_prop)
                        attr_str = str(attr)
                        attr_type_str = str(type(attr))
                        # A plugin class has metaclass ABCMeta, has 'Plugin' in its name and is not a Base class.
                        if attr_type_str.find(
                                "ABCMeta") > -1 and attr_str.find(
                                    "Base") == -1 and attr_str.find(
                                        "Plugin") > -1:
                            instance = attr(self.__context)
                            plugins.append(instance)
            except Exception as e:
                logger.error("Error loading plugin {}: {}".format(cls, e))
        return plugins

    def get_forced_paths(self):
        paths = None
        assignments = self._filter_assignments_stats(set_session=True)

        if not assignments.is_reassign_busy:
            return paths

        for path_assignment_info in assignments.paths:
            if path_assignment_info.status == ReassignPathStatus.moving and hasattr(
                    path_assignment_info, "session"):
                if paths is None:
                    paths = dict()
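                # Key forced paths by "<disk_id>/<index>" so callers can look them up directly.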
                paths[path_assignment_info.disk_id + "/" +
                      str(path_assignment_info.index)] = path_assignment_info

        return paths

    def _get_node_session(self, node_name):
        logger.info(self.__node_session_dict)
        if self.__session_dict:
            session = self.__node_session_dict.get(node_name)
            if session is not None:
                return session
            else:
                for sess, node in self.__session_dict.iteritems():
                    if node.Node == node_name:
                        # Cache by node name so later lookups by name succeed.
                        self.__node_session_dict[node_name] = sess
                        return sess

    def _get_pool_by_disk(self, disk_id):
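        # Prefer the pool recorded in consul; fall back to querying ceph directly.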
        consul_api = ConsulAPI()
        ceph_api = CephAPI()
        pool = consul_api.get_disk_pool(disk_id)
        if pool:
            logger.info('Found pool:{} for disk:{} via consul'.format(
                pool, disk_id))
            return pool
        pool = ceph_api.get_pool_bydisk(disk_id)
        if pool:
            logger.info('Found pool:{} for disk:{} via ceph'.format(
                pool, disk_id))
            return pool

        logger.error('Could not find pool for disk ' + disk_id)
        return None
Beispiel #21
0
    def build(self):
        try:
            self.__status_report = StatusReport()
            conf = configuration()

            if len(conf.get_cluster_info().management_nodes) == 0:
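                # ----------------------- First management node --------------------- #
                # ------------------------------------------------------------------ #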
                node_num = len(conf.get_cluster_info().management_nodes) + 1
                self.__status_report.nod_num = node_num
                NTPConf().setup_ntp_local()
                if conf.add_management_node() != Status().done:
                    self.__status_report.success = False
                    self.__status_report.failed_tasks.append(
                        "core_cluster_deploy_cant_add_node")

                logger.info(
                    "Node 1 added, cluster requires 2 other nodes to build.")
                self.run_post_deploy_script()
                return BuildStatus().OneManagementNode

            elif len(conf.get_cluster_info().management_nodes) == 1:
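                # ---------------------- Second management node --------------------- #
                # ------------------------------------------------------------------ #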
                node_num = len(conf.get_cluster_info().management_nodes) + 1
                self.__status_report.nod_num = node_num

                connection_status = self.check_connections()
                if not connection_status.success:
                    self.__status_report.failed_tasks.extend(
                        connection_status.failed_tasks)
                    logger.error("Connection ping error.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().connection_error

                NTPConf().setup_ntp_local()

                if conf.add_management_node() != Status().done:
                    self.__status_report.success = False
                    self.__status_report.failed_tasks.append(
                        "core_cluster_deploy_cant_add_node")
                    return BuildStatus().error
                if not self.__sync_cluster_config_file():
                    return BuildStatus().error

                logger.info(
                    "Node 2 is added, cluster requires 1 other node to build.")
                self.run_post_deploy_script()
                return BuildStatus().TwoManagementNodes

            elif len(conf.get_cluster_info().management_nodes) == 2:
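                # ------------------ Third node: build full cluster ----------------- #
                # ------------------------------------------------------------------ #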
                node_num = len(conf.get_cluster_info().management_nodes) + 1
                self.__status_report.nod_num = node_num

                connection_status = self.check_connections()
                if not connection_status.success:
                    self.__status_report.failed_tasks.extend(
                        connection_status.failed_tasks)
                    logger.error("Connection ping error.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().connection_error

                status = self.check_remote_connection()
                if not status.success:
                    self.__status_report = status
                    return BuildStatus().error

                NTPConf().setup_ntp_local()

                logger.info("Stopping petasan services on all nodes.")
                self.stop_petasan_services()
                logger.info("Starting local clean_ceph.")
                clean_ceph()
                logger.info("Starting local clean_consul.")
                clean_consul()

                status = build_consul()
                if not status.success:
                    self.__status_report.failed_tasks.extend(
                        status.failed_tasks)
                    logger.error("Could not build consul.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().build_consul_error

                status = build_monitors()
                if not status.success:
                    self.__status_report = status
                    logger.error("Could not build ceph monitors.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().build_monitors_error

                status = build_osds()
                if not status.success:
                    self.__status_report = status
                    logger.error("Could not build ceph OSDs.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().build_osd_error
                else:
                    self.__status_report.failed_tasks.extend(
                        status.failed_tasks)

                logger.info("Main core components deployed.")

                if not self.__commit_management_nodes():
                    self.__status_report.success = False
                    logger.error("Could not commit node.")
                    self.__status_report.failed_tasks.append(
                        "core_cluster_deploy_couldnt_commit_node")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().error

                logger.info("Starting all services.")
                self.start_petasan_services()

                if not self.add__node_to_hosts_file():
                    self.__status_report.success = False
                    logger.error("Could not add node to hosts file.")
                    self.__status_report.failed_tasks.append(
                        "core_cluster_deploy_couldnt_add_node_hosts")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().error

                SharedFS().setup_management_nodes()

                if conf.add_management_node() != Status().done:
                    self.__status_report.success = False
                    self.__status_report.failed_tasks.append(
                        "core_cluster_deploy_couldnt_add_node_config")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().error

                logger.info("Updating rbd pool.")
                if not create_rbd_pool():
                    self.__status_report.success = False
                    self.__status_report.failed_tasks.append(
                        "core_cluster_deploy_couldnt_update_rbd")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().error

                logger.info("Creating EC Profiles.")
                if not create_ec_profiles():
                    self.__status_report.success = False
                    self.__status_report.failed_tasks.append(
                        "core_cluster_deploy_couldnt_create_ec_profiles")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().error

                logger.info(
                    "Waiting for ceph to reach active and clean status.")
                test_active_clean()
                if not self.__sync_cluster_config_file():
                    return BuildStatus().error

                self.run_post_deploy_script()
                self.kill_petasan_console(True)
                logger.info("Node 3 added and cluster is now ready.")

            elif len(
                    conf.get_cluster_info().management_nodes
            ) == 3 and not os.path.exists(ConfigAPI().get_replace_file_path()):
                # ------------------------------ Join ------------------------------ #
                # ------------------------------------------------------------------ #
                node_num = len(conf.get_cluster_info().management_nodes) + 1
                self.__status_report.nod_num = node_num
                logger.info("Joining node to running cluster.")

                connection_status = self.check_connections()
                if not connection_status.success:
                    self.__status_report.failed_tasks.extend(
                        connection_status.failed_tasks)
                    logger.error("Connection ping error.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().connection_error

                status = self.check_remote_connection()
                NTPConf().setup_ntp_local()

                if not status.success:
                    self.__status_report = status
                    return BuildStatus().error

                logger.info("Stopping petasan services on local node.")
                self.stop_petasan_services(remote=False)
                logger.info("Starting local clean_ceph.")
                clean_ceph_local()
                logger.info("Starting local clean_consul.")
                clean_consul_local()

                status = build_consul_client()
                if not status.success:
                    self.__status_report.failed_tasks.extend(
                        status.failed_tasks)
                    logger.error("Could not build consul client.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().build_consul_error

                status = copy_ceph_config_from_mon()
                if not status.success:
                    self.__status_report.failed_tasks.extend(
                        status.failed_tasks)
                    logger.error("Could not copy ceph config.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().build_consul_error

                status = create_osds_local()
                if not status.success:
                    self.__status_report = status
                    logger.error("Could not build ceph OSDs.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().build_osd_error
                else:
                    self.__status_report.failed_tasks.extend(
                        status.failed_tasks)

                logger.info("Main core components deployed.")
                logger.info("Staring all services")
                self.start_petasan_services(remote=False)
                test_active_clean()
                if not self.__commit_local_node():
                    test_active_clean()
                    if not self.__commit_local_node():
                        self.__status_report.success = False
                        logger.error("Could not commit node.")
                        self.__status_report.failed_tasks.append(
                            "core_cluster_deploy_couldnt_commit_node_join")
                        logger.error(self.__status_report.failed_tasks)
                        os.remove(ConfigAPI().get_cluster_info_file_path())
                        return BuildStatus().error

                if not self.add__node_to_hosts_file(remote=False):
                    test_active_clean()
                    if not self.add__node_to_hosts_file(remote=False):
                        self.__status_report.success = False
                        logger.error("Could not add node to hosts file.")
                        self.__status_report.failed_tasks.append(
                            "core_cluster_deploy_couldnt_add_node_hosts")
                        logger.error(self.__status_report.failed_tasks)
                        os.remove(ConfigAPI().get_cluster_info_file_path())
                        return BuildStatus().error

                logger.info("Node successfully joined to cluster.")
                self.kill_petasan_console(False)
                if os.path.exists(ConfigAPI().get_replace_file_path()):
                    os.remove(ConfigAPI().get_replace_file_path())

                self.run_post_deploy_script()
                return BuildStatus().done_joined

            elif len(conf.get_cluster_info().management_nodes
                     ) == 3 and os.path.exists(
                         ConfigAPI().get_replace_file_path()):
                # ----------------------------- Replace ---------------------------- #
                # ------------------------------------------------------------------ #
                node_num = len(conf.get_cluster_info().management_nodes) + 1
                self.__status_report.nod_num = node_num
                logger.info("Replace node is starting.")

                connection_status = self.check_connections()
                if not connection_status.success:
                    self.__status_report.failed_tasks.extend(
                        connection_status.failed_tasks)
                    logger.error("Connection ping error.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().connection_error

                status = self.check_remote_connection()
                NTPConf().setup_ntp_local()

                if not status.success:
                    self.__status_report = status
                    return BuildStatus().error

                logger.info("Stopping petasan services on local node.")
                self.stop_petasan_services(remote=False)
                logger.info("Starting clean_ceph.")
                clean_ceph_local()
                logger.info("Starting local clean_consul.")
                clean_consul_local()

                status = replace_consul_leader()
                if not status.success:
                    self.__status_report.failed_tasks.extend(
                        status.failed_tasks)
                    logger.error("Could not replace consul leader.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().build_consul_error

                status = replace_local_monitor()
                if not status.success:
                    self.__status_report.failed_tasks.extend(
                        status.failed_tasks)
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().build_monitors_error

                status = create_osds_local()
                if not status.success:
                    self.__status_report = status
                    logger.error("Could not build ceph OSDs.")
                    logger.error(self.__status_report.failed_tasks)
                    return BuildStatus().build_osd_error
                else:
                    self.__status_report.failed_tasks.extend(
                        status.failed_tasks)

                logger.info("Main core components deployed.")
                logger.info("Starting all services.")
                self.start_petasan_services(remote=False)
                test_active_clean()

                SharedFS().rebuild_management_node()

                logger.info("Node successfully added to cluster.")
                self.run_post_deploy_script()
                self.kill_petasan_console(False)
                os.remove(ConfigAPI().get_replace_file_path())
                return BuildStatus().done_replace

        except Exception as ex:
            config_api = ConfigAPI()
            if os.path.exists(config_api.get_cluster_info_file_path()):
                os.remove(config_api.get_cluster_info_file_path())
            logger.exception(ex.message)
            return BuildStatus().error

        return BuildStatus().done
def clean_consul_local():
    logger.info("Trying to clean Consul on local node")
    PetaSAN.core.common.cmd.call_cmd('python ' +
                                     ConfigAPI().get_consul_stop_script_path())
    PetaSAN.core.common.cmd.call_cmd(
        'python ' + ConfigAPI().get_consul_clean_script_path())
    def __read(self):
        # Run rados benchmark on selected nodes
        for node in self.clients:
            cmd = "python " + ConfigAPI().get_node_stress_job_script_path(
            ) + " -d {} -t {}  -m r -p {}".format(self.stress_duration,
                                                  self.threads, self.pool)
            logger.info("Run rados read cmd on node {} : ".format(node) + cmd)
            out, err = ssh().exec_command(node, cmd)
            # get job id from output and assign to its node
            if not err:
                self.read_jobs[int(out)] = node

        logger.info("Wait time before collect node state.")
        sleep(self.wait_for_collect_state)

        # Get state of storage nodes
        for node in self.storage_nodes:
            cmd = "python " + ConfigAPI().get_storage_load_job_script_path(
            ) + " -d {} ".format(self.state_duration)
            out, err = ssh().exec_command(node, cmd)
            logger.info("Run sar state cmd on node {} : ".format(node) + cmd)
            if not err:
                self.read_jobs[int(out)] = node
        # Wait to complete all jobs
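        # wait_for_collect_state seconds have already elapsed, so the total wait adds up to stress_duration.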
        sleep(self.stress_duration - self.wait_for_collect_state)
        # Check the completed jobs and get the output
        while len(self.read_jobs) > 0:
            remove_job_ids = []
            for job_id, node_name in self.read_jobs.iteritems():
                cmd = "python " + ConfigAPI().get_job_info_script_path(
                ) + " -id {} -t {}".format(job_id, 1)
                out, err = ssh().exec_command(node_name, cmd)
                # Job completed
                if int(out) == 1:
                    remove_job_ids.append(job_id)
                    cmd = "python " + ConfigAPI().get_job_info_script_path(
                    ) + " -id {} -t {}".format(job_id, 2)
                    out, err = ssh().exec_command(node_name, cmd)
                    logger.debug(
                        "Get job output by cmd {} from node {} ".format(
                            cmd, node_name))
                    logger.debug("Output is {} ".format(out))
                    # job passed; extract our output after the split marker
                    if out.find(self.output_split_text) > -1:
                        out = out.split(self.output_split_text)[1]
                else:
                    continue
                # Get rados IOPs output
                if node_name in self.clients:
                    rados_rs = RadosResult()
                    if out:
                        rados_rs.load_json(out)
                        self.report.read_iops += rados_rs.iops
                        self.report.read_throughput += rados_rs.throughput
                elif node_name in self.storage_nodes:
                    # Get sar output
                    sar_rs = SarResult()
                    if out:
                        sar_rs.load_json(out)
                        self.report.read_nodes.append(sar_rs)

            # Remove completed jobs
            for i in remove_job_ids:
                self.read_jobs.pop(i)
            if len(self.read_jobs) > 0:
                sleep(5)