Example #1
    def _clean_iscsi_config(self, disk_id, path_index, iqn):

        logger.debug("Move action ,start clean disk {} path {}.".format(
            disk_id, path_index))

        lio_api = LioAPI()

        try:

            # Get tpgs for iqn.
            tpgs = lio_api.get_iqns_with_enabled_tpgs().get(iqn, None)
            if not iqn or not tpgs or len(tpgs) == 0:
                logger.info("Move action ,could not find ips for %s " %
                            disk_id)
            # Remove the assigned ips from our interfaces
            elif tpgs and len(tpgs) > 0:
                # Get assigned ips for each path.
                for tpg, ips in tpgs.iteritems():
                    if tpg == str(path_index + 1):
                        lio_api.disable_path(iqn, tpg)
                        logger.info(
                            "Move action, cleaned disk {} path {}.".format(
                                disk_id, path_index))
                        break
        except Exception as e:
            logger.error("Move action,could not clean disk path for %s" %
                         disk_id)
            return False
        logger.debug("Move action end clean disk {} path {}.".format(
            disk_id, path_index))
        return True
Example #2
def get_next_partition_index(dev):
    """
    Get the next free partition index on a given device.

    :return: Index number (> 1 if there is already a partition on the device)
    or 1 if there is no partition table.
    """
    try:
        output, err = exec_command('parted --machine -- {} print'.format(dev))
        lines = output
    except subprocess.CalledProcessError as e:
        logger.info('cannot read partition index; assume it '
                    'isn\'t present\n (Error: %s)' % e)
        return 1

    if not lines:
        logger.error('parted failed to output anything')
        raise Exception('parted failed to output anything')
    logger.debug('get_free_partition_index: analyzing ' + lines)
    if ('CHS;' not in lines and
            'CYL;' not in lines and
            'BYT;' not in lines):
        logger.error('parted output expected to contain one of '
                     'CHS; CYL; or BYT; : ' + lines)
        raise Exception('unexpected parted output: ' + lines)
    if os.path.realpath(dev) not in lines:
        logger.error('parted output expected to contain ' + dev + ': ' + lines)
        raise Exception('unexpected parted output: ' + lines)
    _, partitions = lines.split(os.path.realpath(dev))
    numbers_as_strings = re.findall(r'^\d+', partitions, re.MULTILINE)
    partition_numbers = map(int, numbers_as_strings)
    if partition_numbers:
        return max(partition_numbers) + 1
    else:
        return 1
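
The function keys on parted's machine-readable output: a 'BYT;'/'CHS;'/'CYL;' unit header, a line for the device itself, then one line per partition starting with its index. A small self-contained sketch of the same extraction logic, run against a hypothetical sample of `parted --machine` output rather than a live device:

import os
import re

# Hypothetical output of `parted --machine -- /dev/sdb print` (assumed, not captured from a real disk).
sample_output = (
    'BYT;\n'
    '/dev/sdb:107GB:scsi:512:512:gpt:QEMU HARDDISK:;\n'
    '1:1049kB:1075MB:1074MB:xfs::;\n'
    '2:1075MB:2149MB:1074MB:::;\n'
)

dev = '/dev/sdb'
_, partitions = sample_output.split(os.path.realpath(dev))
# Partition lines start with their index; collect them and take the next free one.
partition_numbers = [int(n) for n in re.findall(r'^\d+', partitions, re.MULTILINE)]
print(max(partition_numbers) + 1 if partition_numbers else 1)  # -> 3
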
Example #3
 def __process(self):
     logger.debug("Start process, node session id is {}.".format(
         self.__session))
     self.__last_acquire_succeeded = True
     self.__ignored_acquire_paths = dict()
     while not self.__do_process():
         pass
     logger.debug("End process.")
Example #4
 def __read_resources_consul(self):
     logger.debug("Start read resources consul.")
     self.__paths_per_session = {}
     self.__total_cluster_paths = 0
     unlock_kvs = set()
     self.__paths_consul_locked_node = dict()
     try:
         disk_kvs = ConsulAPI().get_disk_kvs()
         for kv in disk_kvs:
             key = str(kv.Key).replace(
                 self.__app_conf.get_consul_disks_path(), "")
             disk_id = str(key).split('/')[0]
             if disk_id in self.__disk_consul_stopped:
                 continue
             if kv.Value == "disk":
                 disk_id = str(key).split('/')[0]
                 self.__paths_per_disk_local[disk_id] = 0
                 if str(kv.Flags) == "1":
                     self.__disk_consul_stopped.add(disk_id)
                 continue
             # Count paths in the cluster.
             self.__total_cluster_paths += 1
             if hasattr(kv, "Session"):
                 disk_id = str(key).split('/')[0]
                 disks = self.__paths_consul_locked_node.get(
                     kv.Session, dict())
                 paths = disks.get(disk_id, 0)
                 disks[disk_id] = paths + 1
                 self.__paths_consul_locked_node[kv.Session] = disks
                 # The count of paths for each session
                 if kv.Session in self.__paths_per_session:
                     count = self.__paths_per_session.get(kv.Session)
                     self.__paths_per_session[kv.Session] = count + 1
                 else:
                     self.__paths_per_session[kv.Session] = 1
                 if kv.Session == self.__session:
                     self.__paths_consul_locked_node.add(key)
                     disk_paths_count = self.__paths_per_disk_local.get(
                         disk_id, 0) + 1
                     self.__paths_per_disk_local[disk_id] = disk_paths_count
             # unlocked paths
             elif not hasattr(kv, "Session"):
                 unlock_kvs.add(kv)
         # Filter unlocked paths
         for kv in unlock_kvs:
             key = str(kv.Key).replace(
                 self.__app_conf.get_consul_disks_path(), "")
             disk_id = str(key).split('/')[0]
             if self.__paths_per_disk_local.get(disk_id, 0) > 0:
                 self.__paths_consul_unlocked_siblings[key] = kv.CreateIndex
             else:
                 self.__paths_consul_unlocked_firstborn[
                     key] = kv.CreateIndex
     except Exception as e:
         logger.error("Could not read consul resources.")
         logger.exception(e)
         raise e
     logger.debug("End read resources consul.")
Example #5
def _send_graphite(key, val):
    #cmd = "echo 'PetaSAN.NodeStats.{node}.{key} {val}' `date +%s` |  nc -q0 {server}  2003".\
    cmd = 'echo \"PetaSAN.NodeStats.{node}.{key} {val} `date +%s` \"  |  nc -q0 {server}  2003'.\
        format(node=node_name, key=key, val=val, server=leader_ip)
    logger.debug(cmd)

    ret, stdout, stderr = exec_command_ex2(cmd)  # use this function to check stderr
    if stderr:
        raise Exception("Error running echo command: " + cmd)
Example #6
def arping(args):
    i = 0
    cmd = "arping -A {} -I {} -c {}".format(args.ip, args.eth, args.c)
    logger.debug(cmd)

    while i < args.t:
        i += 1
        call_cmd(cmd)
        sleep(args.i)
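
arping() expects an args object carrying ip, eth, c (packets per run), t (number of runs) and i (sleep between runs). The flag names below are assumptions inferred from those attribute names; a minimal argparse parser that would produce a compatible object:

import argparse

parser = argparse.ArgumentParser(description='Announce an IP with gratuitous ARP.')
parser.add_argument('-ip', required=True, help='IP address to announce')
parser.add_argument('-eth', required=True, help='interface to send on')
parser.add_argument('-c', type=int, default=1, help='arping packets per run')
parser.add_argument('-t', type=int, default=1, help='number of runs')
parser.add_argument('-i', type=float, default=1.0, help='seconds to sleep between runs')

# args = parser.parse_args(['-ip', '10.0.1.5', '-eth', 'eth1'])
# arping(args)
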
Example #7
 def __clean_unused_ips(self):
     ips = Network().get_all_configured_ips()
     for ip, eth_name in ips.iteritems():
         ip, netmask = str(ip).split("/")
         if ip not in self.__local_ips and ip != self.__node_info.backend_1_ip and \
                         ip != self.__node_info.backend_2_ip and ip != self.__node_info.management_ip:
             NetworkAPI().delete_ip(ip, eth_name, netmask)
             logger.debug("Clean unused ip {} on interface {}.".format(
                 ip, eth_name))
Example #8
 def __clean_unused_rbd_images(self):
     ceph_api = CephAPI()
     rbd_images = ceph_api.get_mapped_images()
     if rbd_images is None:
         return
     for image, mapped_count in rbd_images.iteritems():
         if image not in self.__backstore:
             if int(mapped_count) > 0:
                 for i in range(0, int(mapped_count)):
                     ceph_api.unmap_image(image)
                     logger.debug("Unmapped unused image {}.".format(image))
Example #9
 def __clean_unused_iqns(self):
     status = False
     lio_api = LioAPI()
     for iqn in lio_api.get_unused_iqns():
         disk_id = str(iqn).split(":")[1]
         image_name = self.__image_name_prefix + str(disk_id)
         lio_api.delete_target(image_name, iqn)
         CephAPI().unmap_image(image_name)
         status = True
         logger.debug("Clean unused iqn {}.".format(iqn))
     return status
Example #10
 def __unlock_consul_path(self, path):
     try:
         logger.debug("Unlock {} path locked by session {}.".format(
             path, self.__session))
         consul_api = ConsulAPI()
         consul_api.release_disk_path(
             self.__app_conf.get_consul_disks_path() + path, self.__session,
             None)
         logger.info("Unlock path %s" % path)
     except Exception as e:
         logger.error("Could not unlock path %s" % path)
         raise e
Example #11
def unmount(
        path,
        do_rm=True,
):
    """
    Unmounts and removes the given mount point.
    """
    try:
        logger.debug('Unmounting %s', path)
        exec_command_ex('/bin/umount -- ' + path)

    except subprocess.CalledProcessError as e:
        raise Exception('Error unmounting disk.', e)
    if not do_rm:
        return
    os.rmdir(path)
Example #12
def mount(
        dev,
        fstype,
        options,
):
    """
    Mounts a device with the given filesystem type and
    mount options at a temporary path under /var/lib/ceph/tmp.
    """
    # sanity check: none of the arguments are None
    if dev is None:
        raise ValueError('dev may not be None')
    if fstype is None:
        raise ValueError('fstype may not be None')

    # pick best-of-breed mount options based on fs type
    if options is None:
        options = "noatime"

    myTemp = STATEDIR + '/tmp'
    # mkdtemp expects 'dir' to exist on the system,
    # so make sure it always does.
    if not os.path.exists(myTemp):
        os.makedirs(myTemp)

    # mount
    path = tempfile.mkdtemp(
        prefix='mnt.',
        dir=myTemp,
    )
    try:
        logger.debug('Mounting %s on %s with options %s', dev, path, options)
        cmd = 'mount -t ' + fstype + ' -o ' + options + ' -- ' + dev + ' ' + path
        exec_command_ex(cmd)

        if which('restorecon'):
            cmd = 'restorecon ' + path
            exec_command_ex(cmd)
    except subprocess.CalledProcessError as e:
        try:
            os.rmdir(path)
        except (OSError, IOError):
            pass
        raise Exception('Error Mounting disk.', e)

    return path
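
mount() returns the temporary mount point it created, and unmount() (Example #11) both unmounts and removes it, so callers pair the two in try/finally, as Example #19 below does. A minimal usage sketch with an assumed device, filesystem type and file name:

tpath = mount(dev='/dev/sdb1', fstype='xfs', options='noatime')
try:
    # read a hypothetical marker file from the mounted filesystem
    with open(tpath + '/fsid', 'r') as f:
        fsid = f.read().strip()
finally:
    unmount(tpath)
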
Example #13
def new_mon_keyring(cluster):
    logger.debug('Creating a random mon key...')
    mon_keyring = '[mon.]\nkey = %s\ncaps mon = allow *\n' % generate_auth_key()

    keypath = '{name}.mon.keyring'.format(
        name=cluster,
        )
    oldmask = os.umask(077)
    logger.debug('Writing monitor keyring to %s...', keypath)
    try:
        tmp = '%s.tmp' % keypath
        # the umask set above ensures the keyring file is created with 0600 permissions
        with open(tmp, 'w') as f:
            f.write(mon_keyring)
        try:
            os.rename(tmp, keypath)
        except OSError:
            raise
    finally:
        os.umask(oldmask)
Example #14
 def __read_resources_local(self):
     logger.debug("Start read local resources.")
     lio_api = LioAPI()
     try:
         self.__backstore = lio_api.get_backstore_image_names()
         self.__iqn_tpgs = lio_api.get_iqns_with_enabled_tpgs()
         for iqn, tpgs in self.__iqn_tpgs.iteritems():
             disk_id = str(iqn).split(":")[1]
             for tpg_index, ips in tpgs.iteritems():
                 self.__paths_local.add("/".join([disk_id, str(tpg_index)]))
                 if ips and len(ips) > 0:
                     for ip in ips:
                         self.__local_ips.add(ip)
     except Exception as e:
         logger.error("Could not read consul resources.")
         raise e
     logger.debug("End read local resources.")
Example #15
 def __clean_unused_rbd_backstore(self):
     status = False
     iqns = self.__iqn_tpgs.keys()
     for rbd_backstore in self.__backstore:
         rbd_backstore_disk_id = str(rbd_backstore).replace(
             self.__image_name_prefix, "")
         is_used = False
         for iqn in iqns:
             disk_id = str(iqn).split(":")[1]
             if disk_id == rbd_backstore_disk_id:
                 is_used = True
                 break
         if not is_used:
             LioAPI().delete_backstore_image(rbd_backstore)
             logger.debug(
                 "Clean unused lio backstore {}.".format(rbd_backstore))
             status = True
     return status
Example #16
    def __clean_local_path(self, path):
        disk_id, path_index = str(path).split("/")
        logger.debug("Start clean disk path {}.".format(path))
        image_name = self.__image_name_prefix + str(disk_id)
        ceph_api = CephAPI()
        lio_api = LioAPI()
        network_api = NetworkAPI()

        try:

            # Get iqn.
            logger.debug("Start get disk meta to clean path {}.".format(path))
            # iqn = ceph_api.get_disk_meta(disk_id, pool).iqn
            iqn = self._get_iqn_by_disk(disk_id)
            logger.debug("End get disk meta to clean path {}.".format(path))
            # Get tpgs for iqn.
            tpgs = self.__iqn_tpgs.get(iqn, None)
            if not iqn or not tpgs or len(tpgs) == 0:
                logger.info("Could not find ips for %s " % image_name)
            # Remove the assigned ips from our interfaces
            elif tpgs and len(tpgs) > 0:
                # Get assigned ips for each path.
                for tpg, ips in tpgs.iteritems():
                    if tpg == path_index:
                        for ip in ips:
                            logger.debug(
                                "Delete ip {} to clean path {}.".format(
                                    ip, path))
                            if not network_api.delete_ip(
                                    ip, self.__cluster_info.iscsi_1_eth_name):
                                network_api.delete_ip(
                                    ip, self.__cluster_info.iscsi_2_eth_name)

                        lio_api.disable_path(iqn, path_index)
                        logger.info("Cleaned disk path {}.".format(path))
                        break
        except Exception as e:
            logger.error("Could not clean disk path for %s" % image_name)
            raise e
        logger.debug("End clean disk path {}.".format(path))
        return
Example #17
def build_consul():
    try:
        # Generate a Security Key
        keygen = PetaSAN.core.common.cmd.exec_command('consul keygen')[0]
        keygen = str(keygen).splitlines()[0]
        logger.debug('keygen: ' + keygen)

        conf = configuration()
        cluster_info = conf.get_cluster_info()
        cluster_name = cluster_info.name
        logger.info('cluster_name: ' + cluster_name)

        local_node_info = conf.get_node_info()
        logger.info("local_node_info.name: " + local_node_info.name)

        __create_leader_conf_locally(keygen)
        continue_building_cluster = __create_leader_conf_remotely(
            keygen, cluster_info, local_node_info)

        if continue_building_cluster is True:
            __start_leader_remotely(cluster_info, local_node_info)
            __start_leader_locally()
        else:
            logger.error('Error building Consul cluster')
            consul_status_report = StatusReport()
            consul_status_report.success = False
            consul_status_report.failed_tasks.append(
                'core_consul_deploy_build_error_build_consul_cluster')
            return consul_status_report

        # sleep(5)
        consul_status_report = __test_leaders()
        logger.debug(consul_status_report)
        return consul_status_report
    except Exception as ex:
        logger.exception(ex.message)
        consul_status_report = StatusReport()
        consul_status_report.success = False
        consul_status_report.failed_tasks.append(
            'core_consul_deploy_build_error_build_consul_cluster')
        return consul_status_report
Example #18
    def __wait_before_lock(self, path=None):

        wait_time = 0
        if path:
            disk_id, path_index = str(path).split("/")
            # 1- Calc wait time if path has siblings.
            wait_time = int(self.__app_conf.get_siblings_paths_delay()) * int(
                self.__paths_per_disk_local.get(disk_id, 0))

        logger.debug("Wait time for siblings is {}.".format(wait_time))
        total_nodes = len(ConsulAPI().get_consul_members())
        # 2- Calc average paths per node.
        average_node_paths = float(
            self.__total_cluster_paths) / float(total_nodes)
        # Calc the percent of local paths according to average paths.
        percent = float(self.__paths_per_session.get(self.__session,
                                                     0)) / average_node_paths
        # 3- Calc total wait time
        if self.__last_acquire_succeeded:
            wait_time += int(
                self.__app_conf.get_average_delay_before_lock()) * percent
        else:
            logger.debug("Skipping wait time for average delay.")
        logger.debug(
            "Wait time depending on average and siblings is {}.".format(
                math.ceil(wait_time)))
        sleep(math.ceil(wait_time))
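
The back-off above adds a per-disk sibling delay to a term proportional to how many paths this session already holds relative to the cluster average (the average term is only added when the last acquire succeeded). A standalone worked example with assumed configuration values and counts:

import math

siblings_paths_delay = 5        # assumed config value, seconds per sibling path on the disk
average_delay_before_lock = 10  # assumed config value, seconds
paths_per_disk_local = 2        # paths this node already serves for the disk
total_cluster_paths = 100
total_nodes = 4
paths_this_session = 30         # paths currently locked by this node's session

wait_time = siblings_paths_delay * paths_per_disk_local           # 10
average_node_paths = float(total_cluster_paths) / total_nodes     # 25.0
percent = paths_this_session / average_node_paths                 # 1.2
wait_time += average_delay_before_lock * percent                  # 10 + 12 = 22
sleep_for = math.ceil(wait_time)                                  # 22
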
Example #19
def list_devices():
    partmap = list_all_partitions()

    uuid_map = {}
    space_map = {}
    for base, parts in sorted(partmap.items()):
        for p in parts:
            dev = get_dev_path(p)
            part_name = get_dev_name(dev)
            part_uuid = get_partition_uuid(part_name)
            if part_uuid:
                uuid_map[part_uuid] = dev
            ptype = get_partition_type(part_name)
            logger.debug("main_list: " + dev +
                         " ptype = " + str(ptype) +
                         " uuid = " + str(part_uuid))
            if ptype == PTYPE['osd']:

                dev_to_mount = dev

                fs_type = get_dev_fs(dev_to_mount)
                if fs_type is not None:

                    try:
                        tpath = mount(dev=dev_to_mount,
                                      fstype=fs_type, options='noatime')
                        try:
                            for name in Space.NAMES:
                                space_uuid = get_oneliner(tpath,
                                                          name + '_uuid')
                                if space_uuid:
                                    space_map[space_uuid.lower()] = dev
                        finally:
                            unmount(tpath)
                    except Exception:
                        logger.debug('Mounting filesystem failed')

    logger.debug("main_list: " + str(partmap) + ", uuid_map = " +
                 str(uuid_map) + ", space_map = " + str(space_map))

    devices = []
    for base, parts in sorted(partmap.items()):
        if parts:
            disk = {'path': get_dev_path(base)}
            partitions = []
            for p in sorted(parts):
                partitions.append(list_dev(get_dev_path(p),
                                           uuid_map,
                                           space_map))
            disk['partitions'] = partitions
            devices.append(disk)
        else:
            device = list_dev(get_dev_path(base), uuid_map, space_map)
            device['path'] = get_dev_path(base)
            devices.append(device)
    logger.debug("list_devices: " + str(devices))
    return devices
Example #20
    def manager(self, test_type, duration_sec, threads, clients, pool,
                cleanup):

        # CephAPI().create_rados_test_pool()
        logger.debug("Benchmark manager request.")
        logger.debug(clients)
        try:
            self.type = int(test_type)
            # Duration of write and read stress test
            self.stress_duration = duration_sec / 2
            self.threads = threads
            # Non-storage (client) nodes
            self.clients = clients
            # Time to wait between starting the rados test and collecting storage node state
            self.wait_for_collect_state = self.stress_duration / 4
            # Duration of collect node state
            self.state_duration = self.stress_duration / 2
            # pool
            self.pool = pool

            # Running cleanup before the test: there is no cleanup file of written objects,
            # so it iterates over all objects in the pool, which is very slow.
            # self.__cleanup()

            nodes = ManageNode().get_node_list()
            # Get available storage nodes
            for node in nodes:
                if node.name not in clients and node.status == NodeStatus.up and node.is_storage:
                    self.storage_nodes.append(str(node.name))
                    print self.storage_nodes

            if len(self.storage_nodes) == 0 and \
                    (self.type == RadosBenchmarkType.four_mg_Throughput or self.type == RadosBenchmarkType.four_kb_iops):
                raise Exception(
                    "Cannot complete rados benchmark. No storage nodes available for run test."
                )

            logger.debug(self.storage_nodes)
            if self.type == RadosBenchmarkType.four_mg_Throughput or self.type == RadosBenchmarkType.four_kb_iops:
                self.report = BenchmarkResult()
                logger.info("Benchmark start rados write.")
                self.__write()
                logger.info("Benchmark start rados read.")
                self.__read()
                logger.info("Benchmark finished.")
                return self.report

            else:
                # TODO
                pass
        except Exception as e:
            logger.exception(e.message)

        finally:
            #CephAPI().delete_rados_test_pool()
            if cleanup:
                self.__cleanup()
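
manager() derives its internal timings from duration_sec; with an assumed 600-second run (and Python 2 integer division, as in the code above) the split looks like this:

duration_sec = 600                            # assumed total benchmark duration
stress_duration = duration_sec / 2            # 300 s of write plus 300 s of read
wait_for_collect_state = stress_duration / 4  # wait 75 s before sampling node state
state_duration = stress_duration / 2          # collect sar state for 150 s
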
Example #21
    def handle_cluster_startup(self):
        i = 0
        consul_api = ConsulAPI()
        logger.debug("Check cluster startup.")
        while True:
            try:

                current_node_name = self.__node_info.name
                result = consul_api.set_leader_startup_time(
                    current_node_name, str(i))
                if i == 0 and not result:
                    sleep(2)
                    continue
                elif result:
                    # value returned, consul is up and running
                    sleep(2)
                    number_of_started_nodes = 0
                    for kv in consul_api.get_leaders_startup_times():
                        node_name = str(kv.Key).replace(
                            ConfigAPI().get_consul_leaders_path(), "")
                        if node_name != current_node_name:
                            if int(kv.Value) == 0:
                                number_of_started_nodes += 1

                    logger.debug("Number of started nodes = {}.".format(
                        number_of_started_nodes))
                    # Another management node is just starting
                    if i == 0 and number_of_started_nodes > 0:
                        logger.info(
                            "Cluster is just starting, system will delete all active disk resources"
                        )
                        consul_api.delete_disk(
                            ConfigAPI().get_consul_disks_path(), recurse=True)
                i += 1
                sleep(58)

            except Exception as ex:
                logger.debug("Start up error")
                logger.exception(ex)
                # maybe other management nodes are starting, give them a chance to start
                if i == 0:
                    sleep(2)
                else:
                    i += 1
                    sleep(58)
Example #22
def __start_leader_remotely(cluster_info, local_node_info):
    logger.info('Start consul leaders remotely.')
    ssh_exec = ssh()
    for cluster_node in cluster_info.management_nodes:
        remote_node_info = NodeInfo()
        remote_node_info.load_json(json.dumps(cluster_node))

        logger.debug('local_node_info.backend_1_ip: ' +
                     local_node_info.backend_1_ip)
        logger.debug('remote_node_info.backend_1_ip: ' +
                     remote_node_info.backend_1_ip)

        if local_node_info.backend_1_ip != remote_node_info.backend_1_ip:
            logger.debug(
                'Sending: ' + 'python ' +
                ConfigAPI().get_consul_start_up_script_path() +
                ' -retry-join {} '.format(local_node_info.backend_1_ip))

            ssh_exec.exec_command(
                remote_node_info.backend_1_ip,
                'python ' + ConfigAPI().get_consul_start_up_script_path() +
                ' -retry-join {} '.format(local_node_info.backend_1_ip))
    return
Example #23
    def __read(self):
        # Run rados benchmark on selected nodes
        for node in self.clients:
            cmd = "python " + ConfigAPI().get_node_stress_job_script_path(
            ) + " -d {} -t {}  -m r -p {}".format(self.stress_duration,
                                                  self.threads, self.pool)
            logger.info("Run rados read cmd on node {} : ".format(node) + cmd)
            out, err = ssh().exec_command(node, cmd)
            # get job id from output and assign to its node
            if not err:
                self.read_jobs[int(out)] = node

        logger.info("Wait time before collect node state.")
        sleep(self.wait_for_collect_state)

        # Get state of storage nodes
        for node in self.storage_nodes:
            cmd = "python " + ConfigAPI().get_storage_load_job_script_path(
            ) + " -d {} ".format(self.state_duration)
            out, err = ssh().exec_command(node, cmd)
            logger.info("Run sar state cmd on node {} : ".format(node) + cmd)
            if not err:
                self.read_jobs[int(out)] = node
        # Wait to complete all jobs
        sleep(self.stress_duration - self.wait_for_collect_state)
        # Check the completed jobs and get the output
        while len(self.read_jobs) > 0:
            remove_job_ids = []
            for job_id, node_name in self.read_jobs.iteritems():
                cmd = "python " + ConfigAPI().get_job_info_script_path(
                ) + " -id {} -t {}".format(job_id, 1)
                out, err = ssh().exec_command(node_name, cmd)
                # Job completed
                if int(out) == 1:
                    remove_job_ids.append(job_id)
                    cmd = "python " + ConfigAPI().get_job_info_script_path(
                    ) + " -id {} -t {}".format(job_id, 2)
                    out, err = ssh().exec_command(node_name, cmd)
                    logger.debug(
                        "Get job output by cmd {} from node {} ".format(
                            cmd, node_name))
                    logger.debug("Output is {} ".format(out))
                    # job passed and get our output
                    if out.startswith(self.output_split_text) or out.find(
                            self.output_split_text) > -1:
                        out = out.split(self.output_split_text)[1]
                else:
                    continue
                # Get rados IOPs output
                if node_name in self.clients:
                    rados_rs = RadosResult()
                    if out:
                        rados_rs.load_json(out)
                        self.report.read_iops += rados_rs.iops
                        self.report.read_throughput += rados_rs.throughput
                elif node_name in self.storage_nodes:
                    # Get sar output
                    sar_rs = SarResult()
                    if out:
                        sar_rs.load_json(out)
                        self.report.read_nodes.append(sar_rs)

            # Remove completed jobs
            for i in remove_job_ids:
                self.read_jobs.pop(i)
            if len(self.read_jobs) > 0:
                sleep(5)
Example #24
    def process(self):
        logger.info("Start process reassignments paths.")
        max_retry = 100
        current_reassignments = self.get_current_reassignment()
        config = configuration()
        assignment_script_path = ConfigAPI().get_assignment_script_path()
        if current_reassignments is None:
            return
        for ip, path_assignment_info in current_reassignments.iteritems():
            logger.info("process path {} and its status is {}".format(
                ip, path_assignment_info.status))
            if path_assignment_info.status == ReassignPathStatus.pending:
                logger.info(
                    "Move action, try to clean disk {} path {} remotely on node {}."
                    .format(path_assignment_info.disk_name,
                            path_assignment_info.disk_id,
                            path_assignment_info.node))

                status = False
                try:

                    cmd = "python {} path_host -ip {} -disk_id {}".format(
                        assignment_script_path, path_assignment_info.ip,
                        path_assignment_info.disk_id)
                    out, err = ssh().exec_command(path_assignment_info.node,
                                                  cmd)
                    logger.info(cmd)
                    # self.clean_source_node(path_assignment_info.ip,path_assignment_info.disk_id)
                except Exception as ex:
                    logger.exception(ex.message)
                    out = ""

                if str(out).strip() == "0":
                    logger.info("Move action passed")
                    status = True

                current_path_assignment_info = None
                if status:
                    for i in xrange(0, max_retry):
                        logger.debug(
                            "Wait to update status of path {}.".format(
                                path_assignment_info.ip))
                        sleep(0.25)
                        reassignments = self.get_current_reassignment()
                        if reassignments:
                            current_path_assignment_info = reassignments.get(
                                path_assignment_info.ip)
                            if current_path_assignment_info and current_path_assignment_info.status == ReassignPathStatus.moving:
                                continue
                            else:
                                logger.info(
                                    "Process completed for path {} with status {}."
                                    .format(
                                        current_path_assignment_info.ip,
                                        current_path_assignment_info.status))
                                break
                    if current_path_assignment_info and current_path_assignment_info.status == ReassignPathStatus.moving:
                        self.update_path(current_path_assignment_info,
                                         ReassignPathStatus.failed)
                        logger.info(
                            "Move action, failed, disk {} path {} on node {}.".format(
                                path_assignment_info.disk_name,
                                path_assignment_info.disk_id,
                                path_assignment_info.node))

                else:
                    self.update_path(path_assignment_info,
                                     ReassignPathStatus.failed)
                    logger.info(
                        "Move action, failed to clean disk {} path {} remotely on node {}."
                        .format(path_assignment_info.disk_name,
                                path_assignment_info.disk_id,
                                path_assignment_info.node))
        sleep(10)  # wait for display status to user if needed
        logger.info("Process completed.")
        self.remove_assignment()
        ConsulAPI().drop_all_node_sessions(
            self.__app_conf.get_consul_assignment_path(),
            config.get_node_name())
Example #25
    def __read_resources_consul(self):
        logger.debug("Start read resources consul.")
        self.__paths_per_session = {}
        self.__total_cluster_paths = 0
        unlock_kvs = set()
        consul_api = ConsulAPI()
        try:
            disk_kvs = consul_api.get_disk_kvs()
            for kv in disk_kvs:
                key = str(kv.Key).replace(
                    self.__app_conf.get_consul_disks_path(), "")
                disk_id = str(key).split('/')[0]
                if disk_id in self.__disk_consul_stopped:
                    continue
                if kv.Value == "disk":
                    disk_id = str(key).split('/')[0]
                    self.__paths_per_disk_local[disk_id] = 0
                    if str(kv.Flags) == "1":
                        self.__disk_consul_stopped.add(disk_id)
                    continue
                # Count paths in the cluster.
                self.__total_cluster_paths += 1

                if hasattr(kv, "Session"):
                    # locked paths
                    if kv.Session == self.__session:
                        self.__paths_consul_locked_node.add(key)
                        disk_paths_count = self.__paths_per_disk_local.get(
                            disk_id, 0) + 1
                        self.__paths_per_disk_local[disk_id] = disk_paths_count
                    # Total count of paths for each session
                    if kv.Session in self.__paths_per_session:
                        count = self.__paths_per_session.get(kv.Session)
                        self.__paths_per_session[kv.Session] = count + 1
                    else:
                        self.__paths_per_session[kv.Session] = 1
                # unlocked paths
                elif not hasattr(kv, "Session"):
                    unlock_kvs.add(kv)
            # Filter unlocked paths
            reassignments = None
            if len(unlock_kvs) > 0:
                reassignments = MangePathAssignment().get_forced_paths()
            for kv in unlock_kvs:
                key = str(kv.Key).replace(
                    self.__app_conf.get_consul_disks_path(), "")
                if reassignments:
                    path_assignment_info = reassignments.get(key)
                    if path_assignment_info and path_assignment_info.target_node == self.__node_info.name:
                        self.__force_acquire_paths[key] = kv
                        continue
                    else:
                        self.__ignored_acquire_paths[key] = kv
                        continue

                disk_id = str(key).split('/')[0]
                if self.__paths_per_disk_local.get(disk_id, 0) > 0:
                    self.__paths_consul_unlocked_siblings[key] = kv
                else:
                    self.__paths_consul_unlocked_firstborn[key] = kv
        except Exception as e:
            logger.error("Could not read consul resources.")
            logger.exception(e)
            raise e
        logger.debug("End read resources consul.")
Example #26
    def __acquire_path(self, path, consul_kv):
        if self.__ignored_acquire_paths.get(path):
            logger.info("Ignore forced path {}".format(path))
            return
        logger.debug("Start acquire path {} by node session {}.".format(
            path, self.__session))
        consul_api = ConsulAPI()
        ceph_api = CephAPI()
        lio_api = LioAPI()
        network_api = NetworkAPI()
        config = configuration()
        try:
            disk_id, path_index = str(path).split("/")
            pool = self._get_pool_by_disk(disk_id)
            if not pool:
                logger.error('Could not find pool for disk ' + disk_id)
                return
            image_name = self.__image_name_prefix + disk_id
            logger.debug(
                "Start read image meta for acquire path {}.".format(path))
            all_image_meta = ceph_api.read_image_metadata(image_name, pool)
            petasan_meta = all_image_meta.get(
                self.__app_conf.get_image_meta_key())
            disk_meta = DiskMeta()
            disk_meta.load_json(petasan_meta)
            logger.debug(
                "End read image meta for acquire path {}.".format(path))

            logger.debug("Try to acquire path {}.".format(path))
            node_name = config.get_node_name()
            result = consul_api.lock_disk_path(
                self.__app_conf.get_consul_disks_path() + path, self.__session,
                node_name, str(consul_kv.CreateIndex))
            if not result:
                logger.info("Could not lock path {} with session {}.".format(
                    path, self.__session))
            elif result:
                if consul_kv.Value != None and len(str(
                        consul_kv.Value)) > 0 and node_name != str(
                            consul_kv.Value):
                    logger.info("The path {} was locked by {}.".format(
                        path, str(consul_kv.Value)))
                    logger.debug("Node {} will kill node {}.".format(
                        config.get_node_name(), str(consul_kv.Value)))
                    self.__fencing(str(consul_kv.Value))

                # we locked it
                if disk_meta.paths:
                    # if lio has the image name in its backstore already, do not perform rbd mapping
                    if image_name not in self.__backstore:
                        status = ceph_api.map_iamge(image_name, pool)
                    else:
                        status = Status.done
                    if Status.done == status:
                        # Get path info from metadata
                        path_obj = disk_meta.get_paths()[int(path_index) - 1]
                        # add path ips to our network interfaces
                        network_api.add_ip(path_obj.ip, path_obj.subnet_mask,
                                           path_obj.eth, path_obj.vlan_id)
                        #update neighbors arp table
                        network_api.update_neighbors_arp(
                            path_obj.ip, path_obj.eth)
                        # add new target in lio if not there already
                        if not lio_api.is_backstore_image_found(image_name):
                            # Give ceph rbd map time to complete its job
                            sleep(3)
                            # Add rbd backstores and target
                            status = lio_api.add_target(
                                disk_meta, disk_meta.pool)
                            """
                            wwn = self.calculate_disk_wwn(disk_meta)
                            status = lio_api.add_target(disk_meta, wwn, disk_meta.pool)
                            """
                        if Status.done == status:
                            # enable the path we locked to true
                            self.__last_acquire_succeeded = True
                            lio_api.enable_path(disk_meta.iqn, path_index,
                                                True)
                            logger.info("Path %s acquired successfully" % path)

                            if self.__acquire_warning_counter > 2:
                                logger.info(
                                    "PetaSAN finally succeeded to acquire path after retrying {} times."
                                    .format(self.__acquire_warning_counter))
                                self.__acquire_warning_counter = 0
                            path_assignment_info = self.__force_acquire_paths.get(
                                path)

                            if path_assignment_info:
                                MangePathAssignment().update_path(
                                    path_obj.ip, ReassignPathStatus.succeeded)
                        else:
                            path_assignment_info = self.__force_acquire_paths.get(
                                path)
                            if path_assignment_info:
                                logger.info(
                                    "Failed to acquire forced path {}".format(path))
                                MangePathAssignment().update_path(
                                    path_obj.ip, ReassignPathStatus.failed)
                            self.__last_acquire_succeeded = False
                            self.__acquire_warning_counter += 1
                            if self.__acquire_warning_counter > 2:
                                logger.warning(
                                    "PetaSAN failed to acquire path after {} times."
                                    .format(self.__acquire_warning_counter))
                            logger.error("Error could not acquire path %s" %
                                         path)

                    else:
                        self.__unlock_consul_path(path)

        except Exception as e:
            logger.info("---------------------------------")
            logger.error(str(e.message) + "\n")
            logger.exception(e)

            if str(e.message).find("invalid session") > -1:
                logger.error("Session is invalid")
                try:
                    logger.info("Trying to create new session id")
                    self.__session = ConsulAPI().get_new_session_ID(
                        self.__session_name, self.__node_info.name)
                    logger.info("New session id is {}".format(self.__session))
                    logger.info("Cleaning all mapped disks from old session")
                    self.__clean()
                except Exception as ex:
                    logger.exception(ex)
            logger.exception("Could not acquire path %s" % path)
            raise e
        logger.debug("End acquire path {}.".format(path))
        return