def connect(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])
        rsp = ConnectRsp()
        diskPaths = set()

        def config_lvm(host_id, enableLvmetad=False):
            lvm.backup_lvm_config()
            lvm.reset_lvm_conf_default()
            lvm.config_lvm_by_sed("use_lvmlockd", "use_lvmlockd=1", ["lvm.conf", "lvmlocal.conf"])
            if enableLvmetad:
                lvm.config_lvm_by_sed("use_lvmetad", "use_lvmetad=1", ["lvm.conf", "lvmlocal.conf"])
            else:
                lvm.config_lvm_by_sed("use_lvmetad", "use_lvmetad=0", ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("host_id", "host_id=%s" % host_id, ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("sanlock_lv_extend", "sanlock_lv_extend=%s" % DEFAULT_SANLOCK_LV_SIZE, ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("lvmlockd_lock_retries", "lvmlockd_lock_retries=6", ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("issue_discards", "issue_discards=1", ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("reserved_stack", "reserved_stack=256", ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("reserved_memory", "reserved_memory=131072", ["lvm.conf", "lvmlocal.conf"])

            lvm.config_lvm_filter(["lvm.conf", "lvmlocal.conf"])

            lvm.config_sanlock_by_sed("sh_retries", "sh_retries=20")
            lvm.config_sanlock_by_sed("logfile_priority", "logfile_priority=7")
            lvm.config_sanlock_by_sed("renewal_read_extend_sec", "renewal_read_extend_sec=24")
            lvm.config_sanlock_by_sed("debug_renew", "debug_renew=1")
            lvm.config_sanlock_by_sed("use_watchdog", "use_watchdog=0")

            sanlock_hostname = "%s-%s-%s" % (cmd.vgUuid[:8], cmd.hostUuid[:8], bash.bash_o("hostname").strip()[:20])
            lvm.config_sanlock_by_sed("our_host_name", "our_host_name=%s" % sanlock_hostname)

        config_lvm(cmd.hostId, cmd.enableLvmetad)
        for diskUuid in cmd.sharedBlockUuids:
            disk = CheckDisk(diskUuid)
            diskPaths.add(disk.get_path())
        lvm.start_lvmlockd()
        lvm.check_gl_lock()
        logger.debug("find/create vg %s lock..." % cmd.vgUuid)
        rsp.isFirst = self.create_vg_if_not_found(cmd.vgUuid, diskPaths, cmd.hostUuid, cmd.forceWipe)

        lvm.check_stuck_vglk()
        logger.debug("starting vg %s lock..." % cmd.vgUuid)
        lvm.start_vg_lock(cmd.vgUuid)

        if lvm.lvm_vgck(cmd.vgUuid, 60)[0] is False and lvm.lvm_check_operation(cmd.vgUuid) is False:
            lvm.drop_vg_lock(cmd.vgUuid)
            logger.debug("restarting vg %s lock..." % cmd.vgUuid)
            lvm.check_gl_lock()
            lvm.start_vg_lock(cmd.vgUuid)

        lvm.clean_vg_exists_host_tags(cmd.vgUuid, cmd.hostUuid, HEARTBEAT_TAG)
        lvm.add_vg_tag(cmd.vgUuid, "%s::%s::%s::%s" % (HEARTBEAT_TAG, cmd.hostUuid, time.time(), bash.bash_o('hostname').strip()))
        self.clear_stalled_qmp_socket()

        rsp.totalCapacity, rsp.availableCapacity = lvm.get_vg_size(cmd.vgUuid)
        rsp.hostId = lvm.get_running_host_id(cmd.vgUuid)
        rsp.vgLvmUuid = lvm.get_vg_lvm_uuid(cmd.vgUuid)
        rsp.hostUuid = cmd.hostUuid
        return jsonobject.dumps(rsp)
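The heartbeat tag written by add_vg_tag above has the layout HEARTBEAT_TAG::hostUuid::epoch-seconds::hostname. The helper below is only a hypothetical sketch of how such a tag could be split back into its fields (parse_heartbeat_tag is not part of the plugin); it assumes the hostname itself contains no "::".

def parse_heartbeat_tag(tag):
    # Split from the right so a HEARTBEAT_TAG prefix may itself contain "::".
    prefix, host_uuid, ts, hostname = tag.rsplit("::", 3)
    return {"prefix": prefix, "hostUuid": host_uuid,
            "time": float(ts), "hostname": hostname}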
    def connect(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])
        rsp = ConnectRsp()
        diskPaths = set()

        def config_lvm(host_id):
            lvm.backup_lvm_config()
            lvm.reset_lvm_conf_default()
            lvm.config_lvm_by_sed("use_lvmlockd", "use_lvmlockd=1", ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("use_lvmetad", "use_lvmetad=0", ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("host_id", "host_id=%s" % host_id, ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("sanlock_lv_extend", "sanlock_lv_extend=%s" % DEFAULT_SANLOCK_LV_SIZE, ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("lvmlockd_lock_retries", "lvmlockd_lock_retries=6", ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("issue_discards", "issue_discards=1", ["lvm.conf", "lvmlocal.conf"])
            lvm.config_lvm_by_sed("reserved_memory", "reserved_memory=65536", ["lvm.conf", "lvmlocal.conf"])

            lvm.config_sanlock_by_sed("sh_retries", "sh_retries=20")
            lvm.config_sanlock_by_sed("logfile_priority", "logfile_priority=7")
            lvm.config_sanlock_by_sed("renewal_read_extend_sec", "renewal_read_extend_sec=24")
            lvm.config_sanlock_by_sed("debug_renew", "debug_renew=1")
            lvm.config_sanlock_by_sed("use_watchdog", "use_watchdog=0")

            sanlock_hostname = "%s-%s-%s" % (cmd.vgUuid[:8], cmd.hostUuid[:8], bash.bash_o("hostname").strip()[:20])
            lvm.config_sanlock_by_sed("our_host_name", "our_host_name=%s" % sanlock_hostname)

        config_lvm(cmd.hostId)
        for diskUuid in cmd.sharedBlockUuids:
            disk = CheckDisk(diskUuid)
            diskPaths.add(disk.get_path())
        lvm.start_lvmlockd()
        lvm.check_gl_lock()
        logger.debug("find/create vg %s lock..." % cmd.vgUuid)
        rsp.isFirst = self.create_vg_if_not_found(cmd.vgUuid, diskPaths, cmd.hostUuid, cmd.forceWipe)

        lvm.check_stuck_vglk()
        logger.debug("starting vg %s lock..." % cmd.vgUuid)
        lvm.start_vg_lock(cmd.vgUuid)

        if lvm.lvm_vgck(cmd.vgUuid, 15)[0] is False:
            lvm.drop_vg_lock(cmd.vgUuid)
            logger.debug("restarting vg %s lock..." % cmd.vgUuid)
            lvm.check_gl_lock()
            lvm.start_vg_lock(cmd.vgUuid)

        lvm.clean_vg_exists_host_tags(cmd.vgUuid, cmd.hostUuid, HEARTBEAT_TAG)
        lvm.add_vg_tag(cmd.vgUuid, "%s::%s::%s::%s" % (HEARTBEAT_TAG, cmd.hostUuid, time.time(), bash.bash_o('hostname').strip()))

        rsp.totalCapacity, rsp.availableCapacity = lvm.get_vg_size(cmd.vgUuid)
        rsp.hostId = lvm.get_running_host_id(cmd.vgUuid)
        return jsonobject.dumps(rsp)
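The repeated config_lvm_by_sed / config_sanlock_by_sed calls force key=value entries into lvm.conf, lvmlocal.conf and sanlock's configuration. The sketch below only illustrates that kind of keyword replacement; it is not the plugin's actual helper, and the file path, indentation and section handling are assumptions.

import re

def force_conf_entry(keyword, entry, conf_path):
    # Rewrite every line that sets <keyword> (commented out or not) to <entry>;
    # append the entry if the keyword was not present at all.
    pattern = re.compile(r'^\s*#?\s*%s\s*=' % re.escape(keyword))
    with open(conf_path) as f:
        lines = f.readlines()
    hit = False
    for i, line in enumerate(lines):
        if pattern.match(line):
            lines[i] = "    %s\n" % entry
            hit = True
    if not hit:
        lines.append("%s\n" % entry)
    with open(conf_path, "w") as f:
        f.writelines(lines)

# e.g. force_conf_entry("use_lvmetad", "use_lvmetad=0", "/etc/lvm/lvm.conf")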
Example #3
        def heartbeat_on_sharedblock():
            fire = 0
            failure = 0

            while self.run_fencer(cmd.vgUuid, created_time):
                try:
                    time.sleep(cmd.interval)
                    global last_multipath_run
                    if cmd.fail_if_no_path and time.time() - last_multipath_run > 3600:
                        last_multipath_run = time.time()
                        thread.ThreadFacade.run_in_thread(
                            linux.set_fail_if_no_path)

                    health = lvm.check_vg_status(cmd.vgUuid,
                                                 cmd.storageCheckerTimeout,
                                                 check_pv=False)
                    logger.debug(
                        "sharedblock group primary storage %s fencer run result: %s"
                        % (cmd.vgUuid, health))
                    if health[0] is True:
                        fire = 0
                        failure = 0
                        continue

                    failure += 1
                    if failure < cmd.maxAttempts:
                        continue

                    if self.fencer_fire_timestamp.get(cmd.vgUuid) is not None and \
                            time.time() > self.fencer_fire_timestamp.get(cmd.vgUuid) and \
                            time.time() - self.fencer_fire_timestamp.get(cmd.vgUuid) < (300 * (fire + 1 if fire < 10 else 10)):
                        logger.warn(
                            "last fencer fire: %s, now: %s, passed: %s seconds, within %s seconds, skip fire",
                            self.fencer_fire_timestamp[cmd.vgUuid],
                            time.time(),
                            time.time() -
                            self.fencer_fire_timestamp.get(cmd.vgUuid),
                            300 * (fire + 1 if fire < 10 else 10))
                        failure = 0
                        continue

                    self.fencer_fire_timestamp[cmd.vgUuid] = time.time()
                    try:
                        logger.warn("shared block storage %s fencer fired!" %
                                    cmd.vgUuid)
                        self.report_storage_status([cmd.vgUuid],
                                                   'Disconnected', health[1])
                        fire += 1

                        if cmd.strategy == 'Permissive':
                            continue

                        # check one qcow2 per PV to determine which volumes on that PV should be killed
                        invalid_pv_uuids = lvm.get_invalid_pv_uuids(
                            cmd.vgUuid, cmd.checkIo)
                        vms = lvm.get_running_vm_root_volume_on_pv(
                            cmd.vgUuid, invalid_pv_uuids, True)
                        killed_vm_uuids = []
                        for vm in vms:
                            kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                            kill(False)
                            if kill.return_code == 0:
                                logger.warn(
                                    'kill the vm[uuid:%s, pid:%s] because we lost connection to the storage. '
                                    'failed to run health check %s times' %
                                    (vm.uuid, vm.pid, cmd.maxAttempts))
                                killed_vm_uuids.append(vm.uuid)
                            else:
                                logger.warn(
                                    'failed to kill the vm[uuid:%s, pid:%s] %s'
                                    % (vm.uuid, vm.pid, kill.stderr))

                            for volume in vm.volumes:
                                used_process = linux.linux_lsof(volume)
                                if len(used_process) == 0:
                                    try:
                                        lvm.deactive_lv(volume, False)
                                    except Exception as e:
                                        logger.debug(
                                            "deactivate volume %s for vm %s failed, %s"
                                            % (volume, vm.uuid, e.message))
                                        content = traceback.format_exc()
                                        logger.warn("traceback: %s" % content)
                                else:
                                    logger.debug(
                                        "volume %s still used: %s, skip to deactivate"
                                        % (volume, used_process))

                        if len(killed_vm_uuids) != 0:
                            self.report_self_fencer_triggered(
                                [cmd.vgUuid], ','.join(killed_vm_uuids))
                            clean_network_config(killed_vm_uuids)

                        lvm.remove_partial_lv_dm(cmd.vgUuid)

                        if lvm.check_vg_status(cmd.vgUuid,
                                               cmd.storageCheckerTimeout,
                                               True)[0] is False:
                            lvm.drop_vg_lock(cmd.vgUuid)
                            lvm.remove_device_map_for_vg(cmd.vgUuid)

                    except Exception as e:
                        logger.warn("kill vm failed, %s" % e.message)
                        content = traceback.format_exc()
                        logger.warn("traceback: %s" % content)
                    finally:
                        failure = 0

                except Exception as e:
                    logger.debug(
                        'self-fencer on sharedblock primary storage %s stopped abnormally, try again soon...'
                        % cmd.vgUuid)
                    content = traceback.format_exc()
                    logger.warn(content)

            if not self.run_fencer(cmd.vgUuid, created_time):
                logger.debug(
                    'stop self-fencer on sharedblock primary storage %s for judger failed'
                    % cmd.vgUuid)
            else:
                logger.warn(
                    'stop self-fencer on sharedblock primary storage %s' %
                    cmd.vgUuid)
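The skip check above throttles repeated fencer fires: a new fire within 300 * (fire + 1) seconds of the previous one (capped at 3000 seconds once ten fires are recorded) is skipped and only resets the failure counter. A minimal sketch of that window computation:

def fire_skip_window(fire):
    # Seconds after the last fire during which another fire is skipped;
    # grows by 300s per recorded fire and is capped once fire reaches 10.
    return 300 * (fire + 1 if fire < 10 else 10)

assert fire_skip_window(0) == 300
assert fire_skip_window(1) == 600
assert fire_skip_window(10) == 3000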
Example #4
        def heartbeat_on_sharedblock():
            failure = 0

            while self.run_sharedblock_fencer[cmd.vgUuid] is True:
                try:
                    time.sleep(cmd.interval)

                    health = lvm.check_vg_status(cmd.vgUuid,
                                                 cmd.storageCheckerTimeout,
                                                 check_pv=False)
                    logger.debug(
                        "sharedblock group primary storage %s fencer run result: %s"
                        % (cmd.vgUuid, health))
                    if health[0] is True:
                        failure = 0
                        continue

                    failure += 1
                    if failure < cmd.maxAttempts:
                        continue

                    try:
                        logger.warn("shared block storage %s fencer fired!" %
                                    cmd.vgUuid)
                        self.report_storage_status([cmd.vgUuid],
                                                   'Disconnected', health[1])

                        # check one qcow2 per PV to determine which volumes on that PV should be killed
                        invalid_pv_uuids = lvm.get_invalid_pv_uuids(
                            cmd.vgUuid, cmd.checkIo)
                        vms = lvm.get_running_vm_root_volume_on_pv(
                            cmd.vgUuid, invalid_pv_uuids, cmd.checkIo)
                        for vm in vms:
                            kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                            kill(False)
                            if kill.return_code == 0:
                                logger.warn(
                                    'kill the vm[uuid:%s, pid:%s] because we lost connection to the storage. '
                                    'failed to run health check %s times' %
                                    (vm.uuid, vm.pid, cmd.maxAttempts))

                            else:
                                logger.warn(
                                    'failed to kill the vm[uuid:%s, pid:%s] %s'
                                    % (vm.uuid, vm.pid, kill.stderr))

                            for volume in vm.volumes:
                                used_process = linux.linux_lsof(volume)
                                if len(used_process) == 0:
                                    try:
                                        lvm.deactive_lv(volume, False)
                                    except Exception as e:
                                        logger.debug(
                                            "deactivate volume %s for vm %s failed, %s"
                                            % (volume, vm.uuid, e.message))
                                        content = traceback.format_exc()
                                        logger.warn("traceback: %s" % content)
                                else:
                                    logger.debug(
                                        "volume %s still used: %s, skip to deactivate"
                                        % (volume, used_process))

                        lvm.remove_partial_lv_dm(cmd.vgUuid)

                        if lvm.check_vg_status(cmd.vgUuid,
                                               cmd.storageCheckerTimeout,
                                               True)[0] is False:
                            lvm.drop_vg_lock(cmd.vgUuid)
                            lvm.remove_device_map_for_vg(cmd.vgUuid)

                        # reset the failure count
                        failure = 0
                    except Exception as e:
                        logger.warn("kill vm failed, %s" % e.message)
                        content = traceback.format_exc()
                        logger.warn("traceback: %s" % content)

                except Exception as e:
                    logger.debug(
                        'self-fencer on sharedblock primary storage %s stopped abnormally'
                        % cmd.vgUuid)
                    content = traceback.format_exc()
                    logger.warn(content)

            logger.debug('stop self-fencer on sharedblock primary storage %s' %
                         cmd.vgUuid)
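All of the heartbeat loops share the same consecutive-failure gate: act only after cmd.maxAttempts failed health checks in a row, and reset the counter on any success. A self-contained sketch of just that gate (the names are hypothetical, and the reset that happens after a fire attempt is omitted):

def should_fence(check_results, max_attempts):
    # check_results: iterable of booleans from successive health checks.
    streak = 0
    for healthy in check_results:
        if healthy:
            streak = 0
            continue
        streak += 1
        if streak >= max_attempts:
            return True
    return False

assert should_fence([True, False, False], 3) is False
assert should_fence([False, False, False], 3) is True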
Example #5
        def heartbeat_on_sharedblock():
            failure = 0

            while self.run_fencer(cmd.vgUuid, created_time):
                try:
                    time.sleep(cmd.interval)

                    health = lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, check_pv=False)
                    logger.debug("sharedblock group primary storage %s fencer run result: %s" % (cmd.vgUuid, health))
                    if health[0] is True:
                        failure = 0
                        continue

                    failure += 1
                    if failure < cmd.maxAttempts:
                        continue

                    try:
                        logger.warn("shared block storage %s fencer fired!" % cmd.vgUuid)
                        self.report_storage_status([cmd.vgUuid], 'Disconnected', health[1])

                        # check one qcow2 per PV to determine which volumes on that PV should be killed
                        invalid_pv_uuids = lvm.get_invalid_pv_uuids(cmd.vgUuid, cmd.checkIo)
                        vms = lvm.get_running_vm_root_volume_on_pv(cmd.vgUuid, invalid_pv_uuids, True)
                        killed_vm_uuids = []
                        for vm in vms:
                            kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                            kill(False)
                            if kill.return_code == 0:
                                logger.warn(
                                    'kill the vm[uuid:%s, pid:%s] because we lost connection to the storage. '
                                    'failed to run health check %s times' % (vm.uuid, vm.pid, cmd.maxAttempts))
                                killed_vm_uuids.append(vm.uuid)
                            else:
                                logger.warn(
                                    'failed to kill the vm[uuid:%s, pid:%s] %s' % (vm.uuid, vm.pid, kill.stderr))

                            for volume in vm.volumes:
                                used_process = linux.linux_lsof(volume)
                                if len(used_process) == 0:
                                    try:
                                        lvm.deactive_lv(volume, False)
                                    except Exception as e:
                                        logger.debug("deactivate volume %s for vm %s failed, %s" % (volume, vm.uuid, e.message))
                                        content = traceback.format_exc()
                                        logger.warn("traceback: %s" % content)
                                else:
                                    logger.debug("volume %s still used: %s, skip to deactivate" % (volume, used_process))

                        if len(killed_vm_uuids) != 0:
                            self.report_self_fencer_triggered([cmd.vgUuid], ','.join(killed_vm_uuids))
                        lvm.remove_partial_lv_dm(cmd.vgUuid)

                        if lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, True)[0] is False:
                            lvm.drop_vg_lock(cmd.vgUuid)
                            lvm.remove_device_map_for_vg(cmd.vgUuid)

                        # reset the failure count
                        failure = 0
                    except Exception as e:
                        logger.warn("kill vm failed, %s" % e.message)
                        content = traceback.format_exc()
                        logger.warn("traceback: %s" % content)

                except Exception as e:
                    logger.debug('self-fencer on sharedblock primary storage %s stopped abnormally, try again soon...' % cmd.vgUuid)
                    content = traceback.format_exc()
                    logger.warn(content)

            if not self.run_fencer(cmd.vgUuid, created_time):
                logger.debug('stop self-fencer on sharedblock primary storage %s for judger failed' % cmd.vgUuid)
            else:
                logger.warn('stop self-fencer on sharedblock primary storage %s' % cmd.vgUuid)
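Before deactivating a volume, each loop asks linux.linux_lsof(volume) whether any process still holds it open and skips the deactivation if so. A hypothetical stand-in for that helper (not the zstack implementation) could shell out to lsof:

import subprocess

def pids_holding_open(path):
    # "lsof -t" prints only the PIDs; a non-zero exit code means no holders.
    try:
        out = subprocess.check_output(["lsof", "-t", path])
    except subprocess.CalledProcessError:
        return []
    except OSError:
        return []  # lsof binary not available
    return [int(pid) for pid in out.decode().split()]

# Only deactivate when nothing holds the device open, e.g.:
# if not pids_holding_open(volume): lvm.deactive_lv(volume, False)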