Example #1
    def ping(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])
        if cmd.uuid not in self.mount_path:
            self.mount_path[cmd.uuid] = cmd.mountPath

        mount_path = self.mount_path[cmd.uuid]
        # if the nfs service stops, os.path.isdir will hang
        if not linux.timeout_isdir(mount_path) or not linux.is_mounted(
                path=mount_path):
            raise Exception(
                'the mount path[%s] of the nfs primary storage[uuid:%s] does not exist'
                % (mount_path, cmd.uuid))

        test_file = os.path.join(mount_path,
                                 '%s-ping-test-file' % uuidhelper.uuid())
        touch = shell.ShellCmd('timeout 60 touch %s' % test_file)
        touch(False)
        if touch.return_code == 124:
            raise Exception(
                'unable to access the mount path[%s] of the nfs primary storage[uuid:%s] in 60s, timeout'
                % (mount_path, cmd.uuid))
        elif touch.return_code != 0:
            touch.raise_error()

        linux.rm_file_force(test_file)
        return jsonobject.dumps(NfsResponse())
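The pattern to note here is the bounded write probe: coreutils timeout kills a touch that blocks on a dead NFS server and exits with code 124, which lets the caller distinguish a hung mount from an ordinary write failure. A minimal standalone sketch of the same probe, assuming a Linux host with coreutils installed (the function name and the 60s limit are illustrative, not taken from the agent):

import os
import subprocess
import uuid

def probe_mount(mount_path, timeout_secs=60):
    """Return normally if a file can be created on mount_path within
    timeout_secs; coreutils `timeout` exits with 124 when it kills touch."""
    test_file = os.path.join(mount_path, '%s-ping-test-file' % uuid.uuid4())
    rc = subprocess.call(['timeout', str(timeout_secs), 'touch', test_file])
    if rc == 124:
        raise RuntimeError('mount path[%s] did not respond within %ss' % (mount_path, timeout_secs))
    if rc != 0:
        raise RuntimeError('touch %s failed with exit code %s' % (test_file, rc))
    os.remove(test_file)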
Example #2
    def connect(self, req):
        none_shared_mount_fs_type = [
            'xfs', 'ext2', 'ext3', 'ext4', 'vfat', 'tmpfs', 'btrfs'
        ]
        cmd = jsonobject.loads(req[http.REQUEST_BODY])
        if not linux.timeout_isdir(cmd.mountPoint):
            raise kvmagent.KvmError(
                '%s is not a directory, the mount point seems not setup' %
                cmd.mountPoint)

        folder_fs_type = shell.call("df -T %s|tail -1|awk '{print $2}'" %
                                    cmd.mountPoint).strip()
        if folder_fs_type in none_shared_mount_fs_type:
            raise kvmagent.KvmError(
                '%s filesystem is %s, which is not a shared mount point type.'
                % (cmd.mountPoint, folder_fs_type))

        id_dir = os.path.join(cmd.mountPoint, "zstack_smp_id_file")
        shell.call("mkdir -p %s" % id_dir)
        lock_file = os.path.join(id_dir, "uuid.lock")

        @lock.file_lock(lock_file, locker=lock.Flock())
        def check_other_smp_and_set_id_file(uuid, existUuids):
            o = shell.ShellCmd('''\
            ls %s | grep -v %s | grep -o "[0-9a-f]\{8\}[0-9a-f]\{4\}[1-5][0-9a-f]\{3\}[89ab][0-9a-f]\{3\}[0-9a-f]\{12\}"\
            ''' % (id_dir, uuid))
            o(False)
            if o.return_code != 0:
                file_uuids = []
            else:
                file_uuids = o.stdout.splitlines()

            for file_uuid in file_uuids:
                if file_uuid in existUuids:
                    raise Exception(
                        "the mount point [%s] has been occupied by other SMP[uuid:%s], Please attach this directly"
                        % (cmd.mountPoint, file_uuid))

            logger.debug("existing id files: %s" % file_uuids)
            self.id_files[uuid] = os.path.join(id_dir, uuid)

            if not os.path.exists(self.id_files[uuid]):
                # check whether hosts in the same cluster mount the same path for different storages.
                rsp.isFirst = True
                for file_uuid in file_uuids:
                    linux.rm_file_force(os.path.join(id_dir, file_uuid))
                linux.touch_file(self.id_files[uuid])
                linux.sync()

        rsp = ConnectRsp()
        check_other_smp_and_set_id_file(cmd.uuid, cmd.existUuids)

        rsp.totalCapacity, rsp.availableCapacity = self._get_disk_capacity(
            cmd.mountPoint)
        return jsonobject.dumps(rsp)
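lock.file_lock(lock_file, locker=lock.Flock()) serializes the id-file check across concurrent callers, but its implementation is not part of the excerpt. On Linux it can be approximated with an advisory flock(2) lock; whether flock propagates between hosts on a shared mount depends on the filesystem and NFS protocol version. A hypothetical sketch (the decorator name and semantics are assumptions, not ZStack's actual code):

import fcntl
from functools import wraps

def file_lock(lock_path):
    """Sketch of a flock-based decorator: the wrapped function runs while an
    exclusive advisory lock is held on lock_path, so concurrent callers
    serialize their id-file checks."""
    def decorate(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            with open(lock_path, 'a') as f:
                fcntl.flock(f, fcntl.LOCK_EX)  # blocks until the lock is free
                try:
                    return func(*args, **kwargs)
                finally:
                    fcntl.flock(f, fcntl.LOCK_UN)
        return wrapper
    return decorate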
Example #3
    def setup_self_fencer(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])

        self.run_filesystem_fencer = True

        @thread.AsyncThread
        def heartbeat_file_fencer(mount_path, ps_uuid):
            heartbeat_file_path = os.path.join(
                mount_path, 'heartbeat-file-kvm-host-%s.hb' % cmd.hostUuid)
            try:
                failure = 0

                while self.run_filesystem_fencer:
                    time.sleep(cmd.interval)

                    touch = shell.ShellCmd(
                        'timeout %s touch %s; exit $?' %
                        (cmd.storageCheckerTimeout, heartbeat_file_path))
                    touch(False)
                    if touch.return_code == 0:
                        failure = 0
                        continue

                    logger.warn(
                        'unable to touch %s, %s %s' %
                        (heartbeat_file_path, touch.stderr, touch.stdout))
                    failure += 1

                    if failure == cmd.maxAttempts:
                        logger.warn(
                            'failed to touch the heartbeat file[%s] %s times, we lost the connection to the storage, '
                            'shutdown ourselves' %
                            (heartbeat_file_path, cmd.maxAttempts))
                        self.report_storage_status([ps_uuid], 'Disconnected')
                        kill_vm(cmd.maxAttempts, [mount_path], True)
                        break

                logger.debug('stop heartbeat[%s] for filesystem self-fencer' %
                             heartbeat_file_path)
            except:
                content = traceback.format_exc()
                logger.warn(content)

        for mount_point, uuid in zip(cmd.mountPoints, cmd.uuids):
            if not linux.timeout_isdir(mount_point):
                raise Exception('the mount point[%s] is not a directory' %
                                mount_point)

            heartbeat_file_fencer(mount_point, uuid)

        return jsonobject.dumps(AgentRsp())
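@thread.AsyncThread appears in every fencer example: the decorated function runs in a background thread so setup_self_fencer can return immediately while the heartbeat loop keeps running. A decorator with that observable behavior can be sketched with the standard threading module (this is an assumed equivalent, not the zstacklib source):

import threading
from functools import wraps

def async_thread(func):
    """Run the decorated function in a daemon thread; return the thread
    object immediately instead of blocking the caller."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        t = threading.Thread(target=func, args=args, kwargs=kwargs)
        t.daemon = True  # do not keep the agent process alive on shutdown
        t.start()
        return t
    return wrapper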
Example #4
    def connect(self, req):
        none_shared_mount_fs_type = ['xfs', 'ext2', 'ext3', 'ext4', 'vfat', 'tmpfs', 'btrfs']
        cmd = jsonobject.loads(req[http.REQUEST_BODY])
        if not linux.timeout_isdir(cmd.mountPoint):
            raise kvmagent.KvmError('%s is not a directory, the mount point seems not setup' % cmd.mountPoint)

        folder_fs_type = shell.call("df -T %s|tail -1|awk '{print $2}'" % cmd.mountPoint).strip()
        if folder_fs_type in none_shared_mount_fs_type:
            raise kvmagent.KvmError(
                '%s filesystem is %s, which is not a shared mount point type.' % (cmd.mountPoint, folder_fs_type))

        id_dir = os.path.join(cmd.mountPoint, "zstack_smp_id_file")
        shell.call("mkdir -p %s" % id_dir)
        lock_file = os.path.join(id_dir, "uuid.lock")

        @lock.file_lock(lock_file, locker=lock.Flock())
        def check_other_smp_and_set_id_file(uuid, existUuids):
            o = shell.ShellCmd('''\
            ls %s | grep -v %s | grep -o "[0-9a-f]\{8\}[0-9a-f]\{4\}[1-5][0-9a-f]\{3\}[89ab][0-9a-f]\{3\}[0-9a-f]\{12\}"\
            ''' % (id_dir, uuid))
            o(False)
            if o.return_code != 0:
                file_uuids = []
            else:
                file_uuids = o.stdout.splitlines()

            for file_uuid in file_uuids:
                if file_uuid in existUuids:
                    raise Exception(
                        "the mount point [%s] has been occupied by other SMP[uuid:%s], Please attach this directly"
                        % (cmd.mountPoint, file_uuid))

            logger.debug("existing id files: %s" % file_uuids)
            self.id_files[uuid] = os.path.join(id_dir, uuid)

            if not os.path.exists(self.id_files[uuid]):
                # check whether hosts in the same cluster mount the same path for different storages.
                rsp.isFirst = True
                for file_uuid in file_uuids:
                    linux.rm_file_force(os.path.join(id_dir, file_uuid))
                linux.touch_file(self.id_files[uuid])
                linux.sync()

        rsp = ConnectRsp()
        check_other_smp_and_set_id_file(cmd.uuid, cmd.existUuids)

        rsp.totalCapacity, rsp.availableCapacity = self._get_disk_capacity(cmd.mountPoint)
        return jsonobject.dumps(rsp)
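self._get_disk_capacity is called but not shown in the excerpt. On POSIX systems the two numbers it returns can be obtained without shelling out, e.g. via os.statvfs (a sketch under that assumption; the actual helper may parse df output instead):

import os

def get_disk_capacity(mount_point):
    """Return (total_bytes, available_bytes) for the filesystem at mount_point."""
    st = os.statvfs(mount_point)
    total = st.f_frsize * st.f_blocks      # size of the whole filesystem
    available = st.f_frsize * st.f_bavail  # space available to unprivileged users
    return total, available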
Example #5
    def ping(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])
        mount_path = self.mount_path[cmd.uuid]
        # if the nfs service stops, os.path.isdir will hang
        if not linux.timeout_isdir(mount_path) or not linux.is_mounted(path=mount_path):
            raise Exception('the mount path[%s] of the nfs primary storage[uuid:%s] does not exist' % (mount_path, cmd.uuid))

        test_file = os.path.join(mount_path, '%s-ping-test-file' % uuidhelper.uuid())
        touch = shell.ShellCmd('timeout 60 touch %s' % test_file)
        touch(False)
        if touch.return_code == 124:
            raise Exception('unable to access the mount path[%s] of the nfs primary storage[uuid:%s] in 60s, timeout' %
                            (mount_path, cmd.uuid))
        elif touch.return_code != 0:
            touch.raise_error()

        linux.rm_file_force(test_file)
        return jsonobject.dumps(NfsResponse())
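linux.is_mounted(path=...) guards the probe above but is also outside the excerpt. One portable way to answer the same question on Linux is to scan /proc/mounts for an exact mount-point match (a sketch; details of the real helper, such as symlink resolution, are unknown):

def is_mounted(path):
    """Return True if path is an active mount point according to /proc/mounts.
    Note: /proc/mounts octal-escapes spaces in paths (e.g. '\\040')."""
    path = path.rstrip('/')
    with open('/proc/mounts') as f:
        for line in f:
            # each line: <device> <mount point> <fstype> <options> <dump> <pass>
            if line.split()[1] == path:
                return True
    return False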
Example #6
    def setup_self_fencer(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])

        @thread.AsyncThread
        def heartbeat_file_fencer(mount_path, ps_uuid, mounted_by_zstack):
            def try_remount_fs():
                if mount_path_is_nfs(mount_path):
                    shell.run("systemctl start nfs-client.target")

                while self.run_filesystem_fencer(ps_uuid, created_time):
                    if linux.is_mounted(
                            path=mount_path) and touch_heartbeat_file():
                        self.report_storage_status([ps_uuid], 'Connected')
                        logger.debug(
                            "fs[uuid:%s] is reachable again, report to management"
                            % ps_uuid)
                        break
                    try:
                        logger.debug(
                            'fs[uuid:%s] is unreachable, it will be remounted after 180s'
                            % ps_uuid)
                        time.sleep(180)
                        if not self.run_filesystem_fencer(
                                ps_uuid, created_time):
                            break
                        linux.remount(url, mount_path, options)
                        self.report_storage_status([ps_uuid], 'Connected')
                        logger.debug(
                            "remount fs[uuid:%s] success, report to management"
                            % ps_uuid)
                        break
                    except:
                        logger.warn(
                            'remount fs[uuid:%s] fail, try again soon' %
                            ps_uuid)
                        kill_progresses_using_mount_path(mount_path)

                logger.debug('stop remount fs[uuid:%s]' % ps_uuid)

            def after_kill_vm():
                if not killed_vm_pids or not mounted_by_zstack:
                    return

                try:
                    kill_and_umount(mount_path, mount_path_is_nfs(mount_path))
                except UmountException:
                    if shell.run('ps -p %s' % ' '.join(killed_vm_pids)) == 0:
                        virsh_list = shell.call(
                            "timeout 10 virsh list --all || echo 'cannot obtain virsh list'"
                        )
                        logger.debug("virsh_list:\n" + virsh_list)
                        logger.error(
                            'kill vm[pids:%s] failed because of unavailable fs[mountPath:%s].'
                            ' please retry "umount -f %s"' %
                            (killed_vm_pids, mount_path, mount_path))
                        return

                try_remount_fs()

            def touch_heartbeat_file():
                touch = shell.ShellCmd(
                    'timeout %s touch %s' %
                    (cmd.storageCheckerTimeout, heartbeat_file_path))
                touch(False)
                if touch.return_code != 0:
                    logger.warn(
                        'unable to touch %s, %s %s' %
                        (heartbeat_file_path, touch.stderr, touch.stdout))
                return touch.return_code == 0

            heartbeat_file_path = os.path.join(
                mount_path, 'heartbeat-file-kvm-host-%s.hb' % cmd.hostUuid)
            created_time = time.time()
            with self.fencer_lock:
                self.run_filesystem_fencer_timestamp[ps_uuid] = created_time
            try:
                failure = 0
                url = shell.call("mount | grep -e '%s' | awk '{print $1}'" %
                                 mount_path).strip()
                options = shell.call(
                    "mount | grep -e '%s' | awk -F '[()]' '{print $2}'" %
                    mount_path).strip()

                while self.run_filesystem_fencer(ps_uuid, created_time):
                    time.sleep(cmd.interval)
                    if touch_heartbeat_file():
                        failure = 0
                        continue

                    failure += 1
                    if failure == cmd.maxAttempts:
                        logger.warn(
                            'failed to touch the heartbeat file[%s] %s times, we lost the connection to the storage, '
                            'shutdown ourselves' %
                            (heartbeat_file_path, cmd.maxAttempts))
                        self.report_storage_status([ps_uuid], 'Disconnected')
                        killed_vm_pids = kill_vm(cmd.maxAttempts, [mount_path],
                                                 True)
                        after_kill_vm()

                logger.debug('stop heartbeat[%s] for filesystem self-fencer' %
                             heartbeat_file_path)

            except:
                content = traceback.format_exc()
                logger.warn(content)

        for mount_path, uuid, mounted_by_zstack in zip(cmd.mountPaths,
                                                       cmd.uuids,
                                                       cmd.mountedByZStack):
            if not linux.timeout_isdir(mount_path):
                raise Exception('the mount path[%s] is not a directory' %
                                mount_path)

            heartbeat_file_fencer(mount_path, uuid, mounted_by_zstack)

        return jsonobject.dumps(AgentRsp())
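Unlike Example #3, which stops every loop through one shared boolean, this version cancels per storage: each loop remembers the timestamp it was started with, a later setup for the same ps_uuid overwrites the stored timestamp under fencer_lock, and the stale loop notices the mismatch and exits. A minimal sketch of that mechanism (method names follow Examples #8 and #9, where this example calls the check run_filesystem_fencer; the surrounding class is assumed):

import threading

class FencerRegistry(object):
    """Sketch of timestamp-based fencer cancellation."""
    def __init__(self):
        self.fencer_lock = threading.Lock()
        self.run_filesystem_fencer_timestamp = {}

    def setup_fencer(self, ps_uuid, created_time):
        # registering a newer timestamp implicitly cancels any older loop
        with self.fencer_lock:
            self.run_filesystem_fencer_timestamp[ps_uuid] = created_time

    def run_fencer(self, ps_uuid, created_time):
        # a loop keeps running only while it still owns the latest timestamp
        with self.fencer_lock:
            return self.run_filesystem_fencer_timestamp.get(ps_uuid) == created_time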
Example #7
    def setup_self_fencer(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])

        self.run_filesystem_fencer = True

        @thread.AsyncThread
        def heartbeat_file_fencer(heartbeat_file_path, ps_uuid):
            try:
                failure = 0

                while self.run_filesystem_fencer:
                    time.sleep(cmd.interval)

                    touch = shell.ShellCmd(
                        'timeout %s touch %s; exit $?' %
                        (cmd.storageCheckerTimeout, heartbeat_file_path))
                    touch(False)
                    if touch.return_code == 0:
                        failure = 0
                        continue

                    logger.warn(
                        'unable to touch %s, %s %s' %
                        (heartbeat_file_path, touch.stderr, touch.stdout))
                    failure += 1

                    if failure == cmd.maxAttempts:
                        logger.warn(
                            'failed to touch the heartbeat file[%s] %s times, we lost the connection to the storage, '
                            'shutdown ourselves' %
                            (heartbeat_file_path, cmd.maxAttempts))
                        mountPath = (os.path.split(heartbeat_file_path))[0]
                        self.report_storage_status([ps_uuid], 'Disconnected')
                        kill_vm(cmd.maxAttempts, mountPath, True)

                logger.debug('stop heartbeat[%s] for filesystem self-fencer' %
                             heartbeat_file_path)
            except:
                content = traceback.format_exc()
                logger.warn(content)

        gateway = cmd.storageGateway
        if not gateway:
            gateway = linux.get_gateway_by_default_route()

        @thread.AsyncThread
        def storage_gateway_fencer(gw):
            failure = 0

            try:
                while self.run_filesystem_fencer:
                    time.sleep(cmd.interval)

                    ping = shell.ShellCmd(
                        "nmap -sP -PI %s | grep 'Host is up'" % gw)
                    ping(False)
                    if ping.return_code == 0:
                        failure = 0
                        continue

                    logger.warn(
                        'unable to ping the storage gateway[%s], %s %s' %
                        (gw, ping.stderr, ping.stdout))
                    failure += 1

                    if failure == cmd.maxAttempts:
                        logger.warn(
                            'failed to ping storage gateway[%s] %s times, we lost connection to the storage, '
                            'shutdown ourselves' % (gw, cmd.maxAttempts))
                        self.report_storage_status(cmd.psUuids, 'Disconnected')
                        kill_vm(cmd.maxAttempts)

                logger.debug(
                    'stop gateway[%s] fencer for filesystem self-fencer' % gw)
            except:
                content = traceback.format_exc()
                logger.warn(content)

        for mount_point, uuid in zip(cmd.mountPoints, cmd.uuids):
            if not linux.timeout_isdir(mount_point):
                raise Exception('the mount point[%s] is not a directory' %
                                mount_point)

            hb_file = os.path.join(
                mount_point, 'heartbeat-file-kvm-host-%s.hb' % cmd.hostUuid)
            heartbeat_file_fencer(hb_file, uuid)

        if gateway:
            storage_gateway_fencer(gateway)
        else:
            logger.warn(
                'cannot find storage gateway, unable to setup storage gateway fencer'
            )

        return jsonobject.dumps(AgentRsp())
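nmap -sP -PI is the legacy spelling of an ICMP ping scan (newer nmap releases spell it -sn -PE), and it pulls in an external tool just to ask whether the gateway answers. A dependency-free sketch using iputils ping (the timeout value is illustrative):

import os
import subprocess

def gateway_is_up(gw, timeout_secs=5):
    """Return True if the gateway answers one ICMP echo within timeout_secs."""
    with open(os.devnull, 'w') as devnull:
        rc = subprocess.call(['ping', '-c', '1', '-W', str(timeout_secs), gw],
                             stdout=devnull, stderr=devnull)
    return rc == 0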
Example #8
    def setup_self_fencer(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])

        @thread.AsyncThread
        def heartbeat_file_fencer(mount_path, ps_uuid, mounted_by_zstack, url,
                                  options):
            def try_remount_fs():
                if mount_path_is_nfs(mount_path):
                    shell.run("systemctl start nfs-client.target")

                while self.run_fencer(ps_uuid, created_time):
                    if linux.is_mounted(
                            path=mount_path) and touch_heartbeat_file():
                        self.report_storage_status([ps_uuid], 'Connected')
                        logger.debug(
                            "fs[uuid:%s] is reachable again, report to management"
                            % ps_uuid)
                        break
                    try:
                        logger.debug(
                            'fs[uuid:%s] is unreachable, it will be remounted after 180s'
                            % ps_uuid)
                        time.sleep(180)
                        if not self.run_fencer(ps_uuid, created_time):
                            break
                        linux.remount(url, mount_path, options)
                        self.report_storage_status([ps_uuid], 'Connected')
                        logger.debug(
                            "remount fs[uuid:%s] success, report to management"
                            % ps_uuid)
                        break
                    except:
                        logger.warn(
                            'remount fs[uuid:%s] fail, try again soon' %
                            ps_uuid)
                        kill_progresses_using_mount_path(mount_path)

                logger.debug('stop remount fs[uuid:%s]' % ps_uuid)

            def after_kill_vm():
                if not killed_vm_pids or not mounted_by_zstack:
                    return

                try:
                    kill_and_umount(mount_path, mount_path_is_nfs(mount_path))
                except UmountException:
                    if shell.run('ps -p %s' % ' '.join(killed_vm_pids)) == 0:
                        virsh_list = shell.call(
                            "timeout 10 virsh list --all || echo 'cannot obtain virsh list'"
                        )
                        logger.debug("virsh_list:\n" + virsh_list)
                        logger.error(
                            'kill vm[pids:%s] failed because of unavailable fs[mountPath:%s].'
                            ' please retry "umount -f %s"' %
                            (killed_vm_pids, mount_path, mount_path))
                        return

            def touch_heartbeat_file():
                touch = shell.ShellCmd(
                    'timeout %s touch %s' %
                    (cmd.storageCheckerTimeout, heartbeat_file_path))
                touch(False)
                if touch.return_code != 0:
                    logger.warn(
                        'unable to touch %s, %s %s' %
                        (heartbeat_file_path, touch.stderr, touch.stdout))
                return touch.return_code == 0

            def prepare_heartbeat_dir():
                heartbeat_dir = os.path.join(mount_path, "zs-heartbeat")
                if not mounted_by_zstack or linux.is_mounted(mount_path):
                    if not os.path.exists(heartbeat_dir):
                        os.makedirs(heartbeat_dir, 0o755)
                else:
                    if os.path.exists(heartbeat_dir):
                        linux.rm_dir_force(heartbeat_dir)
                return heartbeat_dir

            heartbeat_file_dir = prepare_heartbeat_dir()
            heartbeat_file_path = os.path.join(
                heartbeat_file_dir,
                'heartbeat-file-kvm-host-%s.hb' % cmd.hostUuid)
            created_time = time.time()
            self.setup_fencer(ps_uuid, created_time)
            try:
                failure = 0
                while self.run_fencer(ps_uuid, created_time):
                    time.sleep(cmd.interval)
                    if touch_heartbeat_file():
                        failure = 0
                        continue

                    failure += 1
                    if failure == cmd.maxAttempts:
                        logger.warn(
                            'failed to touch the heartbeat file[%s] %s times, we lost the connection to the storage, '
                            'shutdown ourselves' %
                            (heartbeat_file_path, cmd.maxAttempts))
                        self.report_storage_status([ps_uuid], 'Disconnected')

                        if cmd.strategy == 'Permissive':
                            continue

                        killed_vms = kill_vm(cmd.maxAttempts, [mount_path],
                                             True)

                        if len(killed_vms) != 0:
                            self.report_self_fencer_triggered(
                                [ps_uuid], ','.join(killed_vms.keys()))
                            clean_network_config(killed_vms.keys())

                        killed_vm_pids = killed_vms.values()
                        after_kill_vm()

                        if mounted_by_zstack and not linux.is_mounted(
                                mount_path):
                            try_remount_fs()
                            prepare_heartbeat_dir()

                logger.debug('stop heartbeat[%s] for filesystem self-fencer' %
                             heartbeat_file_path)

            except:
                content = traceback.format_exc()
                logger.warn(content)

        for mount_path, uuid, mounted_by_zstack, url, options in zip(
                cmd.mountPaths, cmd.uuids, cmd.mountedByZStack, cmd.urls,
                cmd.mountOptions):
            if not linux.timeout_isdir(mount_path):
                raise Exception('the mount path[%s] is not a directory' %
                                mount_path)

            heartbeat_file_fencer(mount_path, uuid, mounted_by_zstack, url,
                                  options)

        return jsonobject.dumps(AgentRsp())
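kill_and_umount and kill_progresses_using_mount_path are referenced throughout but not included. Before a dead mount can be forced off, every process holding files under it must be killed, which is what fuser -k -m does on Linux; the unmount itself typically needs -f (force) and -l (lazy detach). A hypothetical sketch of both helpers (names and flags are assumptions, not the agent's actual code):

import subprocess

def kill_processes_using_mount_path(mount_path):
    """Kill every process with an open file under mount_path.
    -k kills (SIGKILL by default), -m names a mounted filesystem."""
    subprocess.call(['fuser', '-k', '-m', mount_path])

def force_umount(mount_path):
    """Force a lazy unmount; raise on failure so callers can retry,
    mirroring the UmountException handling above."""
    rc = subprocess.call(['umount', '-f', '-l', mount_path])
    if rc != 0:
        raise RuntimeError('umount -f -l %s failed with exit code %s' % (mount_path, rc))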
Example #9
    def setup_self_fencer(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])

        @thread.AsyncThread
        def heartbeat_file_fencer(mount_path, ps_uuid, mounted_by_zstack):
            def try_remount_fs():
                if mount_path_is_nfs(mount_path):
                    shell.run("systemctl start nfs-client.target")

                while self.run_fencer(ps_uuid, created_time):
                    if linux.is_mounted(path=mount_path) and touch_heartbeat_file():
                        self.report_storage_status([ps_uuid], 'Connected')
                        logger.debug("fs[uuid:%s] is reachable again, report to management" % ps_uuid)
                        break
                    try:
                        logger.debug('fs[uuid:%s] is unreachable, it will be remounted after 180s' % ps_uuid)
                        time.sleep(180)
                        if not self.run_fencer(ps_uuid, created_time):
                            break
                        linux.remount(url, mount_path, options)
                        self.report_storage_status([ps_uuid], 'Connected')
                        logger.debug("remount fs[uuid:%s] success, report to management" % ps_uuid)
                        break
                    except:
                        logger.warn('remount fs[uuid:%s] fail, try again soon' % ps_uuid)
                        kill_progresses_using_mount_path(mount_path)

                logger.debug('stop remount fs[uuid:%s]' % ps_uuid)

            def after_kill_vm():
                if not killed_vm_pids or not mounted_by_zstack:
                    return

                try:
                    kill_and_umount(mount_path, mount_path_is_nfs(mount_path))
                except UmountException:
                    if shell.run('ps -p %s' % ' '.join(killed_vm_pids)) == 0:
                        virsh_list = shell.call("timeout 10 virsh list --all || echo 'cannot obtain virsh list'")
                        logger.debug("virsh_list:\n" + virsh_list)
                        logger.error('kill vm[pids:%s] failed because of unavailable fs[mountPath:%s].'
                                     ' please retry "umount -f %s"' % (killed_vm_pids, mount_path, mount_path))
                        return

                try_remount_fs()

            def touch_heartbeat_file():
                touch = shell.ShellCmd('timeout %s touch %s' % (cmd.storageCheckerTimeout, heartbeat_file_path))
                touch(False)
                if touch.return_code != 0:
                    logger.warn('unable to touch %s, %s %s' % (heartbeat_file_path, touch.stderr, touch.stdout))
                return touch.return_code == 0

            heartbeat_file_path = os.path.join(mount_path, 'heartbeat-file-kvm-host-%s.hb' % cmd.hostUuid)
            created_time = time.time()
            self.setup_fencer(ps_uuid, created_time)
            try:
                failure = 0
                url = shell.call("mount | grep -e '%s' | awk '{print $1}'" % mount_path).strip()
                options = shell.call("mount | grep -e '%s' | awk -F '[()]' '{print $2}'" % mount_path).strip()

                while self.run_fencer(ps_uuid, created_time):
                    time.sleep(cmd.interval)
                    if touch_heartbeat_file():
                        failure = 0
                        continue

                    failure += 1
                    if failure == cmd.maxAttempts:
                        logger.warn('failed to touch the heartbeat file[%s] %s times, we lost the connection to the storage, '
                                    'shutdown ourselves' % (heartbeat_file_path, cmd.maxAttempts))
                        self.report_storage_status([ps_uuid], 'Disconnected')
                        killed_vms = kill_vm(cmd.maxAttempts, [mount_path], True)

                        if len(killed_vms) != 0:
                            self.report_self_fencer_triggered([ps_uuid], ','.join(killed_vms.keys()))
                        killed_vm_pids = killed_vms.values()
                        after_kill_vm()

                logger.debug('stop heartbeat[%s] for filesystem self-fencer' % heartbeat_file_path)

            except:
                content = traceback.format_exc()
                logger.warn(content)

        for mount_path, uuid, mounted_by_zstack in zip(cmd.mountPaths, cmd.uuids, cmd.mountedByZStack):
            if not linux.timeout_isdir(mount_path):
                raise Exception('the mount path[%s] is not a directory' % mount_path)

            heartbeat_file_fencer(mount_path, uuid, mounted_by_zstack)

        return jsonobject.dumps(AgentRsp())
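The mount | grep -e '%s' | awk pair above depends on the "device on path type fstype (options)" layout of mount's output, and a substring grep can match the wrong entry when one mount path is a prefix of another. An exact-match alternative reads /proc/mounts directly (a sketch, not what the agent ships):

def mount_source_and_options(mount_path):
    """Return (device_or_url, options) for the exact mount point, or
    (None, None) if it is not mounted; options come without parentheses."""
    mount_path = mount_path.rstrip('/')
    with open('/proc/mounts') as f:
        for line in f:
            fields = line.split()
            if fields[1] == mount_path:
                return fields[0], fields[3]
    return None, None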
Example #10
    def setup_self_fencer(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])

        self.run_filesystem_fencer = True

        @thread.AsyncThread
        def heartbeat_file_fencer(mount_path, ps_uuid):
            heartbeat_file_path = os.path.join(
                mount_path, 'heartbeat-file-kvm-host-%s.hb' % cmd.hostUuid)
            try:
                failure = 0

                while self.run_filesystem_fencer:
                    time.sleep(cmd.interval)

                    touch = shell.ShellCmd(
                        'timeout %s touch %s; exit $?' %
                        (cmd.storageCheckerTimeout, heartbeat_file_path))
                    touch(False)
                    if touch.return_code == 0:
                        failure = 0
                        continue

                    logger.warn(
                        'unable to touch %s, %s %s' %
                        (heartbeat_file_path, touch.stderr, touch.stdout))
                    failure += 1

                    if failure == cmd.maxAttempts:
                        logger.warn(
                            'failed to touch the heartbeat file[%s] %s times, we lost the connection to the storage, '
                            'shutdown ourselves' %
                            (heartbeat_file_path, cmd.maxAttempts))
                        self.report_storage_status([ps_uuid], 'Disconnected')
                        kill_vm(cmd.maxAttempts, [mount_path], True)
                        break

                logger.debug('stop heartbeat[%s] for filesystem self-fencer' %
                             heartbeat_file_path)
            except:
                content = traceback.format_exc()
                logger.warn(content)

        gateway = cmd.storageGateway
        if not gateway:
            gateway = linux.get_gateway_by_default_route()

        @thread.AsyncThread
        def storage_gateway_fencer(gw):
            failure = 0

            try:
                while self.run_filesystem_fencer:
                    time.sleep(cmd.interval)

                    ping = shell.ShellCmd(
                        "nmap -sP -PI %s | grep 'Host is up'" % gw)
                    ping(False)
                    if ping.return_code == 0:
                        failure = 0
                        continue

                    logger.warn(
                        'unable to ping the storage gateway[%s], %s %s' %
                        (gw, ping.stderr, ping.stdout))
                    failure += 1

                    if failure == cmd.maxAttempts:
                        logger.warn(
                            'failed to ping storage gateway[%s] %s times, we lost connection to the storage, '
                            'shutdown ourselves' % (gw, cmd.maxAttempts))
                        # There is a small race window here because the report action is async:
                        # VM HA can be slowed down if the scheduler picks this broken host,
                        # since the ps-report may arrive later than the vm-report.
                        # A sure fix is to make the report a sync action with try-except,
                        # but if the HTTP post had to retry too many times, killing the VMs
                        # would come too late. So, for HA efficiency, it stays async.
                        self.report_storage_status(cmd.psUuids, 'Disconnected')
                        kill_vm(cmd.maxAttempts, cmd.mountPoints, True)

                logger.debug(
                    'stop gateway[%s] fencer for filesystem self-fencer' % gw)
            except:
                content = traceback.format_exc()
                logger.warn(content)

        for mount_point, uuid in zip(cmd.mountPoints, cmd.uuids):
            if not linux.timeout_isdir(mount_point):
                raise Exception('the mount point[%s] is not a directory' %
                                mount_point)

            heartbeat_file_fencer(mount_point, uuid)

        if gateway:
            storage_gateway_fencer(gateway)
        else:
            logger.warn(
                'cannot find storage gateway, unable to setup storage gateway fencer'
            )

        return jsonobject.dumps(AgentRsp())
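The comment in storage_gateway_fencer weighs the async status report against a synchronous one with retries. For completeness, a hypothetical sketch of the synchronous variant it describes, with bounded retries so fencing is never delayed indefinitely (the callback URL and payload shape are assumptions; neither is defined in the excerpt):

import json
try:
    import urllib.request as urlrequest  # Python 3
except ImportError:
    import urllib2 as urlrequest         # Python 2, as in the agent

def report_storage_status_sync(callback_url, ps_uuids, status,
                               retries=3, timeout_secs=5):
    """Hypothetical synchronous report: block until the management node has
    received the status, but bound the retries so fencing is not delayed
    indefinitely."""
    body = json.dumps({'psUuids': ps_uuids, 'status': status}).encode('utf-8')
    for _ in range(retries):
        try:
            req = urlrequest.Request(callback_url, body,
                                     {'Content-Type': 'application/json'})
            urlrequest.urlopen(req, timeout=timeout_secs)
            return True
        except Exception:
            pass  # retry; after `retries` failures give up and fence anyway
    return False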