def connect(self, req):
    cmd = jsonobject.loads(req[http.REQUEST_BODY])
    rsp = ConnectRsp()
    diskPaths = set()

    def config_lvm(host_id, enableLvmetad=False):
        lvm.backup_lvm_config()
        lvm.reset_lvm_conf_default()
        lvm.config_lvm_by_sed("use_lvmlockd", "use_lvmlockd=1", ["lvm.conf", "lvmlocal.conf"])
        if enableLvmetad:
            lvm.config_lvm_by_sed("use_lvmetad", "use_lvmetad=1", ["lvm.conf", "lvmlocal.conf"])
        else:
            lvm.config_lvm_by_sed("use_lvmetad", "use_lvmetad=0", ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("host_id", "host_id=%s" % host_id, ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("sanlock_lv_extend", "sanlock_lv_extend=%s" % DEFAULT_SANLOCK_LV_SIZE, ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("lvmlockd_lock_retries", "lvmlockd_lock_retries=6", ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("issue_discards", "issue_discards=1", ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("reserved_stack", "reserved_stack=256", ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("reserved_memory", "reserved_memory=131072", ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_filter(["lvm.conf", "lvmlocal.conf"])

        lvm.config_sanlock_by_sed("sh_retries", "sh_retries=20")
        lvm.config_sanlock_by_sed("logfile_priority", "logfile_priority=7")
        lvm.config_sanlock_by_sed("renewal_read_extend_sec", "renewal_read_extend_sec=24")
        lvm.config_sanlock_by_sed("debug_renew", "debug_renew=1")
        lvm.config_sanlock_by_sed("use_watchdog", "use_watchdog=0")
        sanlock_hostname = "%s-%s-%s" % (cmd.vgUuid[:8], cmd.hostUuid[:8], bash.bash_o("hostname").strip()[:20])
        lvm.config_sanlock_by_sed("our_host_name", "our_host_name=%s" % sanlock_hostname)

    config_lvm(cmd.hostId, cmd.enableLvmetad)
    for diskUuid in cmd.sharedBlockUuids:
        disk = CheckDisk(diskUuid)
        diskPaths.add(disk.get_path())
    lvm.start_lvmlockd()
    lvm.check_gl_lock()
    logger.debug("find/create vg %s lock..." % cmd.vgUuid)
    rsp.isFirst = self.create_vg_if_not_found(cmd.vgUuid, diskPaths, cmd.hostUuid, cmd.forceWipe)
    lvm.check_stuck_vglk()
    logger.debug("starting vg %s lock..." % cmd.vgUuid)
    lvm.start_vg_lock(cmd.vgUuid)

    if lvm.lvm_vgck(cmd.vgUuid, 60)[0] is False and lvm.lvm_check_operation(cmd.vgUuid) is False:
        lvm.drop_vg_lock(cmd.vgUuid)
        logger.debug("restarting vg %s lock..." % cmd.vgUuid)
        lvm.check_gl_lock()
        lvm.start_vg_lock(cmd.vgUuid)

    lvm.clean_vg_exists_host_tags(cmd.vgUuid, cmd.hostUuid, HEARTBEAT_TAG)
    lvm.add_vg_tag(cmd.vgUuid, "%s::%s::%s::%s" % (HEARTBEAT_TAG, cmd.hostUuid, time.time(), bash.bash_o('hostname').strip()))
    self.clear_stalled_qmp_socket()

    rsp.totalCapacity, rsp.availableCapacity = lvm.get_vg_size(cmd.vgUuid)
    rsp.hostId = lvm.get_running_host_id(cmd.vgUuid)
    rsp.vgLvmUuid = lvm.get_vg_lvm_uuid(cmd.vgUuid)
    rsp.hostUuid = cmd.hostUuid
    return jsonobject.dumps(rsp)
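# Illustration (not part of the plugin): how the sanlock "our_host_name" value above is
# composed. connect() truncates the VG UUID and host UUID to 8 characters each and the
# hostname to 20, so the name stays short while remaining unique per VG/host pair.
# The UUIDs and hostname below are made-up sample values.
def build_sanlock_hostname(vg_uuid, host_uuid, hostname):
    return "%s-%s-%s" % (vg_uuid[:8], host_uuid[:8], hostname.strip()[:20])

# Example: build_sanlock_hostname("6b3a42e0f7d64d2c...", "9c1d5a77b2ee4f09...", "compute-node-01")
# yields "6b3a42e0-9c1d5a77-compute-node-01".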
def connect(self, req):
    cmd = jsonobject.loads(req[http.REQUEST_BODY])
    rsp = ConnectRsp()
    diskPaths = set()

    def config_lvm(host_id):
        lvm.backup_lvm_config()
        lvm.reset_lvm_conf_default()
        lvm.config_lvm_by_sed("use_lvmlockd", "use_lvmlockd=1", ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("use_lvmetad", "use_lvmetad=0", ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("host_id", "host_id=%s" % host_id, ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("sanlock_lv_extend", "sanlock_lv_extend=%s" % DEFAULT_SANLOCK_LV_SIZE, ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("lvmlockd_lock_retries", "lvmlockd_lock_retries=6", ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("issue_discards", "issue_discards=1", ["lvm.conf", "lvmlocal.conf"])
        lvm.config_lvm_by_sed("reserved_memory", "reserved_memory=65536", ["lvm.conf", "lvmlocal.conf"])

        lvm.config_sanlock_by_sed("sh_retries", "sh_retries=20")
        lvm.config_sanlock_by_sed("logfile_priority", "logfile_priority=7")
        lvm.config_sanlock_by_sed("renewal_read_extend_sec", "renewal_read_extend_sec=24")
        lvm.config_sanlock_by_sed("debug_renew", "debug_renew=1")
        lvm.config_sanlock_by_sed("use_watchdog", "use_watchdog=0")
        sanlock_hostname = "%s-%s-%s" % (cmd.vgUuid[:8], cmd.hostUuid[:8], bash.bash_o("hostname").strip()[:20])
        lvm.config_sanlock_by_sed("our_host_name", "our_host_name=%s" % sanlock_hostname)

    config_lvm(cmd.hostId)
    for diskUuid in cmd.sharedBlockUuids:
        disk = CheckDisk(diskUuid)
        diskPaths.add(disk.get_path())
    lvm.start_lvmlockd()
    lvm.check_gl_lock()
    logger.debug("find/create vg %s lock..." % cmd.vgUuid)
    rsp.isFirst = self.create_vg_if_not_found(cmd.vgUuid, diskPaths, cmd.hostUuid, cmd.forceWipe)
    lvm.check_stuck_vglk()
    logger.debug("starting vg %s lock..." % cmd.vgUuid)
    lvm.start_vg_lock(cmd.vgUuid)

    if lvm.lvm_vgck(cmd.vgUuid, 15)[0] is False:
        lvm.drop_vg_lock(cmd.vgUuid)
        logger.debug("restarting vg %s lock..." % cmd.vgUuid)
        lvm.check_gl_lock()
        lvm.start_vg_lock(cmd.vgUuid)

    lvm.clean_vg_exists_host_tags(cmd.vgUuid, cmd.hostUuid, HEARTBEAT_TAG)
    lvm.add_vg_tag(cmd.vgUuid, "%s::%s::%s::%s" % (HEARTBEAT_TAG, cmd.hostUuid, time.time(), bash.bash_o('hostname').strip()))

    rsp.totalCapacity, rsp.availableCapacity = lvm.get_vg_size(cmd.vgUuid)
    rsp.hostId = lvm.get_running_host_id(cmd.vgUuid)
    return jsonobject.dumps(rsp)
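# Illustration (not part of the plugin): the heartbeat tag added to the VG above has the
# form HEARTBEAT_TAG::hostUuid::timestamp::hostname. A minimal sketch of building and
# parsing such a tag; the tag prefix value and sample fields here are assumptions for
# illustration only.
import time

HEARTBEAT_TAG_EXAMPLE = "zs::sharedblock::heartbeat"  # assumed prefix, not the real constant

def build_heartbeat_tag(host_uuid, hostname):
    return "%s::%s::%s::%s" % (HEARTBEAT_TAG_EXAMPLE, host_uuid, time.time(), hostname)

def parse_heartbeat_tag(tag):
    # split from the right so any "::" inside the tag prefix does not break parsing
    prefix, host_uuid, timestamp, hostname = tag.rsplit("::", 3)
    return {"hostUuid": host_uuid, "time": float(timestamp), "hostname": hostname}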
def heartbeat_on_sharedblock():
    fire = 0
    failure = 0

    while self.run_fencer(cmd.vgUuid, created_time):
        try:
            time.sleep(cmd.interval)
            global last_multipath_run
            if cmd.fail_if_no_path and time.time() - last_multipath_run > 3600:
                last_multipath_run = time.time()
                thread.ThreadFacade.run_in_thread(linux.set_fail_if_no_path)

            health = lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, check_pv=False)
            logger.debug("sharedblock group primary storage %s fencer run result: %s" % (cmd.vgUuid, health))
            if health[0] is True:
                fire = 0
                failure = 0
                continue

            failure += 1
            if failure < cmd.maxAttempts:
                continue

            if self.fencer_fire_timestamp.get(cmd.vgUuid) is not None and \
                    time.time() > self.fencer_fire_timestamp.get(cmd.vgUuid) and \
                    time.time() - self.fencer_fire_timestamp.get(cmd.vgUuid) < (300 * (fire + 1 if fire < 10 else 10)):
                logger.warn("last fencer fire: %s, now: %s, passed: %s seconds, within %s seconds, skip fire",
                            self.fencer_fire_timestamp[cmd.vgUuid], time.time(),
                            time.time() - self.fencer_fire_timestamp.get(cmd.vgUuid),
                            300 * (fire + 1 if fire < 10 else 10))
                failure = 0
                continue

            self.fencer_fire_timestamp[cmd.vgUuid] = time.time()
            try:
                logger.warn("shared block storage %s fencer fired!" % cmd.vgUuid)
                self.report_storage_status([cmd.vgUuid], 'Disconnected', health[1])
                fire += 1

                if cmd.strategy == 'Permissive':
                    continue

                # check one qcow2 on each PV to determine which volumes on that PV should be killed
                invalid_pv_uuids = lvm.get_invalid_pv_uuids(cmd.vgUuid, cmd.checkIo)
                vms = lvm.get_running_vm_root_volume_on_pv(cmd.vgUuid, invalid_pv_uuids, True)
                killed_vm_uuids = []
                for vm in vms:
                    kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                    kill(False)
                    if kill.return_code == 0:
                        logger.warn('kill the vm[uuid:%s, pid:%s] because we lost connection to the storage. '
                                    'failed to run health check %s times' % (vm.uuid, vm.pid, cmd.maxAttempts))
                        killed_vm_uuids.append(vm.uuid)
                    else:
                        logger.warn('failed to kill the vm[uuid:%s, pid:%s] %s' % (vm.uuid, vm.pid, kill.stderr))

                    for volume in vm.volumes:
                        used_process = linux.linux_lsof(volume)
                        if len(used_process) == 0:
                            try:
                                lvm.deactive_lv(volume, False)
                            except Exception as e:
                                logger.debug("deactivate volume %s for vm %s failed, %s" % (volume, vm.uuid, e.message))
                                content = traceback.format_exc()
                                logger.warn("traceback: %s" % content)
                        else:
                            logger.debug("volume %s still used: %s, skip to deactivate" % (volume, used_process))

                if len(killed_vm_uuids) != 0:
                    self.report_self_fencer_triggered([cmd.vgUuid], ','.join(killed_vm_uuids))
                    clean_network_config(killed_vm_uuids)

                lvm.remove_partial_lv_dm(cmd.vgUuid)

                if lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, True)[0] is False:
                    lvm.drop_vg_lock(cmd.vgUuid)
                    lvm.remove_device_map_for_vg(cmd.vgUuid)

            except Exception as e:
                logger.warn("kill vm failed, %s" % e.message)
                content = traceback.format_exc()
                logger.warn("traceback: %s" % content)
            finally:
                failure = 0

        except Exception as e:
            logger.debug('self-fencer on sharedblock primary storage %s stopped abnormally, try again soon...' % cmd.vgUuid)
            content = traceback.format_exc()
            logger.warn(content)

    if not self.run_fencer(cmd.vgUuid, created_time):
        logger.debug('stop self-fencer on sharedblock primary storage %s for judger failed' % cmd.vgUuid)
    else:
        logger.warn('stop self-fencer on sharedblock primary storage %s' % cmd.vgUuid)
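# Illustration (not part of the plugin): how the fencer-fire suppression window above grows.
# A new fire is skipped unless at least 300 * min(fire + 1, 10) seconds have passed since
# the previous one, so the window widens with consecutive fires and is capped at 3000 seconds.
# A minimal sketch of the same expression:
def fire_suppression_window(fire):
    return 300 * (fire + 1 if fire < 10 else 10)

# fire_suppression_window(0) == 300, fire_suppression_window(4) == 1500,
# fire_suppression_window(10) == 3000 (the cap).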
def heartbeat_on_sharedblock():
    failure = 0

    while self.run_sharedblock_fencer[cmd.vgUuid] is True:
        try:
            time.sleep(cmd.interval)

            health = lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, check_pv=False)
            logger.debug("sharedblock group primary storage %s fencer run result: %s" % (cmd.vgUuid, health))
            if health[0] is True:
                failure = 0
                continue

            failure += 1
            if failure < cmd.maxAttempts:
                continue

            try:
                logger.warn("shared block storage %s fencer fired!" % cmd.vgUuid)
                self.report_storage_status([cmd.vgUuid], 'Disconnected', health[1])

                # check one qcow2 on each PV to determine which volumes on that PV should be killed
                invalid_pv_uuids = lvm.get_invalid_pv_uuids(cmd.vgUuid, cmd.checkIo)
                vms = lvm.get_running_vm_root_volume_on_pv(cmd.vgUuid, invalid_pv_uuids, cmd.checkIo)
                for vm in vms:
                    kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                    kill(False)
                    if kill.return_code == 0:
                        logger.warn('kill the vm[uuid:%s, pid:%s] because we lost connection to the storage. '
                                    'failed to run health check %s times' % (vm.uuid, vm.pid, cmd.maxAttempts))
                    else:
                        logger.warn('failed to kill the vm[uuid:%s, pid:%s] %s' % (vm.uuid, vm.pid, kill.stderr))

                    for volume in vm.volumes:
                        used_process = linux.linux_lsof(volume)
                        if len(used_process) == 0:
                            try:
                                lvm.deactive_lv(volume, False)
                            except Exception as e:
                                logger.debug("deactivate volume %s for vm %s failed, %s" % (volume, vm.uuid, e.message))
                                content = traceback.format_exc()
                                logger.warn("traceback: %s" % content)
                        else:
                            logger.debug("volume %s still used: %s, skip to deactivate" % (volume, used_process))

                lvm.remove_partial_lv_dm(cmd.vgUuid)

                if lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, True)[0] is False:
                    lvm.drop_vg_lock(cmd.vgUuid)
                    lvm.remove_device_map_for_vg(cmd.vgUuid)

                # reset the failure count
                failure = 0
            except Exception as e:
                logger.warn("kill vm failed, %s" % e.message)
                content = traceback.format_exc()
                logger.warn("traceback: %s" % content)

        except Exception as e:
            logger.debug('self-fencer on sharedblock primary storage %s stopped abnormally' % cmd.vgUuid)
            content = traceback.format_exc()
            logger.warn(content)

    logger.debug('stop self-fencer on sharedblock primary storage %s' % cmd.vgUuid)
def heartbeat_on_sharedblock():
    failure = 0

    while self.run_fencer(cmd.vgUuid, created_time):
        try:
            time.sleep(cmd.interval)

            health = lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, check_pv=False)
            logger.debug("sharedblock group primary storage %s fencer run result: %s" % (cmd.vgUuid, health))
            if health[0] is True:
                failure = 0
                continue

            failure += 1
            if failure < cmd.maxAttempts:
                continue

            try:
                logger.warn("shared block storage %s fencer fired!" % cmd.vgUuid)
                self.report_storage_status([cmd.vgUuid], 'Disconnected', health[1])

                # check one qcow2 on each PV to determine which volumes on that PV should be killed
                invalid_pv_uuids = lvm.get_invalid_pv_uuids(cmd.vgUuid, cmd.checkIo)
                vms = lvm.get_running_vm_root_volume_on_pv(cmd.vgUuid, invalid_pv_uuids, True)
                killed_vm_uuids = []
                for vm in vms:
                    kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                    kill(False)
                    if kill.return_code == 0:
                        logger.warn('kill the vm[uuid:%s, pid:%s] because we lost connection to the storage. '
                                    'failed to run health check %s times' % (vm.uuid, vm.pid, cmd.maxAttempts))
                        killed_vm_uuids.append(vm.uuid)
                    else:
                        logger.warn('failed to kill the vm[uuid:%s, pid:%s] %s' % (vm.uuid, vm.pid, kill.stderr))

                    for volume in vm.volumes:
                        used_process = linux.linux_lsof(volume)
                        if len(used_process) == 0:
                            try:
                                lvm.deactive_lv(volume, False)
                            except Exception as e:
                                logger.debug("deactivate volume %s for vm %s failed, %s" % (volume, vm.uuid, e.message))
                                content = traceback.format_exc()
                                logger.warn("traceback: %s" % content)
                        else:
                            logger.debug("volume %s still used: %s, skip to deactivate" % (volume, used_process))

                if len(killed_vm_uuids) != 0:
                    self.report_self_fencer_triggered([cmd.vgUuid], ','.join(killed_vm_uuids))

                lvm.remove_partial_lv_dm(cmd.vgUuid)

                if lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, True)[0] is False:
                    lvm.drop_vg_lock(cmd.vgUuid)
                    lvm.remove_device_map_for_vg(cmd.vgUuid)

                # reset the failure count
                failure = 0
            except Exception as e:
                logger.warn("kill vm failed, %s" % e.message)
                content = traceback.format_exc()
                logger.warn("traceback: %s" % content)

        except Exception as e:
            logger.debug('self-fencer on sharedblock primary storage %s stopped abnormally, try again soon...' % cmd.vgUuid)
            content = traceback.format_exc()
            logger.warn(content)

    if not self.run_fencer(cmd.vgUuid, created_time):
        logger.debug('stop self-fencer on sharedblock primary storage %s for judger failed' % cmd.vgUuid)
    else:
        logger.warn('stop self-fencer on sharedblock primary storage %s' % cmd.vgUuid)