def delete_lv(self, vg_name, lv_name):
    """Unmount and remove a logical volume, then drop its /etc/fstab entry.

    :param vg_name: name of the volume group expected to hold the volume
    :param lv_name: name of the logical volume to delete
    :raises exception.VGNotExists: ``self.get_vgs()`` reports no group named ``vg_name``
    :raises exception.LVNotExists: the group has no volume named ``lv_name``
    :raises exception.LVRemoveError: ``lvremove`` produced stderr output
    """
    # Validate first: both the volume group and the logical volume must exist.
    matched_vg = next(
        (group for group in self.get_vgs() if vg_name == group['name']), None)
    if matched_vg is None:
        logging.error("the vg %s is not exist", vg_name)
        raise exception.VGNotExists(vg=vg_name)
    if not any(lv_name == volume['name'] for volume in matched_vg['lvs']):
        raise exception.LVNotExists(lv=lv_name)

    device = "/dev/%s/%s" % (vg_name, lv_name)
    try:
        # The umount result is deliberately ignored (exit code and output).
        cmdutils.execute("umount", device,
                         run_as_root=True, ignore_exit_code=True)
        _, stderr = cmdutils.execute("lvremove", "-f", device,
                                     run_as_root=True, ignore_exit_code=True)
        if stderr:
            logging.error("lvremove failed:%s", stderr)
            raise exception.LVRemoveError(lv=lv_name, error=stderr)
        # Remove the boot-time mount entry (matched on "<vg>-<lv>").
        cmdutils.run_cmd("sed -i '/%s-%s/d' /etc/fstab" % (vg_name, lv_name))
        logging.info("delete lv %s end", lv_name)
    except Exception as e:
        logging.error("delete lv failed:%s", e)
        raise e
def _clear_backup(self):
    """Empty ``self.backup_dir``, recreating it when it cannot be emptied.

    Every entry directly under the backup directory is deleted with
    ``os.remove``. If listing or deleting fails for any reason, the error is
    logged, whatever currently sits at that path is removed, and a fresh
    empty directory is created in its place.
    """
    logging.info("start _clear_backup")
    try:
        for filename in os.listdir(self.backup_dir):
            os.remove(os.path.join(self.backup_dir, filename))
    except Exception as e:
        logging.exception('_clear_backup error: %s ', e)
        # "-r" is required: plain "rm -f" cannot delete a directory, so the
        # mkdir below used to fail with FileExistsError whenever the backup
        # directory existed but could not be emptied file by file.
        cmdutils.run_cmd("rm -rf %s" % self.backup_dir)
        os.mkdir(self.backup_dir)
    logging.info("_clear_backup finished")
def set_ntp(self, server):
    """Write a chronyd client configuration for one NTP server and enable NTP.

    :param server: address of the NTP server, placed in the ``server`` directive
    """
    directives = [
        "driftfile /var/lib/chrony/drift",
        "makestep 1.0 3",
        "rtcsync",
        "logdir /var/log/chrony",
    ]
    directives.append("server %s iburst" % server)

    conf_writer = FileOp(constants.CHRONYD_CONF, 'w')
    conf_writer.write_with_endline('\n'.join(directives))
    logging.info("config ntp end:%s", directives)
    cmdutils.run_cmd("timedatectl set-ntp yes")
def get_host_and_peer_ip(ha_info_obj):
    """
    Work out which HA node currently holds the VIP and which one is its peer.

    The decision is made from the addresses actually configured on this
    machine rather than from the roles stored in the database, which may be
    stale.

    :param ha_info_obj: HA record exposing ``master_ip``, ``backup_ip``,
        ``master_uuid`` and ``backup_uuid``
    :return: ``(vip_host_ip, peer_host_ip, peer_uuid)``; ``None`` if anything
        raised (the error is logged, not propagated)
    """
    try:
        master_ip = ha_info_obj.master_ip
        backup_ip = ha_info_obj.backup_ip
        # Original author's rationale: the front end calls yzy-web through the
        # VIP and yzy-web only talks to its local yzy-server, so the node
        # running this code is taken to be the VIP host. All that remains is
        # to find out which of the two recorded IPs is local.
        #
        # -F: treat the address as a literal string (dots are not wildcards).
        # -w: match whole words only, so 10.0.0.1 no longer matches 10.0.0.10.
        code, out = cmdutils.run_cmd(
            "ip addr |grep -wF {ip}".format(ip=master_ip), ignore_log=True)
        if code == 0 and master_ip in out:
            return master_ip, backup_ip, ha_info_obj.backup_uuid
        return backup_ip, master_ip, ha_info_obj.master_uuid
    except Exception as e:
        logger.exception("%s", str(e), exc_info=True)
        return None
def _check_vip(self, interval, vip):
    """Poll until the VIP shows up on a local interface.

    Sleeps 3 seconds before each probe and probes at most ``3 * interval``
    times.

    :param interval: scales the number of probes (``3 * interval``)
    :param vip: virtual IP address to look for in ``ip addr`` output
    :return: ``True`` as soon as the VIP is found
    :raises EnableHaException: the VIP never appeared within the probe budget
    """
    # -F: literal match (dots are not regex wildcards); -w: whole-word match,
    # so a VIP of 10.0.0.1 is not "found" when only 10.0.0.10 is configured.
    probe_cmd = "ip addr |grep -wF {vip}".format(vip=vip)
    for _ in range(3 * interval):
        time.sleep(3)
        code, out = cmdutils.run_cmd(probe_cmd, ignore_log=True)
        if code == 0 and out:
            return True
    raise EnableHaException("check vip timeout")
def _run_cmd(self, cmd_str):
    """Run a shell command and translate a failure into a result or exception.

    :param cmd_str: shell command line to execute
    :return: ``None`` on success; an ``IPUsedByOtherHost`` error result
        (carrying the conflicting MAC and IP extracted from the output) when
        the output contains "already uses address"
    :raises exception.BondException: any other non-zero exit
    """
    code, out = cmdutils.run_cmd(cmd_str)
    if code == 0:
        return None
    if "already uses address" not in out:
        raise exception.BondException(error=out)

    # Address conflict: extract who owns the address from the command output.
    mac = re.search(self.mac_regex, out).group(0)
    ip = re.search(self.ip_regex, out).group(0)
    return get_error_result("IPUsedByOtherHost", mac=mac, ip=ip)
def check_backup_ha_status(self, quorum_ip, sensitivity, paths):
    """Collect the health of the HA components on the backup node.

    :param quorum_ip: quorum address to ping; when falsy the quorum status is
        reported as unknown
    :param sensitivity: passed to ``icmp_ping`` as the ping ``count``
    :param paths: iterable of file paths that must all exist for file sync to
        be considered healthy
    :return: a "Success" result whose ``data`` is
        ``[keepalived_status, quorum_ip_status, mysql_slave_status,
        file_sync_status]``, or an "OtherError" result if anything raised
    """
    try:
        # keepalived must be reported as active by systemd.
        code, out = cmdutils.run_cmd("systemctl status keepalived",
                                     ignore_log=True)
        if code != 0 or "active (running)" not in out:
            keepalived_status = constants.HA_STATUS_FAULT
            logging.error("keepalived not running")
        else:
            keepalived_status = constants.HA_STATUS_NORMAL

        # Quorum reachability is only meaningful when a quorum IP is set.
        if not quorum_ip:
            quorum_ip_status = constants.HA_STATUS_UNKNOWN
        elif icmp_ping(quorum_ip, timeout=1, count=sensitivity):
            quorum_ip_status = constants.HA_STATUS_NORMAL
        else:
            quorum_ip_status = constants.HA_STATUS_FAULT
            logging.error("ping quorum_ip[%s] failed" % quorum_ip)

        # Any "Error " line in SHOW SLAVE STATUS marks replication as faulty.
        # "\\G" keeps the literal backslash-G the mysql client expects while
        # avoiding Python's invalid "\G" escape sequence.
        # NOTE(review): if the mysql command itself fails, grep receives no
        # input and the status is reported as normal — confirm this is
        # acceptable.
        code, out = cmdutils.run_cmd(
            "mysql -u{user} -p{pwd} -e \"SHOW SLAVE STATUS\\G;\" |grep \"Error \""
            .format(user=self.db_user, pwd=self.db_pwd),
            ignore_log=True)
        if out:
            mysql_slave_status = constants.HA_STATUS_FAULT
            logging.error("mysql slave status error: %s", out)
        else:
            mysql_slave_status = constants.HA_STATUS_NORMAL

        # File sync is healthy only if every expected path exists.
        if all(os.path.exists(path) for path in paths):
            file_sync_status = constants.HA_STATUS_NORMAL
        else:
            file_sync_status = constants.HA_STATUS_FAULT

        ret = get_error_result("Success", data=[
            keepalived_status,
            quorum_ip_status,
            mysql_slave_status,
            file_sync_status,
        ])
    except Exception as e:
        logging.exception(str(e), exc_info=True)
        ret = get_error_result("OtherError")
    return ret
def umount_nfs(self, name):
    """Unmount an NFS share and, on success, drop its /etc/fstab entry.

    Does nothing except log when the share's mount point does not appear in
    the output of ``mount -v``.

    :param name: share name; the mount point is
        ``constants.NFS_MOUNT_POINT_PREFIX + name``
    """
    mounted, _ = cmdutils.execute("mount", "-v",
                                  run_as_root=True, ignore_exit_code=True)
    mount_point = constants.NFS_MOUNT_POINT_PREFIX + name
    if mount_point not in mounted:
        logging.info("the device is not in mount status, go on")
        return

    out, err = cmdutils.execute("umount", "{}".format(mount_point),
                                run_as_root=True, ignore_exit_code=True)
    if err:
        logging.error("umount nfs error:%s", err)
    else:
        # Also remove the boot-time auto-mount entry.
        cmdutils.run_cmd("sed -i '/{}/d' /etc/fstab".format('nfs_' + name))
    logging.info("umount {}, out:{} , err:{}".format(mount_point, out, err))
def _rollback(self, bond_name, new_flag=True):
    """Undo a bond operation by restoring the backed-up ifcfg files.

    :param bond_name: name of the bond being rolled back
    :param new_flag: ``True`` when rolling back the creation of a new bond;
        ``False`` for an edit or delete
    """
    logging.info("start _rollback")

    if new_flag:
        # Rolling back a newly added bond: unregister it from the bonding
        # masters file and delete its ifcfg file.
        cmdutils.run_cmd("echo -%s > %s" % (bond_name, constants.BOND_MASTERS))
        try:
            bond_ifcfg = self.file_path % bond_name
            os.remove(bond_ifcfg)
            logging.info("remove file: %s", bond_ifcfg)
        except Exception as e:
            logging.exception('_rollback error: %s ' % str(e))

    # Adding a bond never backs up the bond's own ifcfg file, so one must not
    # be restored in that case even if it somehow exists in the backup dir.
    # Edit/delete rollbacks do need the bond's ifcfg file restored.
    to_restore = [
        entry for entry in os.listdir(self.backup_dir)
        if not (new_flag and entry == bond_name)
    ]
    for entry in to_restore:
        try:
            shutil.copy2(os.path.join(self.backup_dir, entry),
                         self.file_path % entry)
            # Bounce the interface so the restored configuration takes effect.
            cmdutils.run_cmd("ifdown %s" % entry)
            cmdutils.run_cmd("ifup %s" % entry)
        except Exception as e:
            # Best effort: keep restoring the remaining files.
            logging.exception('_rollback error: %s ' % str(e))

    logging.info("_rollback finished")
    self._clear_backup()
def config_ntp(self, ipaddr, netmask):
    """Configure chronyd as an NTP server for the subnet of ``ipaddr``.

    Writes the chronyd configuration, opens the ntp service in firewalld and
    enables NTP synchronisation.

    :param ipaddr: an address inside the subnet allowed to query this server
    :param netmask: netmask of that subnet; when ``utils.is_netmask`` rejects
        it, a /24 prefix is used instead
    """
    is_mask, bits = utils.is_netmask(netmask)
    prefix_bits = bits if is_mask else 24

    net = netaddr.IPNetwork("{}/{}".format(str(ipaddr), str(prefix_bits)))
    cidr = "{}/{}".format(str(net.network), str(net.prefixlen))

    upstream_servers = [
        "server ntp1.aliyun.com",
        "server ntp2.aliyun.com",
        "server cn.ntp.org.cn",
        "server cn.pool.ntp.org",
    ]
    local_settings = [
        "driftfile /var/lib/chrony/drift",
        "makestep 1.0 3",
        "rtcsync",
        "allow %s" % cidr,
        "local stratum 10",
        "logdir /var/log/chrony",
    ]
    directives = upstream_servers + local_settings

    conf_writer = FileOp(constants.CHRONYD_CONF, 'w')
    conf_writer.write_with_endline('\n'.join(directives))
    logging.info("config ntp server:%s", directives)

    for command in ("firewall-cmd --add-service=ntp --permanent",
                    "firewall-cmd --reload",
                    "timedatectl set-ntp yes"):
        cmdutils.run_cmd(command)
def add_ip_info(self, data):
    """
    Rewrite the ifcfg file of a physical NIC with the given addresses and
    apply it.

    Expected shape of ``data``::

        {
            "name": "eth0",
            "ip_infos": [
                {
                    "ip": "172.16.1.31",
                    "netmask": "255.255.255.0"
                },
                ...
            ],
            "gate_info": {
                "gateway": "172.16.1.254",
                "dns1": "8.8.8.8",
                "dns2": "114.114.114.114"
            },
            "net_info": {
                "network_id": "",
                "physical_interface": ""
            }
        }

    ``ip_infos`` and ``gate_info`` may be missing or empty; the file is then
    written with ``BOOTPROTO=none`` and no address/gateway lines.

    :return: ``{"data": {"name": <nic name>}}`` on success, or a
        "NotPhysicalNICError" result when the NIC is not a physical device
    :raises Exception: any failure is logged and re-raised
    """
    try:
        nic_name = data.get("name")
        ip_infos = data.get('ip_infos') or []
        gate_info = data.get('gate_info') or {}

        # A NIC counts as physical when the OS reports it but it is not
        # listed under /sys/devices/virtual/net. An alias such as "eth0:1"
        # is checked by its base device name.
        virtual_net_device = os.listdir('/sys/devices/virtual/net/')
        physical_net_device = [
            dev for dev in psutil.net_if_addrs()
            if dev not in virtual_net_device
        ]
        if nic_name.split(':')[0] not in physical_net_device:
            logging.error("add nic %s ip, not physical nic", nic_name)
            return get_error_result("NotPhysicalNICError")

        nic_ifcfg = "/etc/sysconfig/network-scripts/ifcfg-%s" % nic_name
        nic_content = [
            "NAME=%s" % nic_name,
            "DEVICE=%s" % nic_name,
            "TYPE=Ethernet",
            "ONBOOT=yes",
            "NM_CONTROLLED=no",
            "BOOTPROTO=%s" % ("static" if ip_infos else "none"),
        ]
        logging.info("the nic content:%s", nic_content)

        # Address entries are numbered IPADDR0/NETMASK0, IPADDR1/NETMASK1, ...
        for index, info in enumerate(ip_infos):
            nic_content.append("IPADDR%s=%s" % (index, info['ip']))
            nic_content.append("NETMASK%s=%s" % (index, info['netmask']))
        # Gateway/DNS lines are written only for the values actually given.
        for ifcfg_key, field in (("GATEWAY", "gateway"),
                                 ("DNS1", "dns1"),
                                 ("DNS2", "dns2")):
            value = gate_info.get(field)
            if value:
                nic_content.append("%s=%s" % (ifcfg_key, value))
        self.update_conf(nic_ifcfg, nic_content)

        # For a flat network the addresses must be configured on the bridge;
        # otherwise bounce the NIC so the new file takes effect.
        net_info = data.get('net_info')
        if net_info and LinuxBridgeManager().check_bridge_exist(
                net_info['network_id']):
            LinuxBridgeManager().add_addition_ip(net_info['network_id'],
                                                 ip_infos,
                                                 data.get('gate_info'))
        else:
            cmdutils.run_cmd("ifdown %s" % nic_name)
            cmdutils.run_cmd("ifup %s" % nic_name)
        logging.info("set nic %s ip success" % nic_name)
        return {"data": {"name": nic_name}}
    except Exception as e:
        logging.exception(e)
        raise
def update_ha_master():
    """Reconcile the HA records in the database with the node that really
    holds the VIP.

    When this node has the VIP plus exactly one of the recorded
    master/backup IPs, that IP's node is treated as the real master and the
    following are corrected if they disagree:

    * ``master_*`` / ``backup_*`` fields of the ha_info record (ip, node
      uuid, nic uuid, nic name);
    * the role (``type``) of the backup node and of the master node;
    * ``host_uuid`` of every non-system template and VOI template.

    Nothing is changed when there is no ha_info record, the local addresses
    cannot be listed, the VIP is not on this node, or this node has
    neither/both of the recorded IPs.
    """
    ha_info_obj = db_api.get_ha_info_first()
    if not ha_info_obj:
        return

    # List the IPv4 addresses of every interface that is UP, to find out
    # whether this node's address is recorded as master_ip or backup_ip.
    # Raw string: the "\." sequences are for grep, not Python escapes.
    code, out = cmdutils.run_cmd(
        r"""ip -br a |grep ' UP ' |grep -o '[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*'""",
        ignore_log=True)
    if code != 0:
        logger.error(out)
        return

    is_vip, is_master, is_backup = False, False, False
    for ip in out.strip('\n').split('\n'):
        if ip == ha_info_obj.vip:
            is_vip = True
        elif ip == ha_info_obj.master_ip:
            is_master = True
        elif ip == ha_info_obj.backup_ip:
            is_backup = True

    if not is_vip:
        logger.error("server running without vip[%s]" % ha_info_obj.vip)
        return

    current_ip = None
    if not is_master and not is_backup:
        logger.error(
            "server running without master_ip[%s], backup_ip[%s]" %
            (ha_info_obj.master_ip, ha_info_obj.backup_ip))
    elif is_master and is_backup:
        logger.error(
            "server running with both master_ip[%s], backup_ip[%s]" %
            (ha_info_obj.master_ip, ha_info_obj.backup_ip))
    elif is_master:
        current_ip = ha_info_obj.master_ip
    else:
        # This node's IP is recorded as backup_ip, which means the database
        # update that notify.sh triggers through the server's /node/master
        # API failed. Correct the IPs in ha_info.
        current_ip = ha_info_obj.backup_ip
        ha_info_obj.master_ip, ha_info_obj.backup_ip = \
            ha_info_obj.backup_ip, ha_info_obj.master_ip
        logger.info("update ha_info[%s] master_ip from %s to %s",
                    ha_info_obj.uuid, ha_info_obj.backup_ip,
                    ha_info_obj.master_ip)

    if not current_ip:
        return

    # The node owning current_ip should be the master: check and correct
    # node uuid, nic uuid and nic name in ha_info.
    current_node_obj = db_api.get_node_by_ip(current_ip)
    if current_node_obj.uuid == ha_info_obj.backup_uuid:
        ha_info_obj.master_uuid, ha_info_obj.backup_uuid = \
            ha_info_obj.backup_uuid, ha_info_obj.master_uuid
        logger.info("update ha_info[%s] master_uuid from %s to %s",
                    ha_info_obj.uuid, ha_info_obj.backup_uuid,
                    ha_info_obj.master_uuid)

    current_ip_obj = db_api.get_nic_ip_by_ip(current_ip)
    if current_ip_obj.nic_uuid == ha_info_obj.backup_nic_uuid:
        ha_info_obj.master_nic_uuid, ha_info_obj.backup_nic_uuid = \
            ha_info_obj.backup_nic_uuid, ha_info_obj.master_nic_uuid
        logger.info("update ha_info[%s] master_nic_uuid from %s to %s",
                    ha_info_obj.uuid, ha_info_obj.backup_nic_uuid,
                    ha_info_obj.master_nic_uuid)
    if ha_info_obj.master_nic != ha_info_obj.backup_nic \
            and current_ip_obj.name == ha_info_obj.backup_nic:
        ha_info_obj.master_nic, ha_info_obj.backup_nic = \
            ha_info_obj.backup_nic, ha_info_obj.master_nic
        logger.info("update ha_info[%s] master_nic from %s to %s",
                    ha_info_obj.uuid, ha_info_obj.backup_nic,
                    ha_info_obj.master_nic)
    ha_info_obj.soft_update()

    # Check and correct the role of the node recorded as backup.
    real_backup_node_obj = db_api.get_node_by_uuid(ha_info_obj.backup_uuid)
    if real_backup_node_obj.type not in [constants.ROLE_SLAVE_AND_COMPUTE,
                                         constants.ROLE_SLAVE]:
        wrong_type = real_backup_node_obj.type
        if wrong_type in [constants.ROLE_MASTER_AND_COMPUTE,
                          constants.ROLE_COMPUTE]:
            real_backup_node_obj.type = constants.ROLE_SLAVE_AND_COMPUTE
        else:
            real_backup_node_obj.type = constants.ROLE_SLAVE
        real_backup_node_obj.soft_update()
        logger.info("update real_backup_node[%s] role from %s to %s",
                    real_backup_node_obj.ip, wrong_type,
                    real_backup_node_obj.type)

    # Check and correct the role of the node recorded as master.
    if current_node_obj.type not in [constants.ROLE_MASTER,
                                     constants.ROLE_MASTER_AND_COMPUTE]:
        wrong_type = current_node_obj.type
        if wrong_type in [constants.ROLE_SLAVE_AND_COMPUTE,
                          constants.ROLE_COMPUTE]:
            current_node_obj.type = constants.ROLE_MASTER_AND_COMPUTE
        else:
            current_node_obj.type = constants.ROLE_MASTER
        current_node_obj.soft_update()
        logger.info("update current_node[%s] role from %s to %s",
                    current_node_obj.ip, wrong_type, current_node_obj.type)

    # Check and correct the host uuid of templates and VOI templates
    # (system desktop templates are left alone).
    for template in db_api.get_template_with_all({}):
        if constants.SYSTEM_DESKTOP == template.classify:
            continue
        if template.host_uuid != current_node_obj.uuid:
            template.host_uuid = current_node_obj.uuid
            template.soft_update()
            logger.info("update template %s host_uuid to %s",
                        template.name, current_node_obj.uuid)

    for template in db_api.get_voi_template_with_all({}):
        if constants.SYSTEM_DESKTOP == template.classify:
            continue
        if template.host_uuid != current_node_obj.uuid:
            template.host_uuid = current_node_obj.uuid
            template.soft_update()
            logger.info("update voi template %s host_uuid to %s",
                        template.name, current_node_obj.uuid)
def create_lv(self, vg_name, lv_name, size):
    """Create a logical volume, format it as ext4, mount it and register it
    in /etc/fstab.

    :param vg_name: volume group in which to create the volume
    :param lv_name: name of the new logical volume; must not already exist in
        any volume group reported by ``self.get_vgs()``
    :param size: size passed to ``lvcreate -L`` with a ``G`` suffix; a
        negative value allocates all remaining free space in the group
    :return: ``{"mount_point": <path>}``
    :raises exception.VGNotExists: no volume group named ``vg_name``
    :raises exception.VGNoEnoughSize: the group has no free space, or less
        than ``size``
    :raises exception.LVAlreadyExists: a volume named ``lv_name`` exists
    :raises exception.LVCreateError: ``lvcreate`` produced stderr output
    :raises exception.LVSyncFormatFailed: ``mkfs.ext4`` produced stderr output
    :raises exception.LVMountError: ``mount`` produced stderr output
    """
    # Validate first: the group must exist and have room, the name must be free.
    vgs = self.get_vgs()
    for vg in vgs:
        if vg_name == vg['name']:
            # Check that the volume group has enough free space.
            if vg['free_size'] <= 0 or (size > 0 and vg['free_size'] < size):
                logging.error("the vg has no enough size, lvextend failed")
                raise exception.VGNoEnoughSize(vg=vg_name)
            break
    else:
        logging.error("the vg %s is not exist", vg_name)
        raise exception.VGNotExists(vg=vg_name)
    for vg in vgs:
        for lv in vg['lvs']:
            if lv_name == lv['name']:
                raise exception.LVAlreadyExists(lv=lv_name)

    device = "/dev/%s/%s" % (vg_name, lv_name)
    try:
        if size < 0:
            # Negative size: give the volume all remaining space in the group.
            out, err = cmdutils.execute("lvcreate", "-n", lv_name,
                                        "-l", "100%FREE", vg_name,
                                        run_as_root=True,
                                        ignore_exit_code=True)
        else:
            out, err = cmdutils.execute("lvcreate", "-n", lv_name,
                                        "-L", "%sG" % size, vg_name,
                                        run_as_root=True,
                                        ignore_exit_code=True)
        if err:
            logging.error("lvcreate failed:%s", err)
            raise exception.LVCreateError(lv=lv_name, error=err)
        logging.info("create lv end, next is format the file system")

        # The file system is always ext4.
        out, err = cmdutils.execute("mkfs.ext4", device,
                                    run_as_root=True, ignore_exit_code=True)
        if err:
            logging.error("lv format file system error:%s", err)
            raise exception.LVSyncFormatFailed(lv=lv_name)

        mount_point = os.path.join(constants.LV_PATH_PREFIX, lv_name)
        if not os.path.exists(mount_point):
            os.makedirs(mount_point)
        out, err = cmdutils.execute("mount", device, mount_point,
                                    run_as_root=True, ignore_exit_code=True)
        if err:
            logging.error("mount lv error:%s", err)
            raise exception.LVMountError(lv=lv_name,
                                         mount_point=mount_point, error=err)
        # Make the mount persistent across reboots.
        cmdutils.run_cmd(
            "echo /dev/mapper/%s-%s %s ext4 defaults 1 2 >> /etc/fstab" %
            (vg_name, lv_name, mount_point))
        logging.info("lvcreate success, vg:%s, lv:%s, size:%s",
                     vg_name, lv_name, size)
        return {"mount_point": mount_point}
    except Exception as e:
        logging.error("create lv failed:%s", e)
        # Clean up only the volume this call tried to create. The target must
        # be the full LV path: a bare volume-group name makes lvremove delete
        # every logical volume in that group.
        cmdutils.execute("lvremove", "-f", device,
                         run_as_root=True, ignore_exit_code=True)
        raise
def _write_fstab(self, nfs_server, name):
    """Replace the /etc/fstab entry for an NFS share.

    Lines matching ``nfs_<name>`` are deleted first, then a new entry that
    mounts ``nfs_server`` on the share's mount point is appended.

    :param nfs_server: NFS source written as the first fstab field
    :param name: share name; the mount point is
        ``constants.NFS_MOUNT_POINT_PREFIX + name``
    """
    drop_old_entry = "sed -i '/{}/d' /etc/fstab".format('nfs_' + name)
    cmdutils.run_cmd(drop_old_entry)

    entry_template = (
        "echo {} {} nfs defaults,nosuid,noexec,nodev,noatime,nodiratime,"
        "vers=3,rsize=1048576,wsize=1048576 1 2 >> /etc/fstab"
    )
    mount_point = constants.NFS_MOUNT_POINT_PREFIX + name
    cmdutils.run_cmd(entry_template.format(nfs_server, mount_point))
def _run_cmd(self, cmd_str, exception=None):
    """Run a shell command, optionally raising when it exits non-zero.

    :param cmd_str: shell command line to execute
    :param exception: exception class (accepting an ``error`` keyword) to
        raise on a non-zero exit code; when omitted, failures are only
        reported through the returned code
    :return: ``(code, out)`` as returned by ``cmdutils.run_cmd``
    """
    code, out = cmdutils.run_cmd(cmd_str)
    must_raise = bool(exception) and code != 0
    if must_raise:
        raise exception(error=out)
    return code, out
def config_backup(self, vip, netmask, sensitivity, quorum_ip, master_ip,
                  backup_ip, backup_nic):
    """Configure this node as the HA backup: MySQL replication slave plus
    keepalived.

    :param vip: virtual IP managed by keepalived
    :param netmask: netmask of the VIP, converted with ``self._exchange_mask``
    :param sensitivity: written to the check script; ``sensitivity + 2`` is
        written to the keepalived configuration
    :param quorum_ip: quorum address written to the check script
    :param master_ip: address of the master node
    :param backup_ip: address of this (backup) node
    :param backup_nic: NIC on this node used by keepalived
    :return: a result whose ``data`` holds ``log_file`` and ``log_pos``
        parsed from ``show master status``; a "ConfigBackupHAError" result
        if any step raised
    """
    try:
        # 1. Add the replication parameters to the [mysqld] section of the
        #    MariaDB server configuration.
        self._update_conf(self.mysql_cnf, self.backup_content,
                          keyword="[mysqld]\n")

        # 2. Wipe all MySQL data on the backup node, then start mariadb.
        self.del_file(self.mysql_data_path)
        self._run_cmd("systemctl enable --now mariadb", EnableHaException)

        # 3. Set the MySQL root password. A node that never ran HA has no
        #    password yet; one that did already has it, so this step is
        #    best-effort and a failure must not abort the configuration.
        try:
            cmdutils.run_cmd(
                "mysql -u{user} -e \"ALTER USER 'root'@'localhost' IDENTIFIED BY '{pwd}';\""
                .format(user=self.db_user, pwd=self.db_pwd))
        except Exception as e:
            logging.warning("set mysql root password failed, continue: %s", e)

        # 4. Create the replication account.
        self._run_cmd(
            "mysql -u{user} -p{pwd} -e \"GRANT REPLICATION SLAVE ON *.* TO 'replicater'@'%' IDENTIFIED BY '{pwd}';"
            "flush privileges;\"".format(user=self.db_user, pwd=self.db_pwd),
            EnableHaException)

        # 5. Import the dump, which includes the statements that set up
        #    replication.
        self._run_cmd(
            "mysql -u{user} -p{pwd} < {file}".format(
                user=self.db_user, pwd=self.db_pwd, file=self.db_dump_file),
            EnableHaException)
        time.sleep(2)

        # 6. Read this node's binlog file and position. The output is
        #    tab-separated with a header row, so the file name is the last
        #    line of the 4th field from the end and the position is the 3rd
        #    field from the end.
        code, out = self._run_cmd(
            "mysql -u{user} -p{pwd} -e \"show master status;\"".format(
                user=self.db_user, pwd=self.db_pwd),
            EnableHaException)
        fields = out.split('\t')
        log_file = fields[-4].split('\n')[-1]
        log_pos = fields[-3]
        self._remove_file(self.db_dump_file)

        # 7. Start replication. Temporary approach: give the slave a fixed
        #    time to sync; this could later poll the replication status.
        self._run_cmd(
            "mysql -u{user} -p{pwd} -e \"start slave;\"".format(
                user=self.db_user, pwd=self.db_pwd),
            EnableHaException)
        time.sleep(3)

        # 8. Write keepalived.conf, the check script (quorum IP,
        #    sensitivity) and the notify script.
        backup_content = [
            self.keep_content[0] % (self.check_brain_file, sensitivity + 2,
                                    backup_nic, backup_ip, master_ip, 90),
            self.keep_content[1] % (
                self.notify_sh_file, self.notify_sh_file,
                self.notify_sh_file, self.notify_sh_file,
                '%s/%s' % (vip, self._exchange_mask(netmask)), backup_nic),
        ]
        self._update_conf(self.keep_cnf, backup_content)
        self._update_conf(
            self.check_brain_file,
            [self.check_brain_content[0] % (sensitivity, quorum_ip)])
        self._update_conf(
            self.notify_sh_file,
            [self.notify_sh_content[0] % (self.db_pwd, self.flag_file)])

        ret = get_error_result(data={
            "log_file": log_file,
            "log_pos": log_pos,
        })
    except Exception as e:
        logging.exception(str(e), exc_info=True)
        ret = get_error_result("ConfigBackupHAError")
    return ret