class Instence(object):
    def __init__(self, role, i, config=None):
        """
        Describe one service instance of the given role.

        During initialization the cds/idx and mond/idx directories must
        not be created.

        role   -- service role. "mond", "cds", "nfs", "ftp" and "redis"
                  select a command and a home directory; any other value
                  leaves both as None.
        i      -- service index, converted with int().
        config -- configuration object; Config() is built when None.

        After the role setup the instance reads <home>/status/nid into
        self.nid (best effort) and probes the home directory by writing
        and removing a scratch file; a failed probe sets
        self.disk_status to errno.EIO.
        """
        self.config = config
        if self.config is None:
            self.config = Config()

        self.role = role
        self.service = int(i)
        self.cmd = None
        self.home = None

        if self.role == "mond":
            self.cmd = self.config.uss_mond
            self.home = os.path.join(self.config.home,
                                     "data/%s/%s" % (self.role, self.service))
            os.system("touch %s/fake" % (self.home))
        elif self.role == "cds":
            self.cmd = self.config.uss_cds
            self.home = os.path.join(self.config.home,
                                     "data/%s/%s" % (self.role, self.service))
        elif self.role == "nfs":
            os.system("mkdir -p %s/data/nfs/0" % (self.config.home))
            self.cmd = self.config.uss_ynfs
            self.home = os.path.join(self.config.home, "data/nfs/0")
        elif self.role == "ftp":
            os.system("mkdir -p %s/data/ftp/0" % (self.config.home))
            self.cmd = self.config.uss_ftp
            self.home = os.path.join(self.config.home, "data/ftp/0")
        elif self.role == "redis":
            os.system("mkdir -p %s/data/redis" % (self.config.home))
            self.cmd = None
            self.home = os.path.join(self.config.home,
                                     "data/%s/%s" % (self.role, self.service))

        self.name = self.home
        self.disk_status = 0
        self.pid = -1
        self.ppid = -1
        self.deleting = False
        self.deleted = False
        self.skiped = False
        self.nomount = False

        # The nid file is optional: any failure to read it simply leaves
        # self.nid as None.
        self.nid = None
        try:
            self.nid = get_value(self.home + "/status/nid").strip()
        except Exception:
            pass

        # Write/remove a uniquely named scratch file to check that the home
        # directory is usable.  A home of None (unknown role) fails here
        # too, because the string concatenation raises.
        try:
            tmp = self.home + '/check_' + str(random.random())
            set_value(tmp, "test")
            os.unlink(tmp)
        except Exception as e:
            derror(e)
            self.disk_status = errno.EIO
def __redis_start(self):
    """
    Launch redis-server with <workdir>/config/redis.conf and keep
    relaunching until a pid can be read from <workdir>/run/redis-server.pid.

    The parsed pid is stored in self.redis_pid.  There is no upper bound
    on the number of attempts.
    """
    pid_path = os.path.join(self.workdir, 'run/redis-server.pid')
    launch = "redis-server %s/config/redis.conf" % (self.workdir)
    dmsg("start: " + launch)

    attempts = 0
    while True:
        os.system(launch)

        if not os.path.exists(pid_path):
            # No pid file yet: report, back off and launch again.
            dwarn("start %s fail, retry %u\n" % (self.name, attempts))
            time.sleep(0.2)
            attempts += 1
            continue

        try:
            self.redis_pid = int(get_value(pid_path))
        except ValueError:
            # The pid file exists but does not hold an integer (yet).
            derror("get pid, value error")
            time.sleep(0.1)
        else:
            return
def _cacheget(self, key, cachedev_name):
    """
    Log one summary line of bcache attributes for a cache device.

    key           -- a single attribute name, or 'all' for cache_mode,
                     writeback_percent, sequential_cutoff and dirty_data.
    cachedev_name -- block device name used under /sys/block.

    Attributes whose sysfs file is missing are reported through derror
    and left out of the summary; the summary itself goes to dmsg.
    """
    wanted = [key]
    if key == 'all':
        wanted = ['cache_mode', 'writeback_percent', 'sequential_cutoff', 'dirty_data']

    pieces = ['%-10s' % cachedev_name]
    for attr in wanted:
        sysfs_path = "/sys/block/%s/bcache/%s" % (cachedev_name, attr)
        if os.path.isfile(sysfs_path):
            raw = _get_value(sysfs_path)
            pieces.append(' - %s: %s' % (attr, raw.strip(' \n')))
        else:
            derror('key: %s not found in %s' % (attr, cachedev_name))

    dmsg(''.join(pieces))
def vipdel(self, group, host, vip):
    """
    Run the vip tool's "del" sub-command on every host of the cluster.

    group -- value passed with -g.
    host  -- value passed with -H, or None to omit the option.
    vip   -- value passed with -v, or None to omit the option.

    host and vip must not both be None; in that case an error is logged
    and the process exits with status 1.
    """
    if host is None and vip is None:
        derror("vip and host can not be null at the same time.")
        sys.exit(1)

    cmd = "python2 %s del -g %s" % (self.config.uss_vip, group)
    if vip is not None:
        cmd = cmd + " -v %s" % (vip)
    if host is not None:
        cmd = cmd + " -H %s" % (host)

    def _warp(h):
        # exec_remote yields a pair; only a non-empty second element is
        # printed.
        x, y = exec_remote(h, cmd)
        if y:
            # Parenthesised single argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print(y)

    args = [[x] for x in self.config.cluster.keys()]
    mutil_exec(_warp, args)
class Instence(object):
    def __init__(self, role, i, config=None):
        """
        Describe one service instance of the given role.

        During initialization the bactl/idx and mdctl/idx directories must
        not be created.

        role   -- service role. "mdctl", "bactl" and "frctl" select a
                  command and a home directory; any other value leaves
                  both as None.
        i      -- service index, converted with int().
        config -- configuration object; Config() is built when None.

        After the role setup the instance reads <home>/status/nid into
        self.nid (best effort) and probes the home directory by writing
        and removing a scratch file; a failed probe sets
        self.disk_status to errno.EIO.
        """
        self.config = config
        if self.config is None:
            self.config = Config()

        self.role = role
        self.service = int(i)
        self.cmd = None
        self.home = None

        if self.role == "mdctl":
            self.cmd = self.config.uss_mdctl
            self.home = os.path.join(self.config.home, "data/%s" % (self.role))
        elif self.role == "bactl":
            self.cmd = self.config.uss_bactl
            self.home = os.path.join(self.config.home, "data/%s" % (self.role))
        elif self.role == "frctl":
            os.system("mkdir -p %s/data/frctl" % (self.config.home))
            self.cmd = self.config.uss_yfrctl
            self.home = os.path.join(self.config.home, "data/frctl")

        self.name = self.home
        self.disk_status = 0
        self.pid = -1
        self.ppid = -1
        self.deleting = False
        self.deleted = False
        self.skiped = False
        self.nomount = False

        # The nid file is optional: any failure to read it simply leaves
        # self.nid as None.
        self.nid = None
        try:
            self.nid = get_value(self.home + "/status/nid").strip()
        except Exception:
            pass

        # Write/remove a uniquely named scratch file to check that the home
        # directory is usable.  A home of None (unknown role) fails here
        # too, because the string concatenation raises.
        try:
            tmp = self.home + '/check_' + str(random.random())
            set_value(tmp, "test")
            os.unlink(tmp)
        except Exception as e:
            derror(e)
            self.disk_status = errno.EIO
def nfs_installed():
    """
    Check that /usr/bin/ganesha.nfsd exists and that `ganesha.nfsd -v` runs.

    Returns False (after logging through derror) when the binary is
    missing, when stat fails for another reason, or when the version
    command raises Exp.

    NOTE(review): no success return is visible in this part of the
    function; confirm what follows the version probe.
    """
    nfs_bin = '/usr/bin/ganesha.nfsd'
    try:
        os.stat(nfs_bin)
    except OSError as e:
        if e.errno == errno.ENOENT:
            derror('ganesha not found')
        else:
            derror('stat %s failed' % nfs_bin)
        return False

    _exec = '%s -v' % nfs_bin
    try:
        out, _ = exec_shell(_exec, need_return=True)
    except Exp as e:
        # Both values must be passed as one tuple; applying % to e.errno
        # alone raises TypeError instead of logging the failure.
        derror('errno:%d, error:%s' % (e.errno, str(e)))
        return False
def fail_exit(msg):
    """
    Log msg as an error, SIGKILL every process whose `ps -ef` line matches
    test_list.py, test.py or "health", then SIGKILL the current process.
    """
    derror(msg)

    # One shell loop per pattern; columns 9-15 of `ps -ef` are taken as
    # the pid.
    reap = 'for pid in `ps -ef | grep %s | grep -v grep | cut -c 9-15`; do kill -9 $pid; done'
    for pattern in ('test_list.py', 'test.py', '"health"'):
        os.system(reap % pattern)

    own_pid = str(os.getpid())
    os.system('kill -9 ' + own_pid)
#os.system("kill -USR2 %u" % self.pid) dmsg("stop %s %s,%s" % (self.home, self.pid, ttyonly)) #temporary solution for bug #2096 os.system("kill -USR2 %u" % (self.ppid)) os.system("kill -USR2 %u" % (self.pid)) #os.system("kill -9 %u" % self.pid) time.sleep(0.1) if (self.running()): time.sleep(1.5) dwarn("%s, still running, sleep 1" % (self.name), ttyonly) time.sleep(1) if (self.running()): derror("stop %s pid /%u" % (self.name, self.pid), ttyonly) os.system("kill -9 %u" % (self.pid)) i = 0 max_retry = 1000 while True: if (i > max_retry): derror("stop instence %u fail" % (i), ttyonly) return errno.EIO if (self.running() == False): break else: time.sleep(0.01) i = i + 1 def stop(self, ttyonly=False):
def _start(self, ttyonly=False):
    """
    Start the service process for this instance.

    ttyonly -- forwarded unchanged to the logging helpers and to the pid
               lookup.

    Returns None for the redis role (only kernel tuning is done), None
    when the role has no start command, 1 when the instance is refused
    (disk error, not mounted, deleted, skipped, already running), the
    errno of the Exp raised by the pid lookup, and None otherwise.
    """
    if self.role == 'redis':
        # Kernel tuning for redis; no process is started from here.
        os.system("eval 'sysctl vm.overcommit_memory=1'")
        os.system("eval 'echo never > /sys/kernel/mm/transparent_hugepage/enabled' 2>/dev/null")
        return

    cmd = None
    if self.role == "mond":
        cmd = "%s -n %s" % (self.config.uss_mond, self.service)
    elif self.role == "cds":
        cmd = "%s -n %s" % (self.config.uss_cds, self.service)
    elif self.role == "nfs":
        exec_shell("systemctl start rpcbind")
        cmd = "%s --home %s" % (self.config.uss_ynfs, self.home)
    elif self.role == "ftp":
        cmd = "%s --home %s" % (self.config.uss_ftp, self.home)

    refusals = (
        (self.disk_status, 'disk error'),
        (self.nomount, 'no mount'),
        (self.deleted, 'deleted'),
        (self.skiped, 'skiped'),
    )
    for flag, reason in refusals:
        if flag:
            derror(' * %s [%s]' % (cmd, reason), ttyonly)
            return 1

    if self.running():
        dwarn(' * %s [running]' % (cmd), ttyonly)
        return 1

    # This guard has to precede the valgrind wrapping: once wrapped, cmd
    # is always a string and a missing command would be run as
    # "valgrind ... None -f".
    if cmd is None:
        derror(' * %s skip' % (self.home), ttyonly)
        return

    if self.config.testing and self.config.valgrind:
        valgrind = "valgrind --tool=memcheck --leak-check=full --show-reachable=yes -v "
        logdir = "%s/log/" % (self.config.home)
        os.system("mkdir -p %s" % (logdir))
        vallog = "%s/log/valgrind.%s.%s.log" % (self.config.home, self.role, self.service)
        cmd = "%s %s -f >>%s 2>&1 &" % (valgrind, cmd, vallog)
        dmsg(cmd)

    subprocess.call(cmd, shell=True, close_fds=True)

    try:
        self.__getpid(ttyonly)
    except Exp as e:
        dwarn('%s' % (e.err), ttyonly)
        return e.errno
def _collect_createtime(uss_attr):
    """
    Run `<uss_attr> -g create_time /system` with a 10 second timeout.

    uss_attr -- command string placed in front of the arguments.

    An Exp raised by exec_shell is logged through derror together with
    the command line.  The captured output is bound to create_time; no
    further use of it is visible in this part of the function.
    """
    _exec = "%s -g create_time /system" % (uss_attr)
    try:
        create_time, _ = exec_shell(_exec, need_return=True, timeout=10)
    except Exp as e:
        derror("%s : %s" % (_exec, str(e)))
def _collect_statvfs_info(uss_statvfs):
    """
    Run `<uss_statvfs> /system` with a 10 second timeout.

    uss_statvfs -- command string placed in front of the argument.

    An Exp raised by exec_shell is logged through derror together with
    the command line.  The captured output is bound to output; no
    further use of it is visible in this part of the function.
    """
    _exec = "%s /system" % (uss_statvfs)
    try:
        output, _ = exec_shell(_exec, need_return=True, timeout=10)
    except Exp as e:
        derror("%s : %s" % (_exec, str(e)))
derror("%s : %s" % (_exec_attr, str(e))) sys.exit(-1) return key[0:16] def _collect_mac(host=None): _exec_ls = 'ls %s' % (NET_PATH) try: net_interface, _ = exec_remote(host, _exec_ls) except Exp, e: derror("%s : %s" % (_exec_ls, str(e))) sys.exit(-1) net_list = net_interface.split('\n') if len(net_list) == 0: derror("not found any ethernet interface") sys.exit(errno.ENOENT) for net in net_list: if net == 'lo' \ or net == '.' \ or net == '..' \ or net == '': continue else: address = os.path.join(NET_PATH, net, 'address') _exec_cat = 'cat %s' % (address) try: mac, _ = exec_remote(host, _exec_cat) except Exp, e: derror("%s : %s" % (_exec_cat, str(e))) sys.exit(-1)
def _check_zk(node): zk_fail = True retry = 3 while (retry > 0): try: exec_shell(node.config.uss_zk) zk_fail = False break except Exp, e: derror("zk fence retry %s. %s" % (retry, e)) time.sleep(3) retry = retry - 1 if zk_fail: derror("zk fence fail. %s" % (e)) node.stop() drbd_secondary() node.start() def drbd_fence(): drbd_conf = "/etc/drbd.d/mds.res" if not os.path.isfile(drbd_conf): return None config = Config() node = Node(config) try: _check_dual_mount()
# 确保/nfs-ganesha存在 _exec = '%s %s' % (config.uss_mkdir, share_dir) try: exec_shell(_exec) except Exp, e: if e.errno != errno.EEXIST: derror('mkdir %s failed' % (share_dir)) sys.exit(e.errno) _exec_mount = 'mount -t nfs4 127.0.0.1:%s %s' % (share_dir, mount_point) try: exec_shell(_exec_mount, timeout=10) return True except Exp, e: derror("%s : %s\n" % (_exec_mount, str(e))) return False def umount_nfsv4(mount_point='/mnt/nfs'): _exec = 'umount -l %s' % mount_point try: exec_shell(_exec) except Exp, e: derror('umount %s failed' % (mount_point)) sys.exit(e.errno) def prepare_mount_nfs(config): if not nfs_installed(): sys.exit(-1) # 判断nfs是否正常运行
elif (t == 'all'): target = test_mkdir("/testfile") test.append(File_test(target, length)) """ target = test_mkdir("/testdir") test.append(Dir_test(target, length)) target = test_mkdir("/testattr") test.append(Attr_test(target, length)) target = test_mkdir("/small") target = test_mkdir("/small/testfile") test.append(File_test(target, length)) """ derror("ec test disabled") """ target = test_mkdir("/testdir_ec", ec) test.append(Dir_test(target, length)) target = test_mkdir("/testfile_ec", ec) test.append(File_test(target, length)) target = test_mkdir("/small/testfile_ec", ec) """ else: assert False, 'oops, unhandled option: %s, -h for help' % t exit(1) for i in test: try:
newopts = copy.copy(opts) for o, a in opts: if o in ('--cachedev'): cachedev = a newopts.remove((o, a)) elif o in ('--coredev'): coredev = a newopts.remove((o, a)) elif o in ('--force'): force = True newopts.remove((o, a)) try: bcache_manage = BcacheManage() except Exp, e: derror(e.err) exit(e.errno) for o, a in newopts: if o in ('--help'): usage() exit(0) elif (o == '--bind_cache'): op = o if '--cachedev' in args: idx = args.index('--cachedev') if len(args) <= idx: usage() exit(errno.EINVAL) cachedev = args[idx+1] args.remove(cachedev)
target = test_mkdir(config, "/testdir_ec", ec) test.append(Dir_test(target, length, config)) target = test_mkdir(config, "/testfile_ec", ec) test.append(File_test(target, length, config)) target = test_mkdir(config, "/small/testfile_ec", ec) else: assert False, 'oops, unhandled option: %s, -h for help' % t exit(1) for i in test: try: i.create() except Exp, e: derror(e.err) os.system('kill -9 ' + str(os.getpid())) for i in test: try: i.update() except Exp, e: derror(e.err) os.system('kill -9 ' + str(os.getpid())) for i in test: try: i.check() except Exp, e: derror(e.err) os.system('kill -9 ' + str(os.getpid()))
class Redisd():
    """
    Manage one redis-server instance rooted at a work directory, using
    etcd keys under /sdfs for port, slot and dbversion registration.
    """

    def __init__(self, workdir, diskid, disk_idx):
        """
        workdir  -- instance directory; its last path component must be
                    an integer and becomes self.localid.
        diskid   -- stored as self.diskid.
        disk_idx -- stored as self.disk_idx; compared against the second
                    field of the etcd wait entries during registration.

        Connects to etcd at 127.0.0.1:2379, picks up an existing pid
        file, creates <workdir>/run and loads the local and global
        layout when they are available.
        """
        self.name = ""
        self.config = Config()
        self.uuid = str(uuid.uuid1())
        self.workdir = workdir
        self.diskid = diskid
        self.localid = int(self.workdir.split('/')[-1])
        self.disk_idx = disk_idx
        self.hostname = socket.gethostname()
        self.etcd = etcd.Client(host='127.0.0.1', port=2379)
        self.id = None
        self.redis_pid = -1
        self.lock = False
        self.volume = None
        self.running = True
        self.replica_info = None
        self.disk_check = time.time()

        self.__redis_pid(False)
        os.system('mkdir -p ' + self.workdir + '/run')
        self.__layout_local()
        self.__layout_global()

    def __slot_key(self, key):
        """Return the etcd path of `key` under this instance's volume slot."""
        return "/sdfs/volume/%s/slot/%d/%s" % (self.volume, self.id[0], key)

    def __layout_local(self):
        """
        Load volume, id and port from <workdir>/config.

        The id file holds the text of a two-element tuple, as written by
        str(reg) during registration.  Returns True on success; on any
        error sets self.port to None and returns False.
        """
        config = os.path.join(self.workdir, "config")
        try:
            self.volume = get_value(config + "/volume")
            if self.volume[-1] == '\n':
                self.volume = self.volume[:-1]

            v = get_value(config + "/id")
            t = v[1:-1].split(',')
            self.id = (int(t[0]), int(t[1]))

            v = get_value(config + "/port")
            self.port_idx = int(v)
            self.port = str(self.config.redis_baseport + self.port_idx)
            return True
        except Exception:
            # A missing or malformed config directory only means that the
            # instance has not been initialised yet.
            self.port = None
            return False

    def __layout_global(self):
        """
        Read the volume's sharding and replica counts from etcd.

        Returns False when no volume is set or a key is missing.
        """
        if not self.volume:
            return False

        try:
            key = "/sdfs/volume/" + self.volume + "/sharding"
            self.sharding = int(self.etcd.read(key).value)
            key = "/sdfs/volume/" + self.volume + "/replica"
            self.replica = int(self.etcd.read(key).value)
            return True
        except etcd.EtcdKeyNotFound:
            return False

    def __init_redisconf(self, path, hostname):
        """
        Copy etc/redis.conf.tpl to <path>/redis.conf, create
        <workdir>/data and set port, dir, pidfile, logfile and bind in
        the copy.  Always returns True.
        """
        src = os.path.join(self.config.home, "etc/redis.conf.tpl")
        dist = os.path.join(path, "redis.conf")
        os.system("cp " + src + " " + dist)
        os.system("mkdir -p " + self.workdir + "/data")

        _check_config(dist, "port", " ", self.port, True)
        _check_config(dist, "dir", " ", self.workdir + "/data", True)
        _check_config(dist, "pidfile", " ", self.workdir + "/run/redis-server.pid", True)
        _check_config(dist, "logfile", " ", self.workdir + "/redis.log", True)
        _check_config(dist, "bind", " ", hostname, True)
        return True

    def __init_register_port(self, path, hostname):
        """
        Reuse the port index stored in <path>/port, or claim the first
        free index below NODE_PORT in etcd and store it there.

        Returns False when every index is already taken.
        """
        prefix = "/sdfs/redis/%s" % (hostname)
        if os.path.exists(path + "/port"):
            self.port_idx = int(get_value(path + "/port"))
            self.port = str(self.config.redis_baseport + self.port_idx)
            return True

        idx = None
        for i in range(NODE_PORT):
            key = prefix + "/port/" + str(i)
            try:
                # prevExist=False makes the write fail when the key is
                # already there, so a successful write claims the index.
                self.etcd.write(key, "", prevExist=False)
                idx = i
                break
            except etcd.EtcdAlreadyExist:
                continue

        if idx is None:
            derror("no port space in " + prefix)
            return False

        dmsg("register port " + str(idx))
        set_value(path + "/port", str(idx))
        self.port_idx = idx
        self.port = str(self.config.redis_baseport + self.port_idx)
        return True

    def __init_register_new__(self, slot, replica, addr):
        """
        Create the etcd key for (slot, replica) with value addr.

        Returns False when the key already exists.
        """
        key = "/sdfs/volume/%s/slot/%d/redis/%d" % (self.volume, slot, replica)
        dmsg("set (%s, %s)" % (key, addr))
        try:
            self.etcd.write(key, addr, prevExist=False)
            return True
        except etcd.EtcdAlreadyExist:
            dmsg("key %s exist" % (key))
            return False

    def __init_register_new(self, path, hostname, reg):
        """
        Register a port and the (slot, replica) pair `reg`, then persist
        id and volume under `path`.  Returns False if either
        registration step fails.
        """
        if not self.__init_register_port(path, hostname):
            return False

        self.hostname = hostname
        addr = self.__redis_addr()
        if not self.__init_register_new__(reg[0], reg[1], addr):
            derror("register fail")
            return False

        dmsg("register %s %s " % (self.volume, str(reg)))
        set_value(path + "/id", str(reg))
        set_value(path + "/volume", self.volume)
        self.id = reg
        return True

    def __etcd_create(self, key, value):
        """Write a slot key with prevExist=False."""
        self.etcd.write(self.__slot_key(key), value, prevExist=False)

    def __etcd_set(self, key, value):
        """Write a slot key unconditionally."""
        self.etcd.write(self.__slot_key(key), value)

    def __etcd_get(self, key):
        """Return the value stored under a slot key."""
        return self.etcd.read(self.__slot_key(key)).value

    def __etcd_delete(self, key):
        """Delete a slot key."""
        self.etcd.delete(self.__slot_key(key))

    def __etcd_update_dbversion(self):
        """
        Increment the slot's dbversion with a compare-and-swap on the
        key's modifiedIndex.

        Returns the new version, or -1 when the compare fails.
        """
        key = self.__slot_key("dbversion")
        res = self.etcd.read(key)
        idx = res.modifiedIndex
        version = int(res.value)
        try:
            self.etcd.write(key, version + 1, prevIndex=idx)
        except etcd.EtcdCompareFailed:
            derror("%s update %s fail" % (self.workdir, key))
            return -1
        return version + 1

    def redis_stop(self):
        """
        Send SIGTERM to the recorded redis pid (best effort) and remove
        the pid file.
        """
        # redis_pid keeps its initial value of -1 until a pid file has
        # been read; kill(-1, ...) would signal every process this one
        # may signal, so only a real pid is ever signalled.
        if self.redis_pid > 0:
            try:
                os.kill(self.redis_pid, SIGTERM)
            except Exception:
                # The process may already be gone.
                pass

        pidfile = os.path.join(self.workdir, 'run/redis-server.pid')
        cmd = "rm %s > /dev/null 2>&1" % (pidfile)
        try:
            os.system(cmd)
        except OSError:
            pass

    def __redis_pid(self, force):
        """
        Load self.redis_pid from the pid file when the file exists.

        `force` is accepted but not used.
        """
        pidfile = os.path.join(self.workdir, 'run/redis-server.pid')
        if os.path.exists(pidfile):
            self.redis_pid = int(get_value(pidfile))

    def __redis_start(self):
        """
        Launch redis-server and keep relaunching until a pid can be read
        from the pid file; the pid is stored in self.redis_pid.  There
        is no upper bound on the number of attempts.
        """
        cmd = "redis-server %s/config/redis.conf" % (self.workdir)
        pidfile = os.path.join(self.workdir, 'run/redis-server.pid')
        dmsg("start: " + cmd)

        retry = 0
        while True:
            os.system(cmd)
            if os.path.exists(pidfile):
                try:
                    self.redis_pid = int(get_value(pidfile))
                    break
                except ValueError:
                    derror("get pid, value error")
                    time.sleep(0.1)
            else:
                dwarn("start %s fail, retry %u\n" % (self.name, retry))
                time.sleep(0.2)
                retry = retry + 1

    def __init_redis_master(self, config):
        """
        Create the slot's dbversion key with value "0" unless it exists.

        `config` is accepted but not used.
        """
        try:
            self.__etcd_create("dbversion", "0")
            dmsg("create dbversion " + str(self.id))
        except etcd.EtcdAlreadyExist:
            dmsg("dbversion exist")

    def __init_redis_slave(self, config):
        """
        Poll etcd every 0.1s until the slot's dbversion key is readable.

        Returns True once the key is read and False once the retry
        counter has passed 100; falls through (None) when self.running
        turns False first.  `config` is accepted but not used.
        """
        retry = 0
        while self.running:
            try:
                self.__etcd_get("dbversion")
                return True
            except Exception:
                if retry > 100:
                    dwarn("get dbversion " + str(self.id) + " fail")
                    return False
                time.sleep(0.1)
                retry = retry + 1

    def __init_redis(self, config):
        """
        Loop while running: with the lock held initialise as master,
        otherwise wait as slave for the dbversion key.  Always returns
        True.
        """
        while self.running:
            if self.__lock():
                self.__init_redis_master(config)
                break
            if self.__init_redis_slave(config):
                break
        return True

    def __register_get(self):
        """
        Find and consume the etcd wait entry addressed to this host and
        disk.

        Each wait value is "<hostname>,<disk_idx>[,...]".  Returns the
        (slot, replica) pair of the matching entry after deleting it, or
        None when there is no match.
        """
        for i in range(self.sharding):
            for j in range(self.replica):
                key = "/sdfs/volume/%s/wait/%d/redis/%d.wait" % (self.volume, i, j)
                try:
                    value = self.etcd.read(key).value
                    array = value.split(",")
                    if array[0] == self.hostname and int(array[1]) == self.disk_idx:
                        dmsg("use %s" % (value))
                        self.etcd.delete(key)
                        return (i, j)
                except Exception:
                    # Missing or malformed entries are skipped.
                    continue
        return None

    def init(self, volume):
        """
        Initialise this work directory as a redis instance of `volume`.

        The configuration is built in <workdir>/config.tmp and renamed
        to <workdir>/config once registration, redis.conf generation and
        the master/slave handshake have succeeded.  Returns True on
        success and False on any failure, including an already existing
        <workdir>/config.
        """
        self.volume = volume
        if not self.__layout_global():
            dwarn("load global layout fail")
            return False

        res = self.__register_get()
        if res is None:
            return False

        dmsg("register %s to volume %s slot(%u, %u)" % (self.workdir, self.volume, res[0], res[1]))

        path = self.workdir
        if os.path.exists(path + "/config"):
            derror("%s already inited" % (path))
            return False

        config_tmp = os.path.join(path, "config.tmp")
        config = os.path.join(path, "config")
        os.system("mkdir -p " + config_tmp)

        if not self.__init_register_new(config_tmp, socket.gethostname(), res):
            dwarn("init register fail")
            return False

        if not self.__init_redisconf(config_tmp, socket.gethostname()):
            dwarn("init redis.conf fail")
            return False

        if not self.__init_redis(config_tmp):
            dwarn("init redis fail")
            return False

        self.running = False
        os.system("mv " + config_tmp + " " + config)
        self.__layout_local()
        return True

    def __redis_ismaster(self):
        """
        Return True when the lock is held and redis-cli reports the
        master role.

        Returns False when the lock is not held, when the instance stops
        running before the role query succeeds, or when the reported
        role is not master (the lock is released in that last case).
        """
        if not self.lock.is_acquired:
            dwarn("%s not locked" % (self.workdir))
            return False

        cmd = "redis-cli -h %s -p %s info replication | grep role | awk -F ':' '{print $2}'" % (
            self.hostname, self.port)

        while self.running:
            try:
                res = exec_shell(cmd, need_return=True, p=False)
                break
            except Exp:
                dwarn("cmd %s fail\n" % (cmd))
                time.sleep(1)
        else:
            # The loop ended because the instance stopped running, so no
            # result was ever obtained.
            return False

        r = res[0]
        if r.find("master") != -1:
            return True

        derror("%s lost master, role %s" % (self.workdir, r))
        self.lock.release()
        return False