def distribute(master, slave):
    if rconf.args.use_gconf_volinfo:
        mvol = VolinfoFromGconf(master.volume, master=True)
    else:
        mvol = Volinfo(master.volume, master.host)
    logging.debug('master bricks: ' + repr(mvol.bricks))
    prelude = []
    slave_host = None
    slave_vol = None

    prelude = [gconf.get("ssh-command")] + \
        gconf.get("ssh-options").split() + \
        ["-p", str(gconf.get("ssh-port"))] + \
        [slave.remote_addr]

    logging.debug('slave SSH gateway: ' + slave.remote_addr)

    if rconf.args.use_gconf_volinfo:
        svol = VolinfoFromGconf(slave.volume, master=False)
    else:
        svol = Volinfo(slave.volume, "localhost", prelude)

    sbricks = svol.bricks
    suuid = svol.uuid
    slave_host = slave.remote_addr.split('@')[-1]
    slave_vol = slave.volume

    # save this xattr for the session delete command
    old_stime_xattr_prefix = gconf.get("stime-xattr-prefix", None)
    new_stime_xattr_prefix = "trusted.glusterfs." + mvol.uuid + "." + \
        svol.uuid
    if not old_stime_xattr_prefix or \
       old_stime_xattr_prefix != new_stime_xattr_prefix:
        gconf.setconfig("stime-xattr-prefix", new_stime_xattr_prefix)

    logging.debug('slave bricks: ' + repr(sbricks))

    slavenodes = set((b['host'], b["uuid"]) for b in sbricks)
    rap = SSH.parse_ssh_address(slave)
    slaves = [(rap['user'] + '@' + h[0], h[1]) for h in slavenodes]

    workerspex = []
    for idx, brick in enumerate(mvol.bricks):
        if rconf.args.local_node_id == brick['uuid']:
            is_hot = mvol.is_hot(":".join([brick['host'], brick['dir']]))
            workerspex.append((brick,
                               slaves[idx % len(slaves)],
                               get_subvol_num(idx, mvol, is_hot),
                               is_hot))

    logging.debug('worker specs: ' + repr(workerspex))
    return workerspex, suuid, slave_vol, slave_host, master, slavenodes
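
# Illustration (not gsyncd code): the brick-to-slave pairing in distribute()
# is a plain round-robin over the slave node list (idx % len(slaves)). The
# values below are hypothetical; the real function also filters bricks by
# local_node_id and attaches a subvolume number and hot-tier flag.
master_bricks = ["/bricks/b0", "/bricks/b1", "/bricks/b2", "/bricks/b3"]
slaves = [("geoaccount@snode1", "uuid-1"), ("geoaccount@snode2", "uuid-2")]

pairs = []
for idx, brick in enumerate(master_bricks):
    # each master brick gets one slave endpoint, wrapping around the list
    pairs.append((brick, slaves[idx % len(slaves)]))

# pairs:
# [("/bricks/b0", ("geoaccount@snode1", "uuid-1")),
#  ("/bricks/b1", ("geoaccount@snode2", "uuid-2")),
#  ("/bricks/b2", ("geoaccount@snode1", "uuid-1")),
#  ("/bricks/b3", ("geoaccount@snode2", "uuid-2"))]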
def subcmd_status(args):
    from gsyncdstatus import GeorepStatus

    master_name = args.master.replace(":", "")
    slave_data = args.slave.replace("ssh://", "")

    brick_status = GeorepStatus(gconf.get("state-file"),
                                "",
                                args.local_path,
                                "",
                                master_name,
                                slave_data,
                                gconf.get("pid-file"))
    checkpoint_time = gconf.get("checkpoint", 0)
    brick_status.print_status(checkpoint_time=checkpoint_time)
def finalize(*args, **kwargs):
    """all those messy final steps we go through upon termination

    Do away with pidfile, ssh control dir and logging.
    """
    final_lock.acquire()
    if gconf.get('pid_file'):
        rm_pidf = rconf.pid_file_owned
        if rconf.cpid:
            # exit path from parent branch of daemonization
            rm_pidf = False
            while True:
                f = grabpidfile(setpid=False)
                if not f:
                    # child has already taken over pidfile
                    break
                if os.waitpid(rconf.cpid, os.WNOHANG)[0] == rconf.cpid:
                    # child has terminated
                    rm_pidf = True
                    break
                time.sleep(0.1)
        if rm_pidf:
            try:
                os.unlink(rconf.pid_file)
            except:
                ex = sys.exc_info()[1]
                if ex.errno == ENOENT:
                    pass
                else:
                    raise

    if rconf.ssh_ctl_dir and not rconf.cpid:
        def handle_rm_error(func, path, exc_info):
            if exc_info[1].errno == ENOENT:
                return
            raise exc_info[1]

        shutil.rmtree(rconf.ssh_ctl_dir, onerror=handle_rm_error)

    # Unmount if not done
    if rconf.mount_point:
        if rconf.mountbroker:
            umount_cmd = rconf.mbr_umount_cmd + [rconf.mount_point, 'lazy']
        else:
            umount_cmd = ['umount', '-l', rconf.mount_point]
        p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE)
        _, errdata = p0.communicate()
        if p0.returncode == 0:
            try:
                os.rmdir(rconf.mount_point)
            except OSError:
                pass
        else:
            pass

    if rconf.log_exit:
        logging.info("exiting.")
    sys.stdout.flush()
    sys.stderr.flush()
    os._exit(kwargs.get('exval', 0))
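
# Illustration (not gsyncd code): handle_rm_error above swallows ENOENT so a
# cleanup race (another process already removed the path) is not fatal. The
# same pattern in isolation, on a hypothetical directory:
import errno
import shutil

def ignore_enoent(func, path, exc_info):
    # tolerate paths that vanished between listing and removal
    if exc_info[1].errno == errno.ENOENT:
        return
    raise exc_info[1]

# succeeds even if parts of the tree (or the tree itself) disappear mid-walk
shutil.rmtree("/tmp/example-ssh-ctl-dir", onerror=ignore_enoent)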
def subcmd_monitor_status(args):
    from gsyncdstatus import set_monitor_status
    from rconf import rconf

    set_monitor_status(gconf.get("state-file"), args.status)
    rconf.log_exit = False
    logging.info(lf("Monitor Status Change", status=args.status))
def startup(go_daemon=True):
    """set up logging, pidfile grabbing, daemonization"""
    pid_file = gconf.get("pid-file")
    if not grabpidfile():
        sys.stderr.write("pidfile is taken, exiting.\n")
        sys.exit(2)
    rconf.pid_file_owned = True

    if not go_daemon:
        return

    x, y = os.pipe()
    cpid = os.fork()
    if cpid:
        os.close(x)
        sys.exit()
    os.close(y)
    os.setsid()
    dn = os.open(os.devnull, os.O_RDWR)
    for f in (sys.stdin, sys.stdout, sys.stderr):
        os.dup2(dn, f.fileno())

    if not grabpidfile(pid_file + '.tmp'):
        raise GsyncdError("cannot grab temporary pidfile")

    os.rename(pid_file + '.tmp', pid_file)

    # wait for parent to terminate
    # so we can start up with
    # no messing from the dirty
    # ol' bustard
    select((x, ), (), ())
    os.close(x)
def startup(go_daemon=True):
    """set up logging, pidfile grabbing, daemonization"""
    pid_file = gconf.get("pid-file")
    if not grabpidfile():
        sys.stderr.write("pidfile is taken, exiting.\n")
        sys.exit(2)
    rconf.pid_file_owned = True

    if not go_daemon:
        return

    x, y = pipe()
    cpid = os.fork()
    if cpid:
        os.close(x)
        sys.exit()
    os.close(y)
    os.setsid()
    dn = os.open(os.devnull, os.O_RDWR)
    for f in (sys.stdin, sys.stdout, sys.stderr):
        os.dup2(dn, f.fileno())

    if not grabpidfile(pid_file + '.tmp'):
        raise GsyncdError("cannot grab temporary pidfile")

    os.rename(pid_file + '.tmp', pid_file)

    # wait for parent to terminate
    # so we can start up with
    # no messing from the dirty
    # ol' bustard
    select((x,), (), ())
    os.close(x)
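
# Illustration (not gsyncd code): both startup() variants serialize the
# hand-over with the same trick -- the child blocks in select() on the read
# end of a pipe and wakes up only when the parent exits, because process exit
# closes the parent's write end and the pipe becomes readable at EOF.
# A minimal, self-contained sketch of that handshake:
import os
from select import select

r, w = os.pipe()
pid = os.fork()
if pid:
    # parent: keep the write end until exit, mirror startup()'s os.close(x)
    os.close(r)
    os._exit(0)          # write end is closed implicitly on exit
else:
    # child: block until the parent is gone (pipe readable at EOF)
    os.close(w)
    select((r,), (), ())
    os.close(r)
    print("parent has terminated, child continues")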
def multiplex(self, wspx, suuid, slave_vol, slave_host, master, slavenodes):
    argv = [os.path.basename(sys.executable), sys.argv[0]]

    cpids = set()
    ta = []
    for wx in wspx:
        def wmon(w):
            cpid, _ = self.monitor(w, argv, cpids, slave_vol,
                                   slave_host, master, suuid, slavenodes)
            time.sleep(1)
            self.lock.acquire()
            for cpid in cpids:
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
            self.lock.release()
            finalize(exval=1)

        t = Thread(target=wmon, args=[wx])
        t.start()
        ta.append(t)

    # Monitor status was being updated in each monitor thread. It should
    # not be done there, as it can deadlock a worker start.
    # set_monitor_status uses flock to synchronize multiple instances
    # updating the file. Since each monitor thread forks a worker, those
    # processes can hold a reference to the fd of the status file,
    # deadlocking workers which start later because the flock is not
    # released until all references to the same fd are closed. It would
    # also leak fds.
    self.lock.acquire()
    set_monitor_status(gconf.get("state-file"), self.ST_STARTED)
    self.lock.release()

    for t in ta:
        t.join()
def subcmd_status(args):
    from gsyncdstatus import GeorepStatus

    master_name = args.master.replace(":", "")
    slave_data = args.slave.replace("ssh://", "")

    brick_status = GeorepStatus(gconf.get("state-file"),
                                "",
                                args.local_path,
                                "",
                                master_name,
                                slave_data,
                                gconf.get("pid-file"))
    checkpoint_time = gconf.get("checkpoint", 0)
    brick_status.print_status(checkpoint_time=checkpoint_time,
                              json_output=args.json)
def bricks(self):
    pfx = "master-" if self.master else "slave-"
    bricks_data = gconf.get(pfx + "bricks")
    if bricks_data is None:
        return []

    bricks_data = bricks_data.split(",")
    bricks_data = [b.strip() for b in bricks_data]
    out = []
    for b in bricks_data:
        parts = b.split(":")
        b_uuid = None
        if self.is_uuid(parts[0]):
            b_uuid = parts[0]
            # drop the uuid, keep the remaining parts
            parts = parts[1:]

        if self.possible_path(parts[-1]):
            bpath = parts[-1]
            # drop the path, keep the remaining parts
            parts = parts[0:-1]

        out.append({
            # re-join the remaining parts: the host may be an IPv6 address
            # and therefore contain colons itself
            "host": ":".join(parts),
            "dir": bpath,
            "uuid": b_uuid
        })

    return out
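
# Illustration (not gsyncd code): the split/re-join dance above exists because
# the host portion of a "uuid:host:path" brick string may itself contain
# colons when it is an IPv6 address. The uuid and path values are made up.
parts_v4 = "1a2b3c:node1.example.com:/bricks/b0".split(":")
# ['1a2b3c', 'node1.example.com', '/bricks/b0']  -> three clean parts

parts_v6 = "1a2b3c:fd00::10:/bricks/b0".split(":")
# ['1a2b3c', 'fd00', '', '10', '/bricks/b0']
# after peeling off the uuid (first part) and the brick path (last part),
# the host is recovered by re-joining whatever remains:
host = ":".join(parts_v6[1:-1])   # 'fd00::10'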
def grabpidfile(fname=None, setpid=True):
    """.grabfile customization for pid files"""
    if not fname:
        fname = gconf.get("pid-file")
    content = None
    if setpid:
        content = str(os.getpid()) + '\n'
    return grabfile(fname, content=content)
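
# Illustration (not gsyncd code): grabfile itself is not shown in this
# section. A minimal sketch of what a grabfile-style advisory pidfile lock
# could look like, assuming fcntl.lockf semantics -- this is an assumption
# for illustration, not the actual gsyncd implementation.
import fcntl
import os

def grab_pidfile_sketch(path, content=None):
    """Return an open, locked file object, or None if another process holds it."""
    f = open(path, "a+")
    try:
        fcntl.lockf(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except (IOError, OSError):
        f.close()
        return None          # someone else owns the pidfile
    if content is not None:
        f.seek(0)
        f.truncate()
        f.write(content)
        f.flush()
    return f                 # keep the object alive to keep the lock

# usage mirrors grabpidfile(setpid=True) with a hypothetical path:
# handle = grab_pidfile_sketch("/var/run/gsyncd-example.pid",
#                              content=str(os.getpid()) + "\n")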
def subcmd_delete(args):
    import logging
    import shutil
    import glob
    import sys
    from errno import ENOENT, ENODATA
    import struct

    from syncdutils import GsyncdError, Xattr, errno_wrap
    import gsyncdconfig as gconf

    logging.info('geo-replication delete')
    # remove the stime xattr from all the brick paths so that
    # a re-create of a session will start sync all over again
    stime_xattr_prefix = gconf.get('stime-xattr-prefix', None)

    # Delete pid file, status file, socket file
    cleanup_paths = []
    cleanup_paths.append(gconf.get("pid-file"))

    # Cleanup Session dir
    try:
        shutil.rmtree(gconf.get("georep-session-working-dir"))
    except (IOError, OSError):
        if sys.exc_info()[1].errno == ENOENT:
            pass
        else:
            raise GsyncdError('Error while removing working dir: %s' %
                              gconf.get("georep-session-working-dir"))

    # Cleanup changelog working dirs
    try:
        shutil.rmtree(gconf.get("working-dir"))
    except (IOError, OSError):
        if sys.exc_info()[1].errno == ENOENT:
            pass
        else:
            raise GsyncdError('Error while removing working dir: %s' %
                              gconf.get("working-dir"))

    for path in cleanup_paths:
        # To delete temp files
        for f in glob.glob(path + "*"):
            _unlink(f)

    if args.reset_sync_time and stime_xattr_prefix:
        for p in args.paths:
            if p != "":
                # set stime to (0,0) to trigger full volume content resync
                # to slave on session recreation
                # look at master.py::Xcrawl hint: zero_zero
                errno_wrap(Xattr.lsetxattr,
                           (p, stime_xattr_prefix + ".stime",
                            struct.pack("!II", 0, 0)),
                           [ENOENT, ENODATA])
                errno_wrap(Xattr.lremovexattr,
                           (p, stime_xattr_prefix + ".entry_stime"),
                           [ENOENT, ENODATA])

    return
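
# Illustration (not gsyncd code): the stime xattr stores a (seconds,
# nanoseconds) pair as two network-order unsigned 32-bit integers, so writing
# (0, 0) makes everything look older than any changelog entry and forces a
# full resync on session re-creation.
import struct

reset = struct.pack("!II", 0, 0)
# b'\x00\x00\x00\x00\x00\x00\x00\x00'  (8 bytes)

# decoding an arbitrary stime value uses the same format string
sec, nsec = struct.unpack("!II", struct.pack("!II", 1700000000, 250000000))
# sec == 1700000000, nsec == 250000000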
def subcmd_config_reset(args):
    import sys

    try:
        gconf.resetconfig(config_name_format(args.name))
    except gconf.GconfNotConfigurable:
        cnf_val = gconf.get(config_name_format(args.name), None)
        if cnf_val is None:
            sys.stderr.write("Invalid config name \"%s\"\n" % args.name)
            sys.exit(ERROR_CONFIG_INVALID)

        # Not configurable
        sys.stderr.write("Not configurable \"%s\"\n" % args.name)
        sys.exit(ERROR_CONFIG_NOT_CONFIGURABLE)
def bricks(self):
    pfx = "master-" if self.master else "slave-"
    bricks_data = gconf.get(pfx + "bricks")
    if bricks_data is None:
        return []

    bricks_data = bricks_data.split(",")
    bricks_data = [b.strip() for b in bricks_data]
    out = []
    for b in bricks_data:
        parts = b.split(":")
        bpath = parts[2] if len(parts) == 3 else ""
        out.append({"host": parts[1], "dir": bpath, "uuid": parts[0]})

    return out
def subcmd_config_check(args):
    import sys

    try:
        gconf.check(config_name_format(args.name), value=args.value,
                    with_conffile=False)
    except gconf.GconfNotConfigurable:
        cnf_val = gconf.get(config_name_format(args.name), None)
        if cnf_val is None:
            sys.stderr.write("Invalid config name \"%s\"\n" % args.name)
            sys.exit(ERROR_CONFIG_INVALID)

        # Not configurable
        sys.stderr.write("Not configurable \"%s\"\n" % args.name)
        sys.exit(ERROR_CONFIG_NOT_CONFIGURABLE)
    except gconf.GconfInvalidValue:
        sys.stderr.write("Invalid config value \"%s=%s\"\n" % (args.name,
                                                               args.value))
        sys.exit(ERROR_CONFIG_INVALID_VALUE)
def multiplex(self, wspx, suuid, slave_vol, slave_host, master, slavenodes):
    argv = [os.path.basename(sys.executable), sys.argv[0]]

    cpids = set()
    agents = set()
    ta = []
    for wx in wspx:
        def wmon(w):
            cpid, _ = self.monitor(w, argv, cpids, agents, slave_vol,
                                   slave_host, master, suuid, slavenodes)
            time.sleep(1)
            self.lock.acquire()
            for cpid in cpids:
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
            for apid in agents:
                errno_wrap(os.kill, [apid, signal.SIGKILL], [ESRCH])
            self.lock.release()
            finalize(exval=1)

        t = Thread(target=wmon, args=[wx])
        t.start()
        ta.append(t)

    # Monitor status was being updated in each monitor thread. It should
    # not be done there, as it can deadlock a worker start.
    # set_monitor_status uses flock to synchronize multiple instances
    # updating the file. Since each monitor thread forks a worker and an
    # agent, those processes can hold a reference to the fd of the status
    # file, deadlocking workers which start later because the flock is not
    # released until all references to the same fd are closed. It would
    # also leak fds.
    self.lock.acquire()
    set_monitor_status(gconf.get("state-file"), self.ST_STARTED)
    self.lock.release()

    for t in ta:
        t.join()
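
# Illustration (not gsyncd code): the comment above relies on a POSIX detail:
# an flock-style lock belongs to the open file description, so a forked child
# that inherits the fd keeps the lock alive even after the parent drops its
# own reference. A small standalone demonstration with a hypothetical path:
import fcntl
import os
import time

f = open("/tmp/example-status-lock", "w")
fcntl.flock(f, fcntl.LOCK_EX)           # parent takes the lock

pid = os.fork()
if pid == 0:
    # child inherits the open file description and therefore the lock;
    # sleeping here simulates a long-running worker holding it hostage
    time.sleep(5)
    os._exit(0)

f.close()                               # parent's reference is gone...
g = open("/tmp/example-status-lock", "w")
fcntl.flock(g, fcntl.LOCK_EX)           # ...yet this still blocks ~5 seconds,
                                        # until the child's copy is closed
print("lock acquired only after the child exited")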
def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master,
            suuid, slavenodes):
    """the monitor loop

    Basic logic is a blatantly simple blunt heuristic: if the spawned
    client survives 60 secs, it's considered OK. This serves us pretty
    well as it's not vulnerable to any kind of irregular behavior of
    the child...

    ... well, except for one: if the child hangs waiting for some event,
    it can survive aeons, still will be defunct. So we tweak the above
    logic to expect the worker to send us a signal within 60 secs (in
    the form of closing its end of a pipe). The worker does this when
    it's done with the setup stage, ready to enter the service loop
    (note it's the setup stage which is vulnerable to hangs -- the full
    blown worker blows up on EPIPE if the net goes down, due to the
    keep-alive thread)
    """
    if not self.status.get(w[0]['dir'], None):
        self.status[w[0]['dir']] = GeorepStatus(
            gconf.get("state-file"),
            w[0]['host'],
            w[0]['dir'],
            w[0]['uuid'],
            master,
            "%s::%s" % (slave_host, slave_vol))

    set_monitor_status(gconf.get("state-file"), self.ST_STARTED)
    self.status[w[0]['dir']].set_worker_status(self.ST_INIT)

    ret = 0

    def nwait(p, o=0):
        try:
            p2, r = waitpid(p, o)
            if not p2:
                return
            return r
        except OSError as e:
            # no child process, this happens if the child process
            # already died and has been cleaned up
            if e.errno == ECHILD:
                return -1
            else:
                raise

    def exit_signalled(s):
        """ child terminated due to receipt of SIGUSR1 """
        return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

    def exit_status(s):
        if os.WIFEXITED(s):
            return os.WEXITSTATUS(s)
        return 1

    conn_timeout = gconf.get("connection-timeout")
    while ret in (0, 1):
        remote_user, remote_host = w[1][0].split("@")
        remote_id = w[1][1]
        # Check the status of the connected slave node.
        # If the connected slave node is down, try to connect to a
        # different node that is up.
        current_slave_host = remote_host
        slave_up_hosts = get_up_nodes(slavenodes, gconf.get("ssh-port"))

        if (current_slave_host, remote_id) not in slave_up_hosts:
            if len(slave_up_hosts) > 0:
                remote_new = random.choice(slave_up_hosts)
                remote_host = "%s@%s" % (remote_user, remote_new[0])
                remote_id = remote_new[1]

        # Spawn the worker and agent under the lock to avoid an fd leak
        self.lock.acquire()

        logging.info(lf('starting gsyncd worker',
                        brick=w[0]['dir'],
                        slave_node=remote_host))

        # A couple of pipe pairs for RPC communication between the
        # worker and the changelog agent.

        # read/write end for agent
        (ra, ww) = os.pipe()
        # read/write end for worker
        (rw, wa) = os.pipe()

        # spawn the agent process
        apid = os.fork()
        if apid == 0:
            os.close(rw)
            os.close(ww)
            args_to_agent = argv + [
                'agent',
                rconf.args.master,
                rconf.args.slave,
                '--local-path', w[0]['dir'],
                '--local-node', w[0]['host'],
                '--local-node-id', w[0]['uuid'],
                '--slave-id', suuid,
                '--rpc-fd', ','.join([str(ra), str(wa), str(rw), str(ww)])
            ]

            if rconf.args.config_file is not None:
                args_to_agent += ['-c', rconf.args.config_file]

            if rconf.args.debug:
                args_to_agent.append("--debug")

            os.execv(sys.executable, args_to_agent)

        pr, pw = os.pipe()
        cpid = os.fork()
        if cpid == 0:
            os.close(pr)
            os.close(ra)
            os.close(wa)

            args_to_worker = argv + [
                'worker',
                rconf.args.master,
                rconf.args.slave,
                '--feedback-fd', str(pw),
                '--local-path', w[0]['dir'],
                '--local-node', w[0]['host'],
                '--local-node-id', w[0]['uuid'],
                '--slave-id', suuid,
                '--rpc-fd', ','.join([str(rw), str(ww), str(ra), str(wa)]),
                '--subvol-num', str(w[2]),
                '--resource-remote', remote_host,
                '--resource-remote-id', remote_id
            ]

            if rconf.args.config_file is not None:
                args_to_worker += ['-c', rconf.args.config_file]

            if w[3]:
                args_to_worker.append("--is-hottier")

            if rconf.args.debug:
                args_to_worker.append("--debug")

            os.execv(sys.executable, args_to_worker)

        cpids.add(cpid)
        agents.add(apid)
        os.close(pw)

        # close all RPC pipes in monitor
        os.close(ra)
        os.close(wa)
        os.close(rw)
        os.close(ww)
        self.lock.release()

        t0 = time.time()
        so = select((pr, ), (), (), conn_timeout)[0]
        os.close(pr)

        if so:
            ret = nwait(cpid, os.WNOHANG)
            ret_agent = nwait(apid, os.WNOHANG)

            if ret_agent is not None:
                # the agent died; kill the worker as well
                logging.info(lf("Changelog Agent died, Aborting Worker",
                                brick=w[0]['dir']))
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                nwait(cpid)
                nwait(apid)

            if ret is not None:
                logging.info(lf("worker died before establishing "
                                "connection",
                                brick=w[0]['dir']))
                nwait(apid)  # wait for agent
            else:
                logging.debug("worker(%s) connected" % w[0]['dir'])
                while time.time() < t0 + conn_timeout:
                    ret = nwait(cpid, os.WNOHANG)
                    ret_agent = nwait(apid, os.WNOHANG)

                    if ret is not None:
                        logging.info(lf("worker died in startup phase",
                                        brick=w[0]['dir']))
                        nwait(apid)  # wait for agent
                        break

                    if ret_agent is not None:
                        # the agent died; kill the worker as well
                        logging.info(lf("Changelog Agent died, Aborting "
                                        "Worker",
                                        brick=w[0]['dir']))
                        errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                        nwait(cpid)
                        nwait(apid)
                        break

                    time.sleep(1)
        else:
            logging.info(
                lf("Worker not confirmed after wait, aborting it. "
                   "Gsyncd invocation on remote slave via SSH or "
                   "gluster master mount might have hung. Please "
                   "check the above logs for exact issue and check "
                   "master or slave volume for errors. Restarting "
                   "master/slave volume accordingly might help.",
                   brick=w[0]['dir'],
                   timeout=conn_timeout))
            errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
            nwait(apid)  # wait for agent
            ret = nwait(cpid)

        if ret is None:
            # If the worker dies, the agent terminates on EOF.
            # So let's wait for the agent first.
            nwait(apid)
            ret = nwait(cpid)

        if exit_signalled(ret):
            ret = 0
        else:
            ret = exit_status(ret)
            if ret in (0, 1):
                self.status[w[0]['dir']].set_worker_status(self.ST_FAULTY)
                gf_event(EVENT_GEOREP_FAULTY,
                         master_volume=master.volume,
                         master_node=w[0]['host'],
                         master_node_id=w[0]['uuid'],
                         slave_host=slave_host,
                         slave_volume=slave_vol,
                         current_slave_host=current_slave_host,
                         brick_path=w[0]['dir'])

        time.sleep(10)

    self.status[w[0]['dir']].set_worker_status(self.ST_INCON)
    return ret
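
# Illustration (not gsyncd code): exit_signalled and exit_status decode the
# raw status integer returned by waitpid. The synthetic values below assume
# the traditional Unix wait-status encoding (signal number in the low byte,
# exit code in the next byte); a worker stopped via SIGUSR1 counts as a
# clean stop, everything else as a failure.
import os
import signal

status_sigusr1 = signal.SIGUSR1          # process killed by SIGUSR1
os.WIFSIGNALED(status_sigusr1)           # True
os.WTERMSIG(status_sigusr1)              # == signal.SIGUSR1 -> "clean" stop

status_exit1 = 1 << 8                    # process exited normally with code 1
os.WIFEXITED(status_exit1)               # True
os.WEXITSTATUS(status_exit1)             # 1 -> monitor treats it as faulty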
def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master,
            suuid, slavenodes):
    """the monitor loop

    Basic logic is a blatantly simple blunt heuristic: if the spawned
    client survives 60 secs, it's considered OK. This serves us pretty
    well as it's not vulnerable to any kind of irregular behavior of
    the child...

    ... well, except for one: if the child hangs waiting for some event,
    it can survive aeons, still will be defunct. So we tweak the above
    logic to expect the worker to send us a signal within 60 secs (in
    the form of closing its end of a pipe). The worker does this when
    it's done with the setup stage, ready to enter the service loop
    (note it's the setup stage which is vulnerable to hangs -- the full
    blown worker blows up on EPIPE if the net goes down, due to the
    keep-alive thread)
    """
    if not self.status.get(w[0]['dir'], None):
        self.status[w[0]['dir']] = GeorepStatus(gconf.get("state-file"),
                                                w[0]['host'],
                                                w[0]['dir'],
                                                w[0]['uuid'],
                                                master,
                                                "%s::%s" % (slave_host,
                                                            slave_vol))

    ret = 0

    def nwait(p, o=0):
        try:
            p2, r = waitpid(p, o)
            if not p2:
                return
            return r
        except OSError as e:
            # no child process, this happens if the child process
            # already died and has been cleaned up
            if e.errno == ECHILD:
                return -1
            else:
                raise

    def exit_signalled(s):
        """ child terminated due to receipt of SIGUSR1 """
        return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

    def exit_status(s):
        if os.WIFEXITED(s):
            return os.WEXITSTATUS(s)
        return 1

    conn_timeout = gconf.get("connection-timeout")
    while ret in (0, 1):
        remote_user, remote_host = w[1][0].split("@")
        remote_id = w[1][1]
        # Check the status of the connected slave node.
        # If the connected slave node is down, try to connect to a
        # different node that is up.
        current_slave_host = remote_host
        slave_up_hosts = get_up_nodes(slavenodes, gconf.get("ssh-port"))

        if (current_slave_host, remote_id) not in slave_up_hosts:
            if len(slave_up_hosts) > 0:
                remote_new = random.choice(slave_up_hosts)
                remote_host = "%s@%s" % (remote_user, remote_new[0])
                remote_id = remote_new[1]

        # Spawn the worker and agent under the lock to avoid an fd leak
        self.lock.acquire()

        self.status[w[0]['dir']].set_worker_status(self.ST_INIT)
        logging.info(lf('starting gsyncd worker',
                        brick=w[0]['dir'],
                        slave_node=remote_host))

        # A couple of pipe pairs for RPC communication between the
        # worker and the changelog agent.

        # read/write end for agent
        (ra, ww) = pipe()
        # read/write end for worker
        (rw, wa) = pipe()

        # spawn the agent process
        apid = os.fork()
        if apid == 0:
            os.close(rw)
            os.close(ww)
            args_to_agent = argv + [
                'agent',
                rconf.args.master,
                rconf.args.slave,
                '--local-path', w[0]['dir'],
                '--local-node', w[0]['host'],
                '--local-node-id', w[0]['uuid'],
                '--slave-id', suuid,
                '--rpc-fd', ','.join([str(ra), str(wa), str(rw), str(ww)])
            ]

            if rconf.args.config_file is not None:
                args_to_agent += ['-c', rconf.args.config_file]

            if rconf.args.debug:
                args_to_agent.append("--debug")

            os.execv(sys.executable, args_to_agent)

        pr, pw = pipe()
        cpid = os.fork()
        if cpid == 0:
            os.close(pr)
            os.close(ra)
            os.close(wa)

            args_to_worker = argv + [
                'worker',
                rconf.args.master,
                rconf.args.slave,
                '--feedback-fd', str(pw),
                '--local-path', w[0]['dir'],
                '--local-node', w[0]['host'],
                '--local-node-id', w[0]['uuid'],
                '--slave-id', suuid,
                '--rpc-fd', ','.join([str(rw), str(ww), str(ra), str(wa)]),
                '--subvol-num', str(w[2]),
                '--resource-remote', remote_host,
                '--resource-remote-id', remote_id
            ]

            if rconf.args.config_file is not None:
                args_to_worker += ['-c', rconf.args.config_file]

            if w[3]:
                args_to_worker.append("--is-hottier")

            if rconf.args.debug:
                args_to_worker.append("--debug")

            access_mount = gconf.get("access-mount")
            if access_mount:
                os.execv(sys.executable, args_to_worker)
            else:
                if unshare_propagation_supported():
                    logging.debug("Worker would mount volume privately")
                    unshare_cmd = ['unshare', '-m', '--propagation',
                                   'private']
                    cmd = unshare_cmd + args_to_worker
                    os.execvp("unshare", cmd)
                else:
                    logging.debug("Mount is not private. It would be lazy"
                                  " umounted")
                    os.execv(sys.executable, args_to_worker)

        cpids.add(cpid)
        agents.add(apid)
        os.close(pw)

        # close all RPC pipes in monitor
        os.close(ra)
        os.close(wa)
        os.close(rw)
        os.close(ww)
        self.lock.release()

        t0 = time.time()
        so = select((pr,), (), (), conn_timeout)[0]
        os.close(pr)

        if so:
            ret = nwait(cpid, os.WNOHANG)
            ret_agent = nwait(apid, os.WNOHANG)

            if ret_agent is not None:
                # the agent died; kill the worker as well
                logging.info(lf("Changelog Agent died, Aborting Worker",
                                brick=w[0]['dir']))
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                nwait(cpid)
                nwait(apid)

            if ret is not None:
                logging.info(lf("worker died before establishing "
                                "connection",
                                brick=w[0]['dir']))
                nwait(apid)  # wait for agent
            else:
                logging.debug("worker(%s) connected" % w[0]['dir'])
                while time.time() < t0 + conn_timeout:
                    ret = nwait(cpid, os.WNOHANG)
                    ret_agent = nwait(apid, os.WNOHANG)

                    if ret is not None:
                        logging.info(lf("worker died in startup phase",
                                        brick=w[0]['dir']))
                        nwait(apid)  # wait for agent
                        break

                    if ret_agent is not None:
                        # the agent died; kill the worker as well
                        logging.info(lf("Changelog Agent died, Aborting "
                                        "Worker",
                                        brick=w[0]['dir']))
                        errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                        nwait(cpid)
                        nwait(apid)
                        break

                    time.sleep(1)
        else:
            logging.info(
                lf("Worker not confirmed after wait, aborting it. "
                   "Gsyncd invocation on remote slave via SSH or "
                   "gluster master mount might have hung. Please "
                   "check the above logs for exact issue and check "
                   "master or slave volume for errors. Restarting "
                   "master/slave volume accordingly might help.",
                   brick=w[0]['dir'],
                   timeout=conn_timeout))
            errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
            nwait(apid)  # wait for agent
            ret = nwait(cpid)

        if ret is None:
            # If the worker dies, the agent terminates on EOF.
            # So let's wait for the agent first.
            nwait(apid)
            ret = nwait(cpid)

        if exit_signalled(ret):
            ret = 0
        else:
            ret = exit_status(ret)
            if ret in (0, 1):
                self.status[w[0]['dir']].set_worker_status(self.ST_FAULTY)
                gf_event(EVENT_GEOREP_FAULTY,
                         master_volume=master.volume,
                         master_node=w[0]['host'],
                         master_node_id=w[0]['uuid'],
                         slave_host=slave_host,
                         slave_volume=slave_vol,
                         current_slave_host=current_slave_host,
                         brick_path=w[0]['dir'])

        time.sleep(10)

    self.status[w[0]['dir']].set_worker_status(self.ST_INCON)
    return ret
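
# Illustration (not gsyncd code): the monitor hands the four pipe fds to the
# worker and agent as a single comma-separated --rpc-fd string. The receiving
# side is not shown in this section; this is a hedged sketch of how such a
# string could be decoded back into integers, and the helper name is
# hypothetical.
def parse_rpc_fds(rpc_fd_arg):
    """Split an 'r1,w1,r2,w2' style --rpc-fd value into four ints."""
    fds = [int(x) for x in rpc_fd_arg.split(",")]
    if len(fds) != 4:
        raise ValueError("expected four fds, got %r" % rpc_fd_arg)
    return fds

# mirrors what the monitor passes: ','.join([str(rw), str(ww), str(ra), str(wa)])
read_own, write_own, read_peer, write_peer = parse_rpc_fds("7,8,5,6")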
def main():
    rconf.starttime = time.time()

    # If an old Glusterd sends commands in the old format, the function
    # below converts sys.argv to the new format. This conversion is added
    # temporarily for backward compatibility and can be removed once
    # integrated with Glusterd2.
    # It modifies sys.argv globally, so the rest of the code works as usual
    argsupgrade.upgrade()

    # The default argparse version handler prints to stderr; this is fixed
    # in the 3.x series but not in 2.x, so handle --version here instead
    if "--version" in sys.argv:
        print(GSYNCD_VERSION)
        sys.exit(0)

    parser = ArgumentParser()
    parser.add_argument("--inet6", action="store_true")
    sp = parser.add_subparsers(dest="subcmd")

    # Monitor Status File update
    p = sp.add_parser("monitor-status")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave details user@host::vol format")
    p.add_argument("status", help="Update Monitor Status")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # Monitor
    p = sp.add_parser("monitor")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave details user@host::vol format")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--pause-on-start", action="store_true",
                   help="Start with Paused state")
    p.add_argument("--local-node-id", help="Local Node ID")
    p.add_argument("--debug", action="store_true")
    p.add_argument("--use-gconf-volinfo", action="store_true")

    # Worker
    p = sp.add_parser("worker")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave details user@host::vol format")
    p.add_argument("--local-path", help="Local Brick Path")
    p.add_argument("--feedback-fd", type=int,
                   help="feedback fd between monitor and worker")
    p.add_argument("--local-node", help="Local master node")
    p.add_argument("--local-node-id", help="Local Node ID")
    p.add_argument("--rpc-fd",
                   help="Read and Write fds for worker-agent communication")
    p.add_argument("--subvol-num", type=int, help="Subvolume number")
    p.add_argument("--is-hottier", action="store_true",
                   help="Is this brick part of hot tier")
    p.add_argument("--resource-remote",
                   help="Remote node to connect to Slave Volume")
    p.add_argument("--resource-remote-id",
                   help="Remote node ID to connect to Slave Volume")
    p.add_argument("--slave-id", help="Slave Volume ID")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # Agent
    p = sp.add_parser("agent")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave details user@host::vol format")
    p.add_argument("--local-path", help="Local brick path")
    p.add_argument("--local-node", help="Local master node")
    p.add_argument("--local-node-id", help="Local Node ID")
    p.add_argument("--slave-id", help="Slave Volume ID")
    p.add_argument("--rpc-fd",
                   help="Read and Write fds for worker-agent communication")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # Slave
    p = sp.add_parser("slave")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave details user@host::vol format")
    p.add_argument("--session-owner")
    p.add_argument("--master-brick",
                   help="Master brick which is connected to the Slave")
    p.add_argument("--master-node",
                   help="Master node which is connected to the Slave")
    p.add_argument("--master-node-id",
                   help="Master node ID which is connected to the Slave")
    p.add_argument("--local-node", help="Local Slave node")
    p.add_argument("--local-node-id", help="Local Slave ID")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # All configurations which are configured via "slave-" options.
    # DO NOT add default values for these configurations; default values
    # will be picked from the template config file
    p.add_argument("--slave-timeout", type=int,
                   help="Timeout to end gsyncd at Slave side")
    p.add_argument("--use-rsync-xattrs", action="store_true")
    p.add_argument("--slave-log-level", help="Slave Gsyncd Log level")
    p.add_argument("--slave-gluster-log-level",
                   help="Slave Gluster mount Log level")
    p.add_argument("--slave-gluster-command-dir",
                   help="Directory where Gluster binaries exist on slave")
    p.add_argument("--slave-access-mount", action="store_true",
                   help="Do not lazy umount the slave volume")

    # Status
    p = sp.add_parser("status")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--local-path", help="Local Brick Path")
    p.add_argument("--debug", action="store_true")
    p.add_argument("--json", action="store_true")

    # Config-check
    p = sp.add_parser("config-check")
    p.add_argument("name", help="Config Name")
    p.add_argument("--value", help="Config Value")
    p.add_argument("--debug", action="store_true")

    # Config-get
    p = sp.add_parser("config-get")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave")
    p.add_argument("--name", help="Config Name")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")
    p.add_argument("--show-defaults", action="store_true")
    p.add_argument("--only-value", action="store_true")
    p.add_argument("--use-underscore", action="store_true")
    p.add_argument("--json", action="store_true")

    # Config-set
    p = sp.add_parser("config-set")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave")
    p.add_argument("-n", "--name", help="Config Name")
    p.add_argument("-v", "--value", help="Config Value")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # Config-reset
    p = sp.add_parser("config-reset")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave")
    p.add_argument("name", help="Config Name")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # voluuidget
    p = sp.add_parser("voluuidget")
    p.add_argument("host", help="Hostname")
    p.add_argument("volname", help="Volume Name")
    p.add_argument("--debug", action="store_true")

    # Delete
    p = sp.add_parser("delete")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument('--path', dest='paths', action="append")
    p.add_argument("--reset-sync-time", action="store_true",
                   help="Reset Sync Time")
    p.add_argument("--debug", action="store_true")

    # Parse arguments
    args = parser.parse_args()

    # Extra template values. All arguments are already part of the template
    # variables; use this for adding extra variables
    extra_tmpl_args = {}

    # Add First/Primary Slave host, user and volume
    if getattr(args, "slave", None) is not None:
        hostdata, slavevol = args.slave.split("::")
        hostdata = hostdata.split("@")
        slavehost = hostdata[-1]
        slaveuser = "******"
        if len(hostdata) == 2:
            slaveuser = hostdata[0]
        extra_tmpl_args["primary_slave_host"] = slavehost
        extra_tmpl_args["slaveuser"] = slaveuser
        extra_tmpl_args["slavevol"] = slavevol

    # Add Bricks encoded path
    if getattr(args, "local_path", None) is not None:
        extra_tmpl_args["local_id"] = escape(args.local_path)

    # Add Master Bricks encoded path (for Slave)
    if getattr(args, "master_brick", None) is not None:
        extra_tmpl_args["master_brick_id"] = escape(args.master_brick)

    # Load configurations
    config_file = getattr(args, "config_file", None)

    # The subcmd accepts a config file argument but none was passed; set the
    # default path for the config file in that case. If a subcmd accepts a
    # config file, it also accepts master and Slave arguments.
    if config_file is None and hasattr(args, "config_file"):
        config_file = "%s/geo-replication/%s_%s_%s/gsyncd.conf" % (
            GLUSTERD_WORKDIR,
            args.master,
            extra_tmpl_args["primary_slave_host"],
            extra_tmpl_args["slavevol"])

    # If the config file path does not exist, log an error and continue
    # using the default config
    config_file_error_msg = None
    if config_file is not None and not os.path.exists(config_file):
        # Logging is not yet initialized; create the error message to log
        # later and reset config_file to None
        config_file_error_msg = lf(
            "Session config file not exists, using the default config",
            path=config_file)
        config_file = None

    rconf.config_file = config_file

    # Override gconf values from argument values only if it is slave gsyncd
    override_from_args = False
    if args.subcmd == "slave":
        override_from_args = True

    if args.subcmd == "monitor":
        ret = gconf.is_config_file_old(config_file, args.master,
                                       extra_tmpl_args["slavevol"])
        if ret is not None:
            gconf.config_upgrade(config_file, ret)

    # Load Config file
    gconf.load(GLUSTERFS_CONFDIR + "/gsyncd.conf",
               config_file,
               vars(args),
               extra_tmpl_args,
               override_from_args)

    # Default label to print in log file
    label = args.subcmd
    if args.subcmd in ("worker", "agent"):
        # If Worker or agent, also add the brick path to the label
        label = "%s %s" % (args.subcmd, args.local_path)
    elif args.subcmd == "slave":
        # If Slave, add Master node and Brick details
        label = "%s %s%s" % (args.subcmd, args.master_node, args.master_brick)

    # Setup Logger
    # Default log file
    log_file = gconf.get("cli-log-file")
    log_level = gconf.get("cli-log-level")
    if getattr(args, "master", None) is not None and \
       getattr(args, "slave", None) is not None:
        log_file = gconf.get("log-file")
        log_level = gconf.get("log-level")

    # Use a different log file location for the Slave log file
    if args.subcmd == "slave":
        log_file = gconf.get("slave-log-file")
        log_level = gconf.get("slave-log-level")

    if args.debug:
        log_file = "-"
        log_level = "DEBUG"

    # Create the log dir if it does not exist
    try:
        if log_file != "-":
            os.mkdir(os.path.dirname(log_file))
    except OSError as e:
        if e.errno != EEXIST:
            raise

    setup_logging(log_file=log_file, level=log_level, label=label)

    if config_file_error_msg is not None:
        logging.warn(config_file_error_msg)

    # Log message for loaded config file
    if config_file is not None:
        logging.info(lf("Using session config file", path=config_file))

    set_term_handler()
    excont = FreeObject(exval=0)

    # Get the handler function based on the subcommand name. For example,
    # if the subcommand passed is "monitor", look for a function named
    # "subcmd_monitor" in the subcmds module
    func = getattr(subcmds, "subcmd_" + args.subcmd.replace("-", "_"), None)

    try:
        try:
            if func is not None:
                rconf.args = args
                func(args)
        except:
            log_raise_exception(excont)
    finally:
        finalize(exval=excont.exval)
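
# Illustration (not gsyncd code): subcommand dispatch in main() is purely
# name-based -- the parser's dest="subcmd" value is mapped to a subcmd_*
# function by string substitution, as in the getattr call above.
def resolve(subcmd_name):
    return "subcmd_" + subcmd_name.replace("-", "_")

resolve("config-set")       # 'subcmd_config_set'
resolve("monitor-status")   # 'subcmd_monitor_status'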
def subcmd_delete(args):
    import logging
    import shutil
    import glob
    import sys
    from errno import ENOENT, ENODATA
    import struct

    from syncdutils import GsyncdError, Xattr, errno_wrap
    import gsyncdconfig as gconf

    logging.info('geo-replication delete')
    # remove the stime xattr from all the brick paths so that
    # a re-create of a session will start sync all over again
    stime_xattr_prefix = gconf.get('stime-xattr-prefix', None)

    # Delete pid file, status file, socket file
    cleanup_paths = []
    cleanup_paths.append(gconf.get("pid-file"))

    # Cleanup Session dir
    try:
        shutil.rmtree(gconf.get("georep-session-working-dir"))
    except (IOError, OSError):
        if sys.exc_info()[1].errno == ENOENT:
            pass
        else:
            raise GsyncdError(
                'Error while removing working dir: %s' %
                gconf.get("georep-session-working-dir"))

    # Cleanup changelog working dirs
    try:
        shutil.rmtree(gconf.get("working-dir"))
    except (IOError, OSError):
        if sys.exc_info()[1].errno == ENOENT:
            pass
        else:
            raise GsyncdError(
                'Error while removing working dir: %s' %
                gconf.get("working-dir"))

    for path in cleanup_paths:
        # To delete temp files
        for f in glob.glob(path + "*"):
            _unlink(f)

    if args.reset_sync_time and stime_xattr_prefix:
        for p in args.paths:
            if p != "":
                # set stime to (0,0) to trigger full volume content resync
                # to slave on session recreation
                # look at master.py::Xcrawl hint: zero_zero
                errno_wrap(Xattr.lsetxattr,
                           (p, stime_xattr_prefix + ".stime",
                            struct.pack("!II", 0, 0)),
                           [ENOENT, ENODATA])
                errno_wrap(Xattr.lremovexattr,
                           (p, stime_xattr_prefix + ".entry_stime"),
                           [ENOENT, ENODATA])

    return
def main():
    rconf.starttime = time.time()

    # If an old Glusterd sends commands in the old format, the function
    # below converts sys.argv to the new format. This conversion is added
    # temporarily for backward compatibility and can be removed once
    # integrated with Glusterd2.
    # It modifies sys.argv globally, so the rest of the code works as usual
    argsupgrade.upgrade()

    # The default argparse version handler prints to stderr; this is fixed
    # in the 3.x series but not in 2.x, so handle --version here instead
    if "--version" in sys.argv:
        print(GSYNCD_VERSION)
        sys.exit(0)

    parser = ArgumentParser()
    parser.add_argument("--inet6", action="store_true")
    sp = parser.add_subparsers(dest="subcmd")

    # Monitor Status File update
    p = sp.add_parser("monitor-status")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave details user@host::vol format")
    p.add_argument("status", help="Update Monitor Status")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # Monitor
    p = sp.add_parser("monitor")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave details user@host::vol format")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--pause-on-start", action="store_true",
                   help="Start with Paused state")
    p.add_argument("--local-node-id", help="Local Node ID")
    p.add_argument("--debug", action="store_true")
    p.add_argument("--use-gconf-volinfo", action="store_true")

    # Worker
    p = sp.add_parser("worker")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave details user@host::vol format")
    p.add_argument("--local-path", help="Local Brick Path")
    p.add_argument("--feedback-fd", type=int,
                   help="feedback fd between monitor and worker")
    p.add_argument("--local-node", help="Local master node")
    p.add_argument("--local-node-id", help="Local Node ID")
    p.add_argument("--rpc-fd",
                   help="Read and Write fds for worker-agent communication")
    p.add_argument("--subvol-num", type=int, help="Subvolume number")
    p.add_argument("--is-hottier", action="store_true",
                   help="Is this brick part of hot tier")
    p.add_argument("--resource-remote",
                   help="Remote node to connect to Slave Volume")
    p.add_argument("--resource-remote-id",
                   help="Remote node ID to connect to Slave Volume")
    p.add_argument("--slave-id", help="Slave Volume ID")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # Agent
    p = sp.add_parser("agent")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave details user@host::vol format")
    p.add_argument("--local-path", help="Local brick path")
    p.add_argument("--local-node", help="Local master node")
    p.add_argument("--local-node-id", help="Local Node ID")
    p.add_argument("--slave-id", help="Slave Volume ID")
    p.add_argument("--rpc-fd",
                   help="Read and Write fds for worker-agent communication")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # Slave
    p = sp.add_parser("slave")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave details user@host::vol format")
    p.add_argument("--session-owner")
    p.add_argument("--master-brick",
                   help="Master brick which is connected to the Slave")
    p.add_argument("--master-node",
                   help="Master node which is connected to the Slave")
    p.add_argument("--master-node-id",
                   help="Master node ID which is connected to the Slave")
    p.add_argument("--local-node", help="Local Slave node")
    p.add_argument("--local-node-id", help="Local Slave ID")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # All configurations which are configured via "slave-" options.
    # DO NOT add default values for these configurations; default values
    # will be picked from the template config file
    p.add_argument("--slave-timeout", type=int,
                   help="Timeout to end gsyncd at Slave side")
    p.add_argument("--use-rsync-xattrs", action="store_true")
    p.add_argument("--slave-log-level", help="Slave Gsyncd Log level")
    p.add_argument("--slave-gluster-log-level",
                   help="Slave Gluster mount Log level")
    p.add_argument("--slave-gluster-command-dir",
                   help="Directory where Gluster binaries exist on slave")
    p.add_argument("--slave-access-mount", action="store_true",
                   help="Do not lazy umount the slave volume")

    # Status
    p = sp.add_parser("status")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--local-path", help="Local Brick Path")
    p.add_argument("--debug", action="store_true")
    p.add_argument("--json", action="store_true")

    # Config-check
    p = sp.add_parser("config-check")
    p.add_argument("name", help="Config Name")
    p.add_argument("--value", help="Config Value")
    p.add_argument("--debug", action="store_true")

    # Config-get
    p = sp.add_parser("config-get")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave")
    p.add_argument("--name", help="Config Name")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")
    p.add_argument("--show-defaults", action="store_true")
    p.add_argument("--only-value", action="store_true")
    p.add_argument("--use-underscore", action="store_true")
    p.add_argument("--json", action="store_true")

    # Config-set
    p = sp.add_parser("config-set")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave")
    p.add_argument("-n", "--name", help="Config Name")
    p.add_argument("-v", "--value", help="Config Value")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # Config-reset
    p = sp.add_parser("config-reset")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave")
    p.add_argument("name", help="Config Name")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument("--debug", action="store_true")

    # voluuidget
    p = sp.add_parser("voluuidget")
    p.add_argument("host", help="Hostname")
    p.add_argument("volname", help="Volume Name")
    p.add_argument("--debug", action="store_true")

    # Delete
    p = sp.add_parser("delete")
    p.add_argument("master", help="Master Volume Name")
    p.add_argument("slave", help="Slave")
    p.add_argument("-c", "--config-file", help="Config File")
    p.add_argument('--path', dest='paths', action="append")
    p.add_argument("--reset-sync-time", action="store_true",
                   help="Reset Sync Time")
    p.add_argument("--debug", action="store_true")

    # Parse arguments
    args = parser.parse_args()

    # Extra template values. All arguments are already part of the template
    # variables; use this for adding extra variables
    extra_tmpl_args = {}

    # Add First/Primary Slave host, user and volume
    if getattr(args, "slave", None) is not None:
        hostdata, slavevol = args.slave.split("::")
        hostdata = hostdata.split("@")
        slavehost = hostdata[-1]
        slaveuser = "******"
        if len(hostdata) == 2:
            slaveuser = hostdata[0]
        extra_tmpl_args["primary_slave_host"] = slavehost
        extra_tmpl_args["slaveuser"] = slaveuser
        extra_tmpl_args["slavevol"] = slavevol

    # Add Bricks encoded path
    if getattr(args, "local_path", None) is not None:
        extra_tmpl_args["local_id"] = escape(args.local_path)

    # Add Master Bricks encoded path (for Slave)
    if getattr(args, "master_brick", None) is not None:
        extra_tmpl_args["master_brick_id"] = escape(args.master_brick)

    # Load configurations
    config_file = getattr(args, "config_file", None)

    # The subcmd accepts a config file argument but none was passed; set the
    # default path for the config file in that case. If a subcmd accepts a
    # config file, it also accepts master and Slave arguments.
    if config_file is None and hasattr(args, "config_file"):
        config_file = "%s/geo-replication/%s_%s_%s/gsyncd.conf" % (
            GLUSTERD_WORKDIR,
            args.master,
            extra_tmpl_args["primary_slave_host"],
            extra_tmpl_args["slavevol"])

    # If the config file path does not exist, log an error and continue
    # using the default config
    config_file_error_msg = None
    if config_file is not None and not os.path.exists(config_file):
        # Logging is not yet initialized; create the error message to log
        # later and reset config_file to None
        config_file_error_msg = lf(
            "Session config file not exists, using the default config",
            path=config_file)
        config_file = None

    rconf.config_file = config_file

    # Override gconf values from argument values only if it is slave gsyncd
    override_from_args = False
    if args.subcmd == "slave":
        override_from_args = True

    # Load Config file
    gconf.load(GLUSTERFS_CONFDIR + "/gsyncd.conf",
               config_file,
               vars(args),
               extra_tmpl_args,
               override_from_args)

    # Default label to print in log file
    label = args.subcmd
    if args.subcmd in ("worker", "agent"):
        # If Worker or agent, also add the brick path to the label
        label = "%s %s" % (args.subcmd, args.local_path)
    elif args.subcmd == "slave":
        # If Slave, add Master node and Brick details
        label = "%s %s%s" % (args.subcmd, args.master_node, args.master_brick)

    # Setup Logger
    # Default log file
    log_file = gconf.get("cli-log-file")
    log_level = gconf.get("cli-log-level")
    if getattr(args, "master", None) is not None and \
       getattr(args, "slave", None) is not None:
        log_file = gconf.get("log-file")
        log_level = gconf.get("log-level")

    # Use a different log file location for the Slave log file
    if args.subcmd == "slave":
        log_file = gconf.get("slave-log-file")
        log_level = gconf.get("slave-log-level")

    if args.debug:
        log_file = "-"
        log_level = "DEBUG"

    # Create the log dir if it does not exist
    try:
        if log_file != "-":
            os.mkdir(os.path.dirname(log_file))
    except OSError as e:
        if e.errno != EEXIST:
            raise

    setup_logging(
        log_file=log_file,
        level=log_level,
        label=label
    )

    if config_file_error_msg is not None:
        logging.warn(config_file_error_msg)

    # Log message for loaded config file
    if config_file is not None:
        logging.info(lf("Using session config file", path=config_file))

    set_term_handler()
    excont = FreeObject(exval=0)

    # Get the handler function based on the subcommand name. For example,
    # if the subcommand passed is "monitor", look for a function named
    # "subcmd_monitor" in the subcmds module
    func = getattr(subcmds, "subcmd_" + args.subcmd.replace("-", "_"), None)

    try:
        try:
            if func is not None:
                rconf.args = args
                func(args)
        except:
            log_raise_exception(excont)
    finally:
        finalize(exval=excont.exval)
def distribution_count(self, tier, hot):
    return gconf.get("master-distribution-count")

def disperse_count(self, tier, hot):
    return gconf.get("master-disperse-count")

def replica_count(self, tier, hot):
    return gconf.get("master-replica-count")

def uuid(self):
    if self.master:
        return gconf.get("master-volume-id")
    else:
        return gconf.get("slave-volume-id")
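
# Illustration (not gsyncd code): these counts feed the subvolume-number
# calculation referenced in distribute() via get_subvol_num, which is not
# shown in this section. The sketch below is a hedged assumption for a plain
# distributed-replicated/dispersed layout with no hot tier; the real helper
# also handles tiering, so treat this only as an intuition aid.
def subvol_num_sketch(brick_idx, replica_count, disperse_count):
    """Hypothetical: group bricks into subvolumes of size replica*disperse
    (treating a count of 0 as 1), numbered starting from 1."""
    group = max(replica_count, 1) * max(disperse_count, 1)
    return brick_idx // group + 1

# e.g. replica 3, no disperse: bricks 0-2 -> subvol 1, bricks 3-5 -> subvol 2
subvol_num_sketch(4, 3, 0)   # -> 2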