Beispiel #1
0
    def service_loop(self):
        """start a RePCe server serving self's server

        stop servicing if a timeout is configured and got no
        keep-alime in that inteval
        """

        if boolify(gconf.use_rsync_xattrs) and not privileged():
            raise GsyncdError(
                "using rsync for extended attributes is not supported")

        repce = RepceServer(
            self.server, sys.stdin, sys.stdout, int(gconf.sync_jobs))
        t = syncdutils.Thread(target=lambda: (repce.service_loop(),
                                              syncdutils.finalize()))
        t.start()
        logging.info("slave listening")
        if gconf.timeout and int(gconf.timeout) > 0:
            while True:
                lp = self.server.last_keep_alive
                time.sleep(int(gconf.timeout))
                if lp == self.server.last_keep_alive:
                    logging.info(
                        "connection inactive for %d seconds, stopping" %
                        int(gconf.timeout))
                    break
        else:
            select((), (), ())
Beispiel #2
0
def startup(**kw):
    """set up logging, pidfile grabbing, daemonization"""
    if getattr(gconf, 'pid_file', None) and kw.get('go_daemon') != 'postconn':
        if not grabpidfile():
            sys.stderr.write("pidfile is taken, exiting.\n")
            sys.exit(2)
        gconf.pid_file_owned = True

    if kw.get('go_daemon') == 'should':
        x, y = os.pipe()
        gconf.cpid = os.fork()
        if gconf.cpid:
            os.close(x)
            sys.exit()
        os.close(y)
        os.setsid()
        dn = os.open(os.devnull, os.O_RDWR)
        for f in (sys.stdin, sys.stdout, sys.stderr):
            os.dup2(dn, f.fileno())
        if getattr(gconf, 'pid_file', None):
            if not grabpidfile(gconf.pid_file + '.tmp'):
                raise GsyncdError("cannot grab temporary pidfile")
            os.rename(gconf.pid_file + '.tmp', gconf.pid_file)
        # wait for parent to terminate
        # so we can start up with
        # no messing from the dirty
        # ol' bustard
        select((x,), (), ())
        os.close(x)

    GLogger._gsyncd_loginit(**kw)
Beispiel #3
0
def startup(**kw):
    """set up logging, pidfile grabbing, daemonization"""
    if getattr(gconf, 'pid_file', None) and kw.get('go_daemon') != 'postconn':
        if not grabpidfile():
            sys.stderr.write("pidfile is taken, exiting.\n")
            sys.exit(2)
        gconf.pid_file_owned = True

    if kw.get('go_daemon') == 'should':
        x, y = os.pipe()
        gconf.cpid = os.fork()
        if gconf.cpid:
            os.close(x)
            sys.exit()
        os.close(y)
        os.setsid()
        dn = os.open(os.devnull, os.O_RDWR)
        for f in (sys.stdin, sys.stdout, sys.stderr):
            os.dup2(dn, f.fileno())
        if getattr(gconf, 'pid_file', None):
            if not grabpidfile(gconf.pid_file + '.tmp'):
                raise GsyncdError("cannot grab temporary pidfile")
            os.rename(gconf.pid_file + '.tmp', gconf.pid_file)
        # wait for parent to terminate
        # so we can start up with
        # no messing from the dirty
        # ol' bustard
        select((x,), (), ())
        os.close(x)

    GLogger._gsyncd_loginit(**kw)
Beispiel #4
0
def startup(go_daemon=True):
    """set up logging, pidfile grabbing, daemonization"""
    pid_file = gconf.get("pid-file")
    if not grabpidfile():
        sys.stderr.write("pidfile is taken, exiting.\n")
        sys.exit(2)
    rconf.pid_file_owned = True

    if not go_daemon:
        return

    x, y = os.pipe()
    cpid = os.fork()
    if cpid:
        os.close(x)
        sys.exit()
    os.close(y)
    os.setsid()
    dn = os.open(os.devnull, os.O_RDWR)
    for f in (sys.stdin, sys.stdout, sys.stderr):
        os.dup2(dn, f.fileno())

    if not grabpidfile(pid_file + '.tmp'):
        raise GsyncdError("cannot grab temporary pidfile")

    os.rename(pid_file + '.tmp', pid_file)

    # wait for parent to terminate
    # so we can start up with
    # no messing from the dirty
    # ol' bustard
    select((x, ), (), ())
    os.close(x)
Beispiel #5
0
 def listen(self):
     while True:
         select((self.inf,), (), ())
         rid, exc, res = recv(self.inf)
         rjob = self.jtab.pop(rid)
         if rjob.cbk:
             rjob.cbk(rjob, [exc, res])
Beispiel #6
0
 def listen(self):
     while True:
         select((self.inf, ), (), ())
         rid, exc, res = recv(self.inf)
         rjob = self.jtab.pop(rid)
         if rjob.cbk:
             rjob.cbk(rjob, [exc, res])
Beispiel #7
0
    def service_loop(self):
        """start a RePCe server serving self's server

        stop servicing if a timeout is configured and got no
        keep-alime in that inteval
        """

        if boolify(gconf.use_rsync_xattrs) and not privileged():
            raise GsyncdError(
                "using rsync for extended attributes is not supported")

        repce = RepceServer(self.server, sys.stdin, sys.stdout,
                            int(gconf.sync_jobs))
        t = syncdutils.Thread(
            target=lambda: (repce.service_loop(), syncdutils.finalize()))
        t.start()
        logging.info("slave listening")
        if gconf.timeout and int(gconf.timeout) > 0:
            while True:
                lp = self.server.last_keep_alive
                time.sleep(int(gconf.timeout))
                if lp == self.server.last_keep_alive:
                    logging.info(
                        "connection inactive for %d seconds, stopping" %
                        int(gconf.timeout))
                    break
        else:
            select((), (), ())
Beispiel #8
0
def startup(go_daemon=True):
    """set up logging, pidfile grabbing, daemonization"""
    pid_file = gconf.get("pid-file")
    if not grabpidfile():
        sys.stderr.write("pidfile is taken, exiting.\n")
        sys.exit(2)
    rconf.pid_file_owned = True

    if not go_daemon:
        return

    x, y = pipe()
    cpid = os.fork()
    if cpid:
        os.close(x)
        sys.exit()
    os.close(y)
    os.setsid()
    dn = os.open(os.devnull, os.O_RDWR)
    for f in (sys.stdin, sys.stdout, sys.stderr):
        os.dup2(dn, f.fileno())

    if not grabpidfile(pid_file + '.tmp'):
        raise GsyncdError("cannot grab temporary pidfile")

    os.rename(pid_file + '.tmp', pid_file)

    # wait for parent to terminate
    # so we can start up with
    # no messing from the dirty
    # ol' bustard
    select((x,), (), ())
    os.close(x)
Beispiel #9
0
    def checkpt_service(self, chan, chkpt, tgt):
        """checkpoint service loop

        monitor and verify checkpoint status for @chkpt, and listen
        for incoming requests for whom we serve a pretty-formatted
        status report"""
        if not chkpt:
            # dummy loop for the case when there is no checkpt set
            while True:
                select([chan], [], [])
                conn, _ = chan.accept()
                conn.send(self.get_extra_info())
                conn.close()
        completed = self._checkpt_param(chkpt, 'completed', xtimish=False)
        if completed:
            completed = tuple(int(x) for x in completed.split('.'))
        while True:
            s,_,_ = select([chan], [], [], (not completed) and 5 or None)
            # either request made and we re-check to not
            # give back stale data, or we still hunting for completion
            if self.native_xtime(tgt) and self.native_xtime(tgt) < self.volmark:
                # indexing has been reset since setting the checkpoint
                status = "is invalid"
            else:
                xtr = self.xtime('.', self.slave)
                if isinstance(xtr, int):
                    raise GsyncdError("slave root directory is unaccessible (%s)",
                                      os.strerror(xtr))
                ncompleted = self.xtime_geq(xtr, tgt)
                if completed and not ncompleted: # stale data
                    logging.warn("completion time %s for checkpoint %s became stale" % \
                                 (self.humantime(*completed), chkpt))
                    completed = None
                    gconf.confdata.delete('checkpoint-completed')
                if ncompleted and not completed: # just reaching completion
                    completed = "%.6f" % time.time()
                    self._set_checkpt_param(chkpt, 'completed', completed, xtimish=False)
                    completed = tuple(int(x) for x in completed.split('.'))
                    logging.info("checkpoint %s completed" % chkpt)
                status = completed and \
                  "completed at " + self.humantime(completed[0]) or \
                  "not reached yet"
            if s:
                conn = None
                try:
                    conn, _ = chan.accept()
                    try:
                        conn.send("  | checkpoint %s %s %s" % (chkpt, status,self.get_extra_info()))
                    except:
                        exc = sys.exc_info()[1]
                        if (isinstance(exc, OSError) or isinstance(exc, IOError)) and \
                           exc.errno == EPIPE:
                            logging.debug('checkpoint client disconnected')
                        else:
                            raise
                finally:
                    if conn:
                        conn.close()
Beispiel #10
0
    def __init__(self, obj, fd_tup):
        (inf, ouf, rw, ww) = fd_tup.split(',')
        repce = RepceServer(obj, int(inf), int(ouf), 1)
        t = syncdutils.Thread(target=lambda: (repce.service_loop(),
                                              syncdutils.finalize()))
        t.start()
        logging.info('Agent listining...')

        select((), (), ())
Beispiel #11
0
    def __init__(self, obj, fd_tup):
        (inf, ouf, rw, ww) = fd_tup.split(',')
        repce = RepceServer(obj, int(inf), int(ouf), 1)
        t = syncdutils.Thread(
            target=lambda: (repce.service_loop(), syncdutils.finalize()))
        t.start()
        logging.info('Agent listining...')

        select((), (), ())
Beispiel #12
0
    def connect_remote(self, go_daemon=None):
        """connect to inner slave url through outer ssh url

        Wrap the connecting utility in ssh.

        Much care is put into daemonizing: in that case
        ssh is started before daemonization, but
        RePCe client is to be created after that (as ssh
        interactive password auth would be defeated by
        a daemonized ssh, while client should be present
        only in the final process). In that case the action
        is taken apart to two parts, this method is ivoked
        once pre-daemon, once post-daemon. Use @go_daemon
        to deiced what part to perform.

        [NB. ATM gluster product does not makes use of interactive
        authentication.]
        """
        if go_daemon == 'done':
            return self.start_fd_client(*self.fd_pair)

        syncdutils.setup_ssh_ctl(tempfile.mkdtemp(prefix='gsyncd-aux-ssh-'),
                                 self.remote_addr, self.inner_rsc.url)

        deferred = go_daemon == 'postconn'
        ret = sup(self,
                  gconf.ssh_command.split() + gconf.ssh_ctl_args +
                  [self.remote_addr],
                  slave=self.inner_rsc.url,
                  deferred=deferred)

        if deferred:
            # send a message to peer so that we can wait for
            # the answer from which we know connection is
            # established and we can proceed with daemonization
            # (doing that too early robs the ssh passwd prompt...)
            # However, we'd better not start the RepceClient
            # before daemonization (that's not preserved properly
            # in daemon), we just do a an ad-hoc linear put/get.
            i, o = ret
            inf = os.fdopen(i)
            repce.send(o, None, '__repce_version__')
            select((inf, ), (), ())
            repce.recv(inf)
            # hack hack hack: store a global reference to the file
            # to save it from getting GC'd which implies closing it
            gconf.permanent_handles.append(inf)
            self.fd_pair = (i, o)
            return 'should'
Beispiel #13
0
    def connect_remote(self, go_daemon=None):
        """connect to inner slave url through outer ssh url

        Wrap the connecting utility in ssh.

        Much care is put into daemonizing: in that case
        ssh is started before daemonization, but
        RePCe client is to be created after that (as ssh
        interactive password auth would be defeated by
        a daemonized ssh, while client should be present
        only in the final process). In that case the action
        is taken apart to two parts, this method is ivoked
        once pre-daemon, once post-daemon. Use @go_daemon
        to deiced what part to perform.

        [NB. ATM gluster product does not makes use of interactive
        authentication.]
        """
        if go_daemon == "done":
            return self.start_fd_client(*self.fd_pair)

        syncdutils.setup_ssh_ctl(tempfile.mkdtemp(prefix="gsyncd-aux-ssh-"), self.remote_addr, self.inner_rsc.url)

        deferred = go_daemon == "postconn"
        ret = sup(
            self,
            gconf.ssh_command.split() + gconf.ssh_ctl_args + [self.remote_addr],
            slave=self.inner_rsc.url,
            deferred=deferred,
        )

        if deferred:
            # send a message to peer so that we can wait for
            # the answer from which we know connection is
            # established and we can proceed with daemonization
            # (doing that too early robs the ssh passwd prompt...)
            # However, we'd better not start the RepceClient
            # before daemonization (that's not preserved properly
            # in daemon), we just do a an ad-hoc linear put/get.
            i, o = ret
            inf = os.fdopen(i)
            repce.send(o, None, "__repce_version__")
            select((inf,), (), ())
            repce.recv(inf)
            # hack hack hack: store a global reference to the file
            # to save it from getting GC'd which implies closing it
            gconf.permanent_handles.append(inf)
            self.fd_pair = (i, o)
            return "should"
Beispiel #14
0
 def tailer():
     while True:
         errstore = cls.errstore.copy()
         try:
             poe, _, _ = select([po.stderr for po in errstore], [], [],
                                1)
         except (ValueError, SelectError):
             continue
         for po in errstore:
             if po.stderr not in poe:
                 continue
             po.lock.acquire()
             try:
                 if po.on_death_row:
                     continue
                 la = errstore[po]
                 try:
                     fd = po.stderr.fileno()
                 except ValueError:  # file is already closed
                     continue
                 l = os.read(fd, 1024)
                 if not l:
                     continue
                 tots = len(l)
                 for lx in la:
                     tots += len(lx)
                 while tots > 1 << 20 and la:
                     tots -= len(la.pop(0))
                 la.append(l)
             finally:
                 po.lock.release()
Beispiel #15
0
 def terminate_geterr(self, fail_on_err=True):
     """kill child, finalize stderr harvesting (unregister
     from errhandler, set up .elines), fail on error if
     asked for
     """
     self.lock.acquire()
     try:
         self.on_death_row = True
     finally:
         self.lock.release()
     elines = self.errstore.pop(self)
     if self.poll() == None:
         self.terminate()
         if self.poll() == None:
             time.sleep(0.1)
             self.kill()
             self.wait()
     while True:
         if not select([self.stderr], [], [], 0.1)[0]:
             break
         b = os.read(self.stderr.fileno(), 1024)
         if b:
             elines.append(b)
         else:
             break
     self.stderr.close()
     self.elines = elines
     if fail_on_err and self.returncode != 0:
         self.errfail()
Beispiel #16
0
 def terminate_geterr(self, fail_on_err = True):
     """kill child, finalize stderr harvesting (unregister
     from errhandler, set up .elines), fail on error if
     asked for
     """
     self.lock.acquire()
     try:
         self.on_death_row = True
     finally:
         self.lock.release()
     elines = self.errstore.pop(self)
     if self.poll() == None:
         self.terminate()
         if self.poll() == None:
             time.sleep(0.1)
             self.kill()
             self.wait()
     while True:
         if not select([self.stderr],[],[],0.1)[0]:
             break
         b = os.read(self.stderr.fileno(), 1024)
         if b:
             elines.append(b)
         else:
             break
     self.stderr.close()
     self.elines = elines
     if fail_on_err and self.returncode != 0:
         self.errfail()
Beispiel #17
0
 def tailer():
     while True:
         errstore = cls.errstore.copy()
         try:
             poe, _, _ = select([po.stderr for po in errstore], [], [], 1)
         except ValueError, selecterror:
             continue
         for po in errstore:
             if po.stderr not in poe:
                 next
             po.lock.acquire()
             try:
                 la = errstore.get(po)
                 if la == None:
                     continue
                 try:
                     fd = po.stderr.fileno()
                 except ValueError:  # file is already closed
                     continue
                 l = os.read(fd, 1024)
                 if not l:
                     continue
                 tots = len(l)
                 for lx in la:
                     tots += len(lx)
                 while tots > 1 << 20 and la:
                     tots -= len(la.pop(0))
                 la.append(l)
             finally:
                 po.lock.release()
Beispiel #18
0
def startup(**kw):
    """set up logging, pidfile grabbing, daemonization"""
    if getattr(gconf, 'pid_file', None) and kw.get('go_daemon') != 'postconn':
        if not grabpidfile():
            sys.stderr.write("pidfile is taken, exiting.\n")
            sys.exit(2)
        gconf.pid_file_owned = True

    if kw.get('go_daemon') == 'should':
        x, y = os.pipe()
        gconf.cpid = os.fork()
        if gconf.cpid:
            os.close(x)
            sys.exit()
        os.close(y)
        os.setsid()
        dn = os.open(os.devnull, os.O_RDWR)
        for f in (sys.stdin, sys.stdout, sys.stderr):
            os.dup2(dn, f.fileno())
        if getattr(gconf, 'pid_file', None):
            if not grabpidfile(gconf.pid_file + '.tmp'):
                raise GsyncdError("cannot grab temporary pidfile")
            os.rename(gconf.pid_file + '.tmp', gconf.pid_file)
        # wait for parent to terminate
        # so we can start up with
        # no messing from the dirty
        # ol' bustard
        select((x,), (), ())
        os.close(x)

    lkw = {}
    if gconf.log_level:
        lkw['level'] = gconf.log_level
    if kw.get('log_file'):
        if kw['log_file'] in ('-', '/dev/stderr'):
            lkw['stream'] = sys.stderr
        elif kw['log_file'] == '/dev/stdout':
            lkw['stream'] = sys.stdout
        else:
            lkw['filename'] = kw['log_file']

    GLogger.setup(label=kw.get('label'), **lkw)

    lkw.update({'saved_label': kw.get('label')})
    gconf.log_metadata = lkw
    gconf.log_exit = True
Beispiel #19
0
    def service_loop(self):
        """start a RePCe server serving self's server

        stop servicing if a timeout is configured and got no
        keep-alime in that inteval
        """
        repce = RepceServer(self.server, sys.stdin, sys.stdout, int(gconf.sync_jobs))
        t = syncdutils.Thread(target=lambda: (repce.service_loop(), syncdutils.finalize()))
        t.start()
        logging.info("slave listening")
        if gconf.timeout and int(gconf.timeout) > 0:
            while True:
                lp = self.server.last_keep_alive
                time.sleep(int(gconf.timeout))
                if lp == self.server.last_keep_alive:
                    logging.info("connection inactive for %d seconds, stopping" % int(gconf.timeout))
                    break
        else:
            select((), (), ())
Beispiel #20
0
    def service_loop(self):
        """start a RePCe server serving self's server

        stop servicing if a timeout is configured and got no
        keep-alime in that inteval
        """
        repce = RepceServer(self.server, sys.stdin, sys.stdout,
                            int(gconf.sync_jobs))
        t = syncdutils.Thread(
            target=lambda: (repce.service_loop(), syncdutils.finalize()))
        t.start()
        logging.info("slave listening")
        if gconf.timeout and int(gconf.timeout) > 0:
            while True:
                lp = self.server.last_keep_alive
                time.sleep(int(gconf.timeout))
                if lp == self.server.last_keep_alive:
                    logging.info(
                        "connection inactive for %d seconds, stopping" %
                        int(gconf.timeout))
                    break
        else:
            select((), (), ())
Beispiel #21
0
 def tailer():
     while True:
         for po in select([po.stderr for po in cls.errstore], [], []):
             po.lock.acquire()
             try:
                 la = cls.errstore.get(po)
                 if la == None:
                     continue
                 l = os.read(po.stderr.fileno(), 1024)
                 tots = len(l)
                 for lx in la:
                     tots += len(lx)
                 while tots > 1 << 20 and la:
                     tots -= len(la.pop(0))
             finally:
                 po.lock.release()
 def tailer():
     while True:
         for po in select([po.stderr for po in cls.errstore], [], []):
             po.lock.acquire()
             try:
                 la = cls.errstore.get(po)
                 if la == None:
                     continue
                 l = os.read(po.stderr.fileno(), 1024)
                 tots = len(l)
                 for lx in la:
                     tots += len(lx)
                 while tots > 1<<20 and la:
                     tots -= len(la.pop(0))
             finally:
                 po.lock.release()
Beispiel #23
0
    def monitor(self):
        """the monitor loop

        Basic logic is a blantantly simple blunt heuristics:
        if spawned client survives 60 secs, it's considered OK.
        This servers us pretty well as it's not vulneralbe to
        any kind of irregular behavior of the child...

        ... well, except for one: if children is hung up on
        waiting for some event, it can survive aeons, still
        will be defunct. So we tweak the above logic to
        expect the worker to send us a signal within 60 secs
        (in the form of closing its end of a pipe). The worker
        does this when it's done with the setup stage
        ready to enter the service loop (note it's the setup
        stage which is vulnerable to hangs -- the full
        blown worker blows up on EPIPE if the net goes down,
        due to the keep-alive thread)
        """
        def sigcont_handler(*a):
            """
            Re-init logging and send group kill signal
            """
            md = gconf.log_metadata
            logging.shutdown()
            lcls = logging.getLoggerClass()
            lcls.setup(label=md.get('saved_label'), **md)
            pid = os.getpid()
            os.kill(-pid, signal.SIGUSR1)
        signal.signal(signal.SIGUSR1, lambda *a: ())
        signal.signal(signal.SIGCONT, sigcont_handler)

        argv = sys.argv[:]
        for o in ('-N', '--no-daemon', '--monitor'):
            while o in argv:
                argv.remove(o)
        argv.extend(('-N', '-p', ''))
        argv.insert(0, os.path.basename(sys.executable))

        self.set_state('starting...')
        ret = 0
        def nwait(p, o=0):
            p2, r = waitpid(p, o)
            if not p2:
                return
            return r
        def exit_signalled(s):
            """ child teminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))
        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1
        conn_timeout = int(gconf.connection_timeout)
        while ret in (0, 1):
            logging.info('-' * conn_timeout)
            logging.info('starting gsyncd worker')
            pr, pw = os.pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
            os.close(pw)
            t0 = time.time()
            so = select((pr,), (), (), conn_timeout)[0]
            os.close(pr)
            if so:
                ret = nwait(cpid, os.WNOHANG)
                if ret != None:
                    logging.debug("worker died before establishing connection")
                else:
                    logging.debug("worker seems to be connected (?? racy check)")
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        if ret != None:
                            logging.debug("worker died in startup phase")
                            break
                        time.sleep(1)
            else:
                logging.debug("worker not confirmed in %d sec, aborting it" % \
                              conn_timeout)
                # relax one SIGTERM by setting a handler that sets back
                # standard handler
                set_term_handler(lambda *a: set_term_handler())
                # give a chance to graceful exit
                os.kill(-os.getpid(), signal.SIGTERM)
                time.sleep(1)
                os.kill(cpid, signal.SIGKILL)
                ret = nwait(cpid)
            if ret == None:
                self.set_state('OK')
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0,1):
                    self.set_state('faulty')
            time.sleep(10)
        self.set_state('inconsistent')
        return ret
Beispiel #24
0
    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master,
                suuid, slavenodes):
        """the monitor loop

        Basic logic is a blantantly simple blunt heuristics:
        if spawned client survives 60 secs, it's considered OK.
        This servers us pretty well as it's not vulneralbe to
        any kind of irregular behavior of the child...

        ... well, except for one: if children is hung up on
        waiting for some event, it can survive aeons, still
        will be defunct. So we tweak the above logic to
        expect the worker to send us a signal within 60 secs
        (in the form of closing its end of a pipe). The worker
        does this when it's done with the setup stage
        ready to enter the service loop (note it's the setup
        stage which is vulnerable to hangs -- the full
        blown worker blows up on EPIPE if the net goes down,
        due to the keep-alive thread)
        """
        if not self.status.get(w[0]['dir'], None):
            self.status[w[0]['dir']] = GeorepStatus(
                gconf.get("state-file"), w[0]['host'], w[0]['dir'],
                w[0]['uuid'], master, "%s::%s" % (slave_host, slave_vol))

        set_monitor_status(gconf.get("state-file"), self.ST_STARTED)
        self.status[w[0]['dir']].set_worker_status(self.ST_INIT)

        ret = 0

        def nwait(p, o=0):
            try:
                p2, r = waitpid(p, o)
                if not p2:
                    return
                return r
            except OSError as e:
                # no child process, this happens if the child process
                # already died and has been cleaned up
                if e.errno == ECHILD:
                    return -1
                else:
                    raise

        def exit_signalled(s):
            """ child teminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1

        conn_timeout = gconf.get("connection-timeout")
        while ret in (0, 1):
            remote_user, remote_host = w[1][0].split("@")
            remote_id = w[1][1]
            # Check the status of the connected slave node
            # If the connected slave node is down then try to connect to
            # different up node.
            current_slave_host = remote_host
            slave_up_hosts = get_up_nodes(slavenodes, gconf.get("ssh-port"))

            if (current_slave_host, remote_id) not in slave_up_hosts:
                if len(slave_up_hosts) > 0:
                    remote_new = random.choice(slave_up_hosts)
                    remote_host = "%s@%s" % (remote_user, remote_new[0])
                    remote_id = remote_new[1]

            # Spawn the worker and agent in lock to avoid fd leak
            self.lock.acquire()

            logging.info(
                lf('starting gsyncd worker',
                   brick=w[0]['dir'],
                   slave_node=remote_host))

            # Couple of pipe pairs for RPC communication b/w
            # worker and changelog agent.

            # read/write end for agent
            (ra, ww) = os.pipe()
            # read/write end for worker
            (rw, wa) = os.pipe()

            # spawn the agent process
            apid = os.fork()
            if apid == 0:
                os.close(rw)
                os.close(ww)
                args_to_agent = argv + [
                    'agent', rconf.args.master, rconf.args.slave,
                    '--local-path', w[0]['dir'], '--local-node', w[0]['host'],
                    '--local-node-id', w[0]['uuid'], '--slave-id', suuid,
                    '--rpc-fd', ','.join(
                        [str(ra), str(wa), str(rw),
                         str(ww)])
                ]

                if rconf.args.config_file is not None:
                    args_to_agent += ['-c', rconf.args.config_file]

                if rconf.args.debug:
                    args_to_agent.append("--debug")

                os.execv(sys.executable, args_to_agent)

            pr, pw = os.pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.close(ra)
                os.close(wa)

                args_to_worker = argv + [
                    'worker', rconf.args.master, rconf.args.slave,
                    '--feedback-fd',
                    str(pw), '--local-path', w[0]['dir'], '--local-node',
                    w[0]['host'], '--local-node-id', w[0]['uuid'],
                    '--slave-id', suuid, '--rpc-fd', ','.join(
                        [str(rw), str(ww), str(ra),
                         str(wa)]), '--subvol-num',
                    str(w[2]), '--resource-remote', remote_host,
                    '--resource-remote-id', remote_id
                ]

                if rconf.args.config_file is not None:
                    args_to_worker += ['-c', rconf.args.config_file]

                if w[3]:
                    args_to_worker.append("--is-hottier")

                if rconf.args.debug:
                    args_to_worker.append("--debug")

                os.execv(sys.executable, args_to_worker)

            cpids.add(cpid)
            agents.add(apid)
            os.close(pw)

            # close all RPC pipes in monitor
            os.close(ra)
            os.close(wa)
            os.close(rw)
            os.close(ww)
            self.lock.release()

            t0 = time.time()
            so = select((pr, ), (), (), conn_timeout)[0]
            os.close(pr)

            if so:
                ret = nwait(cpid, os.WNOHANG)
                ret_agent = nwait(apid, os.WNOHANG)

                if ret_agent is not None:
                    # Agent is died Kill Worker
                    logging.info(
                        lf("Changelog Agent died, Aborting Worker",
                           brick=w[0]['dir']))
                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                    nwait(cpid)
                    nwait(apid)

                if ret is not None:
                    logging.info(
                        lf("worker died before establishing "
                           "connection",
                           brick=w[0]['dir']))
                    nwait(apid)  # wait for agent
                else:
                    logging.debug("worker(%s) connected" % w[0]['dir'])
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        ret_agent = nwait(apid, os.WNOHANG)

                        if ret is not None:
                            logging.info(
                                lf("worker died in startup phase",
                                   brick=w[0]['dir']))
                            nwait(apid)  # wait for agent
                            break

                        if ret_agent is not None:
                            # Agent is died Kill Worker
                            logging.info(
                                lf("Changelog Agent died, Aborting "
                                   "Worker",
                                   brick=w[0]['dir']))
                            errno_wrap(os.kill, [cpid, signal.SIGKILL],
                                       [ESRCH])
                            nwait(cpid)
                            nwait(apid)
                            break

                        time.sleep(1)
            else:
                logging.info(
                    lf(
                        "Worker not confirmed after wait, aborting it. "
                        "Gsyncd invocation on remote slave via SSH or "
                        "gluster master mount might have hung. Please "
                        "check the above logs for exact issue and check "
                        "master or slave volume for errors. Restarting "
                        "master/slave volume accordingly might help.",
                        brick=w[0]['dir'],
                        timeout=conn_timeout))
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                nwait(apid)  # wait for agent
                ret = nwait(cpid)
            if ret is None:
                # If worker dies, agent terminates on EOF.
                # So lets wait for agent first.
                nwait(apid)
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0, 1):
                    self.status[w[0]['dir']].set_worker_status(self.ST_FAULTY)
                    gf_event(EVENT_GEOREP_FAULTY,
                             master_volume=master.volume,
                             master_node=w[0]['host'],
                             master_node_id=w[0]['uuid'],
                             slave_host=slave_host,
                             slave_volume=slave_vol,
                             current_slave_host=current_slave_host,
                             brick_path=w[0]['dir'])
            time.sleep(10)
        self.status[w[0]['dir']].set_worker_status(self.ST_INCON)
        return ret
Beispiel #25
0
    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host):
        """the monitor loop

        Basic logic is a blantantly simple blunt heuristics:
        if spawned client survives 60 secs, it's considered OK.
        This servers us pretty well as it's not vulneralbe to
        any kind of irregular behavior of the child...

        ... well, except for one: if children is hung up on
        waiting for some event, it can survive aeons, still
        will be defunct. So we tweak the above logic to
        expect the worker to send us a signal within 60 secs
        (in the form of closing its end of a pipe). The worker
        does this when it's done with the setup stage
        ready to enter the service loop (note it's the setup
        stage which is vulnerable to hangs -- the full
        blown worker blows up on EPIPE if the net goes down,
        due to the keep-alive thread)
        """

        self.set_state(self.ST_INIT, w)

        ret = 0

        def nwait(p, o=0):
            p2, r = waitpid(p, o)
            if not p2:
                return
            return r

        def exit_signalled(s):
            """ child teminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1

        conn_timeout = int(gconf.connection_timeout)
        while ret in (0, 1):
            remote_host = w[1]
            # Check the status of the connected slave node
            # If the connected slave node is down then try to connect to
            # different up node.
            m = re.match("(ssh|gluster|file):\/\/(.+)@([^:]+):(.+)",
                         remote_host)
            if m:
                current_slave_host = m.group(3)
                slave_up_hosts = get_slave_bricks_status(slave_host, slave_vol)

                if current_slave_host not in slave_up_hosts:
                    if len(slave_up_hosts) > 0:
                        remote_host = "%s://%s@%s:%s" % (m.group(1), m.group(
                            2), random.choice(slave_up_hosts), m.group(4))

            # Spawn the worker and agent in lock to avoid fd leak
            self.lock.acquire()

            logging.info('-' * conn_timeout)
            logging.info('starting gsyncd worker')

            # Couple of pipe pairs for RPC communication b/w
            # worker and changelog agent.

            # read/write end for agent
            (ra, ww) = os.pipe()
            # read/write end for worker
            (rw, wa) = os.pipe()

            # spawn the agent process
            apid = os.fork()
            if apid == 0:
                os.execv(
                    sys.executable, argv + [
                        '--local-path', w[0], '--agent', '--rpc-fd', ','.join(
                            [str(ra), str(wa),
                             str(rw), str(ww)])
                    ])
            pr, pw = os.pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.execv(
                    sys.executable, argv + [
                        '--feedback-fd',
                        str(pw), '--local-path', w[0], '--local-id',
                        '.' + escape(w[0]), '--rpc-fd', ','.join([
                            str(rw), str(ww),
                            str(ra), str(wa)
                        ]), '--resource-remote', remote_host
                    ])

            cpids.add(cpid)
            agents.add(apid)
            os.close(pw)

            # close all RPC pipes in monitor
            os.close(ra)
            os.close(wa)
            os.close(rw)
            os.close(ww)
            self.lock.release()

            t0 = time.time()
            so = select((pr, ), (), (), conn_timeout)[0]
            os.close(pr)

            if so:
                ret = nwait(cpid, os.WNOHANG)
                if ret is not None:
                    logging.info("worker(%s) died before establishing "
                                 "connection" % w[0])
                    nwait(apid)  #wait for agent
                else:
                    logging.debug("worker(%s) connected" % w[0])
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        if ret is not None:
                            logging.info("worker(%s) died in startup "
                                         "phase" % w[0])
                            nwait(apid)  #wait for agent
                            break
                        time.sleep(1)
            else:
                logging.info("worker(%s) not confirmed in %d sec, "
                             "aborting it" % (w[0], conn_timeout))
                os.kill(cpid, signal.SIGKILL)
                nwait(apid)  #wait for agent
                ret = nwait(cpid)
            if ret is None:
                self.set_state(self.ST_STABLE, w)
                #If worker dies, agent terminates on EOF.
                #So lets wait for agent first.
                nwait(apid)
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0, 1):
                    self.set_state(self.ST_FAULTY, w)
            time.sleep(10)
        self.set_state(self.ST_INCON, w)
        return ret
Beispiel #26
0
    def monitor(self, w, argv, cpids, agents):
        """the monitor loop

        Basic logic is a blantantly simple blunt heuristics:
        if spawned client survives 60 secs, it's considered OK.
        This servers us pretty well as it's not vulneralbe to
        any kind of irregular behavior of the child...

        ... well, except for one: if children is hung up on
        waiting for some event, it can survive aeons, still
        will be defunct. So we tweak the above logic to
        expect the worker to send us a signal within 60 secs
        (in the form of closing its end of a pipe). The worker
        does this when it's done with the setup stage
        ready to enter the service loop (note it's the setup
        stage which is vulnerable to hangs -- the full
        blown worker blows up on EPIPE if the net goes down,
        due to the keep-alive thread)
        """

        if gconf.pause_on_start:
            self.set_state(self.ST_INIT_PAUSE, w)
        else:
            self.set_state(self.ST_INIT, w)

        ret = 0

        def nwait(p, o=0):
            p2, r = waitpid(p, o)
            if not p2:
                return
            return r

        def exit_signalled(s):
            """ child teminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1
        conn_timeout = int(gconf.connection_timeout)
        while ret in (0, 1):
            logging.info('-' * conn_timeout)
            logging.info('starting gsyncd worker')

            # Couple of pipe pairs for RPC communication b/w
            # worker and changelog agent.

            # read/write end for agent
            (ra, ww) = os.pipe()
            # read/write end for worker
            (rw, wa) = os.pipe()

            # spawn the agent process
            apid = os.fork()
            if apid == 0:
                os.execv(sys.executable, argv + ['--local-path', w[0],
                                                 '--agent',
                                                 '--rpc-fd',
                                                 ','.join([str(ra), str(wa),
                                                           str(rw), str(ww)])])
            pr, pw = os.pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.execv(sys.executable, argv + ['--feedback-fd', str(pw),
                                                 '--local-path', w[0],
                                                 '--local-id',
                                                 '.' + escape(w[0]),
                                                 '--rpc-fd',
                                                 ','.join([str(rw), str(ww),
                                                           str(ra), str(wa)]),
                                                 '--resource-remote', w[1]])
            self.lock.acquire()
            cpids.add(cpid)
            agents.add(apid)
            self.lock.release()
            os.close(pw)

            t0 = time.time()
            so = select((pr,), (), (), conn_timeout)[0]
            os.close(pr)

            # close all RPC pipes in monitor
            os.close(ra)
            os.close(wa)
            os.close(rw)
            os.close(ww)

            if so:
                ret = nwait(cpid, os.WNOHANG)
                if ret is not None:
                    logging.info("worker(%s) died before establishing "
                                 "connection" % w[0])
                else:
                    logging.debug("worker(%s) connected" % w[0])
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        if ret is not None:
                            logging.info("worker(%s) died in startup "
                                         "phase" % w[0])
                            break
                        time.sleep(1)
            else:
                logging.info("worker(%s) not confirmed in %d sec, "
                             "aborting it" % (w[0], conn_timeout))
                os.kill(cpid, signal.SIGKILL)
                ret = nwait(cpid)
            if ret is None:
                self.set_state(self.ST_STABLE, w)
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0, 1):
                    self.set_state(self.ST_FAULTY, w)
            time.sleep(10)
        self.set_state(self.ST_INCON, w)
        return ret
Beispiel #27
0
    def monitor(self, w, argv, cpids):
        """the monitor loop

        Basic logic is a blantantly simple blunt heuristics:
        if spawned client survives 60 secs, it's considered OK.
        This servers us pretty well as it's not vulneralbe to
        any kind of irregular behavior of the child...

        ... well, except for one: if children is hung up on
        waiting for some event, it can survive aeons, still
        will be defunct. So we tweak the above logic to
        expect the worker to send us a signal within 60 secs
        (in the form of closing its end of a pipe). The worker
        does this when it's done with the setup stage
        ready to enter the service loop (note it's the setup
        stage which is vulnerable to hangs -- the full
        blown worker blows up on EPIPE if the net goes down,
        due to the keep-alive thread)
        """

        self.set_state(self.ST_INIT, w)
        ret = 0

        def nwait(p, o=0):
            p2, r = waitpid(p, o)
            if not p2:
                return
            return r

        def exit_signalled(s):
            """ child teminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1
        conn_timeout = int(gconf.connection_timeout)
        while ret in (0, 1):
            logging.info('-' * conn_timeout)
            logging.info('starting gsyncd worker')
            pr, pw = os.pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.execv(sys.executable, argv + ['--feedback-fd', str(pw),
                                                 '--local-path', w[0],
                                                 '--local-id',
                                                 '.' + escape(w[0]),
                                                 '--resource-remote', w[1]])
            self.lock.acquire()
            cpids.add(cpid)
            self.lock.release()
            os.close(pw)
            t0 = time.time()
            so = select((pr,), (), (), conn_timeout)[0]
            os.close(pr)
            if so:
                ret = nwait(cpid, os.WNOHANG)
                if ret is not None:
                    logging.info("worker(%s) died before establishing "
                                 "connection" % w[0])
                else:
                    logging.debug("worker(%s) connected" % w[0])
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        if ret is not None:
                            logging.info("worker(%s) died in startup "
                                         "phase" % w[0])
                            break
                        time.sleep(1)
            else:
                logging.info("worker(%s) not confirmed in %d sec, "
                             "aborting it" % (w[0], conn_timeout))
                os.kill(cpid, signal.SIGKILL)
                ret = nwait(cpid)
            if ret is None:
                self.set_state(self.ST_STABLE, w)
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0, 1):
                    self.set_state(self.ST_FAULTY, w)
            time.sleep(10)
        self.set_state(self.ST_INCON, w)
        return ret
Beispiel #28
0
    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master):
        """the monitor loop

        Basic logic is a blantantly simple blunt heuristics:
        if spawned client survives 60 secs, it's considered OK.
        This servers us pretty well as it's not vulneralbe to
        any kind of irregular behavior of the child...

        ... well, except for one: if children is hung up on
        waiting for some event, it can survive aeons, still
        will be defunct. So we tweak the above logic to
        expect the worker to send us a signal within 60 secs
        (in the form of closing its end of a pipe). The worker
        does this when it's done with the setup stage
        ready to enter the service loop (note it's the setup
        stage which is vulnerable to hangs -- the full
        blown worker blows up on EPIPE if the net goes down,
        due to the keep-alive thread)
        """
        if not self.status.get(w[0], None):
            self.status[w[0]] = GeorepStatus(gconf.state_file, w[0])

        set_monitor_status(gconf.state_file, self.ST_STARTED)
        self.status[w[0]].set_worker_status(self.ST_INIT)

        ret = 0

        def nwait(p, o=0):
            try:
                p2, r = waitpid(p, o)
                if not p2:
                    return
                return r
            except OSError as e:
                # no child process, this happens if the child process
                # already died and has been cleaned up
                if e.errno == ECHILD:
                    return -1
                else:
                    raise

        def exit_signalled(s):
            """ child teminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1

        conn_timeout = int(gconf.connection_timeout)
        while ret in (0, 1):
            remote_host = w[1]
            # Check the status of the connected slave node
            # If the connected slave node is down then try to connect to
            # different up node.
            m = re.match("(ssh|gluster|file):\/\/(.+)@([^:]+):(.+)",
                         remote_host)
            if m:
                current_slave_host = m.group(3)
                slave_up_hosts = get_slave_bricks_status(
                    slave_host, slave_vol)

                if current_slave_host not in slave_up_hosts:
                    if len(slave_up_hosts) > 0:
                        remote_host = "%s://%s@%s:%s" % (m.group(1),
                                                         m.group(2),
                                                         random.choice(
                                                             slave_up_hosts),
                                                         m.group(4))

            # Spawn the worker and agent in lock to avoid fd leak
            self.lock.acquire()

            logging.info('-' * conn_timeout)
            logging.info('starting gsyncd worker')

            # Couple of pipe pairs for RPC communication b/w
            # worker and changelog agent.

            # read/write end for agent
            (ra, ww) = os.pipe()
            # read/write end for worker
            (rw, wa) = os.pipe()

            # spawn the agent process
            apid = os.fork()
            if apid == 0:
                os.close(rw)
                os.close(ww)
                os.execv(sys.executable, argv + ['--local-path', w[0],
                                                 '--agent',
                                                 '--rpc-fd',
                                                 ','.join([str(ra), str(wa),
                                                           str(rw), str(ww)])])
            pr, pw = os.pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.close(ra)
                os.close(wa)
                os.execv(sys.executable, argv + ['--feedback-fd', str(pw),
                                                 '--local-path', w[0],
                                                 '--local-id',
                                                 '.' + escape(w[0]),
                                                 '--rpc-fd',
                                                 ','.join([str(rw), str(ww),
                                                           str(ra), str(wa)]),
                                                 '--subvol-num', str(w[2])] +
                         (['--is-hottier'] if w[3] else []) +
                         ['--resource-remote', remote_host])

            cpids.add(cpid)
            agents.add(apid)
            os.close(pw)

            # close all RPC pipes in monitor
            os.close(ra)
            os.close(wa)
            os.close(rw)
            os.close(ww)
            self.lock.release()

            t0 = time.time()
            so = select((pr,), (), (), conn_timeout)[0]
            os.close(pr)

            if so:
                ret = nwait(cpid, os.WNOHANG)
                ret_agent = nwait(apid, os.WNOHANG)

                if ret_agent is not None:
                    # Agent is died Kill Worker
                    logging.info("Changelog Agent died, "
                                 "Aborting Worker(%s)" % w[0])
                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                    nwait(cpid)
                    nwait(apid)

                if ret is not None:
                    logging.info("worker(%s) died before establishing "
                                 "connection" % w[0])
                    nwait(apid)  # wait for agent
                else:
                    logging.debug("worker(%s) connected" % w[0])
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        ret_agent = nwait(apid, os.WNOHANG)

                        if ret is not None:
                            logging.info("worker(%s) died in startup "
                                         "phase" % w[0])
                            nwait(apid)  # wait for agent
                            break

                        if ret_agent is not None:
                            # Agent is died Kill Worker
                            logging.info("Changelog Agent died, Aborting "
                                         "Worker(%s)" % w[0])
                            errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                            nwait(cpid)
                            nwait(apid)
                            break

                        time.sleep(1)
            else:
                logging.info("worker(%s) not confirmed in %d sec, "
                             "aborting it" % (w[0], conn_timeout))
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                nwait(apid)  # wait for agent
                ret = nwait(cpid)
            if ret is None:
                self.status[w[0]].set_worker_status(self.ST_STABLE)
                # If worker dies, agent terminates on EOF.
                # So lets wait for agent first.
                nwait(apid)
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0, 1):
                    self.status[w[0]].set_worker_status(self.ST_FAULTY)
            time.sleep(10)
        self.status[w[0]].set_worker_status(self.ST_INCON)
        return ret
Beispiel #29
0
    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master,
                suuid, slavenodes):
        """the monitor loop

        Basic logic is a blantantly simple blunt heuristics:
        if spawned client survives 60 secs, it's considered OK.
        This servers us pretty well as it's not vulneralbe to
        any kind of irregular behavior of the child...

        ... well, except for one: if children is hung up on
        waiting for some event, it can survive aeons, still
        will be defunct. So we tweak the above logic to
        expect the worker to send us a signal within 60 secs
        (in the form of closing its end of a pipe). The worker
        does this when it's done with the setup stage
        ready to enter the service loop (note it's the setup
        stage which is vulnerable to hangs -- the full
        blown worker blows up on EPIPE if the net goes down,
        due to the keep-alive thread)
        """
        if not self.status.get(w[0]['dir'], None):
            self.status[w[0]['dir']] = GeorepStatus(gconf.get("state-file"),
                                                    w[0]['host'],
                                                    w[0]['dir'],
                                                    w[0]['uuid'],
                                                    master,
                                                    "%s::%s" % (slave_host,
                                                                slave_vol))
        ret = 0

        def nwait(p, o=0):
            try:
                p2, r = waitpid(p, o)
                if not p2:
                    return
                return r
            except OSError as e:
                # no child process, this happens if the child process
                # already died and has been cleaned up
                if e.errno == ECHILD:
                    return -1
                else:
                    raise

        def exit_signalled(s):
            """ child terminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1

        conn_timeout = gconf.get("connection-timeout")
        while ret in (0, 1):
            remote_user, remote_host = w[1][0].split("@")
            remote_id = w[1][1]
            # Check the status of the connected slave node
            # If the connected slave node is down then try to connect to
            # different up node.
            current_slave_host = remote_host
            slave_up_hosts = get_up_nodes(slavenodes, gconf.get("ssh-port"))

            if (current_slave_host, remote_id) not in slave_up_hosts:
                if len(slave_up_hosts) > 0:
                    remote_new = random.choice(slave_up_hosts)
                    remote_host = "%s@%s" % (remote_user, remote_new[0])
                    remote_id = remote_new[1]

            # Spawn the worker and agent in lock to avoid fd leak
            self.lock.acquire()

            self.status[w[0]['dir']].set_worker_status(self.ST_INIT)
            logging.info(lf('starting gsyncd worker',
                            brick=w[0]['dir'],
                            slave_node=remote_host))

            # Couple of pipe pairs for RPC communication b/w
            # worker and changelog agent.

            # read/write end for agent
            (ra, ww) = pipe()
            # read/write end for worker
            (rw, wa) = pipe()

            # spawn the agent process
            apid = os.fork()
            if apid == 0:
                os.close(rw)
                os.close(ww)
                args_to_agent = argv + [
                    'agent',
                    rconf.args.master,
                    rconf.args.slave,
                    '--local-path', w[0]['dir'],
                    '--local-node', w[0]['host'],
                    '--local-node-id', w[0]['uuid'],
                    '--slave-id', suuid,
                    '--rpc-fd', ','.join([str(ra), str(wa), str(rw), str(ww)])
                ]

                if rconf.args.config_file is not None:
                    args_to_agent += ['-c', rconf.args.config_file]

                if rconf.args.debug:
                    args_to_agent.append("--debug")

                os.execv(sys.executable, args_to_agent)

            pr, pw = pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.close(ra)
                os.close(wa)

                args_to_worker = argv + [
                    'worker',
                    rconf.args.master,
                    rconf.args.slave,
                    '--feedback-fd', str(pw),
                    '--local-path', w[0]['dir'],
                    '--local-node', w[0]['host'],
                    '--local-node-id', w[0]['uuid'],
                    '--slave-id', suuid,
                    '--rpc-fd',
                    ','.join([str(rw), str(ww), str(ra), str(wa)]),
                    '--subvol-num', str(w[2]),
                    '--resource-remote', remote_host,
                    '--resource-remote-id', remote_id
                ]

                if rconf.args.config_file is not None:
                    args_to_worker += ['-c', rconf.args.config_file]

                if w[3]:
                    args_to_worker.append("--is-hottier")

                if rconf.args.debug:
                    args_to_worker.append("--debug")

                access_mount = gconf.get("access-mount")
                if access_mount:
                    os.execv(sys.executable, args_to_worker)
                else:
                    if unshare_propagation_supported():
                        logging.debug("Worker would mount volume privately")
                        unshare_cmd = ['unshare', '-m', '--propagation',
                                       'private']
                        cmd = unshare_cmd + args_to_worker
                        os.execvp("unshare", cmd)
                    else:
                        logging.debug("Mount is not private. It would be lazy"
                                      " umounted")
                        os.execv(sys.executable, args_to_worker)

            cpids.add(cpid)
            agents.add(apid)
            os.close(pw)

            # close all RPC pipes in monitor
            os.close(ra)
            os.close(wa)
            os.close(rw)
            os.close(ww)
            self.lock.release()

            t0 = time.time()
            so = select((pr,), (), (), conn_timeout)[0]
            os.close(pr)

            if so:
                ret = nwait(cpid, os.WNOHANG)
                ret_agent = nwait(apid, os.WNOHANG)

                if ret_agent is not None:
                    # Agent is died Kill Worker
                    logging.info(lf("Changelog Agent died, Aborting Worker",
                                    brick=w[0]['dir']))
                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                    nwait(cpid)
                    nwait(apid)

                if ret is not None:
                    logging.info(lf("worker died before establishing "
                                    "connection",
                                    brick=w[0]['dir']))
                    nwait(apid)  # wait for agent
                else:
                    logging.debug("worker(%s) connected" % w[0]['dir'])
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        ret_agent = nwait(apid, os.WNOHANG)

                        if ret is not None:
                            logging.info(lf("worker died in startup phase",
                                            brick=w[0]['dir']))
                            nwait(apid)  # wait for agent
                            break

                        if ret_agent is not None:
                            # Agent is died Kill Worker
                            logging.info(lf("Changelog Agent died, Aborting "
                                            "Worker",
                                            brick=w[0]['dir']))
                            errno_wrap(os.kill, [cpid, signal.SIGKILL],
                                       [ESRCH])
                            nwait(cpid)
                            nwait(apid)
                            break

                        time.sleep(1)
            else:
                logging.info(
                    lf("Worker not confirmed after wait, aborting it. "
                       "Gsyncd invocation on remote slave via SSH or "
                       "gluster master mount might have hung. Please "
                       "check the above logs for exact issue and check "
                       "master or slave volume for errors. Restarting "
                       "master/slave volume accordingly might help.",
                       brick=w[0]['dir'],
                       timeout=conn_timeout))
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                nwait(apid)  # wait for agent
                ret = nwait(cpid)
            if ret is None:
                # If worker dies, agent terminates on EOF.
                # So lets wait for agent first.
                nwait(apid)
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0, 1):
                    self.status[w[0]['dir']].set_worker_status(self.ST_FAULTY)
                    gf_event(EVENT_GEOREP_FAULTY,
                             master_volume=master.volume,
                             master_node=w[0]['host'],
                             master_node_id=w[0]['uuid'],
                             slave_host=slave_host,
                             slave_volume=slave_vol,
                             current_slave_host=current_slave_host,
                             brick_path=w[0]['dir'])
            time.sleep(10)
        self.status[w[0]['dir']].set_worker_status(self.ST_INCON)
        return ret