def validate_src_share(self, sender_uuid, sname): url = 'https://' if (self.raw is None): a = Appliance.objects.get(uuid=sender_uuid) url = ('%s%s:%s' % (url, a.ip, a.mgmt_port)) self.raw = APIWrapper(client_id=a.client_id, client_secret=a.client_secret, url=url) return self.raw.api_call(url='shares/%s' % sname)
def validate_src_share(self, sender_uuid, sname): url = "https://" if self.raw is None: a = Appliance.objects.get(uuid=sender_uuid) url = "%s%s:%s" % (url, a.ip, a.mgmt_port) self.raw = APIWrapper(client_id=a.client_id, client_secret=a.client_secret, url=url) # TODO: update url to include senders shareId as sname is now invalid return self.raw.api_call(url="shares/%s" % sname)
def run(self): self.law = APIWrapper() try: so = Service.objects.get(name="replication") config_d = json.loads(so.config) self.listener_port = int(config_d["listener_port"]) nco = NetworkConnection.objects.get(name=config_d["network_interface"]) self.listener_interface = nco.ipaddr except NetworkConnection.DoesNotExist: self.listener_interface = "0.0.0.0" except Exception as e: msg = ( "Failed to fetch network interface for Listner/Broker. " "Exception: %s" % e.__str__() ) return logger.error(msg) try: self.uuid = Appliance.objects.get(current_appliance=True).uuid except Exception as e: msg = ( "Failed to get uuid of current appliance. Aborting. " "Exception: %s" % e.__str__() ) return logger.error(msg) ctx = zmq.Context() frontend = ctx.socket(zmq.ROUTER) frontend.set_hwm(10) frontend.bind("tcp://%s:%d" % (self.listener_interface, self.listener_port)) backend = ctx.socket(zmq.ROUTER) backend.bind("ipc://%s" % settings.REPLICATION.get("ipc_socket")) poller = zmq.Poller() poller.register(frontend, zmq.POLLIN) poller.register(backend, zmq.POLLIN) self.local_receivers = {} iterations = 10 poll_interval = 6000 # 6 seconds msg_count = 0 while True: # This loop may still continue even if replication service # is terminated, as long as data is coming in. socks = dict(poller.poll(timeout=poll_interval)) if frontend in socks and socks[frontend] == zmq.POLLIN: address, command, msg = frontend.recv_multipart() if address not in self.remote_senders: self.remote_senders[address] = 1 else: self.remote_senders[address] += 1 msg_count += 1 if msg_count == 1000: msg_count = 0 for rs, count in self.remote_senders.items(): logger.debug( "Active Receiver: %s. Messages processed:" "%d" % (rs, count) ) if command == "sender-ready": logger.debug("initial greeting from %s" % address) # Start a new receiver and send the appropriate response try: start_nr = True if address in self.local_receivers: start_nr = False ecode = self.local_receivers[address].exitcode if ecode is not None: del self.local_receivers[address] logger.debug( "Receiver(%s) exited. exitcode: " "%s. Forcing removal from broker " "list." % (address, ecode) ) start_nr = True else: msg = ( "Receiver(%s) already exists. " "Will not start a new one." % address ) logger.error(msg) # @todo: There may be a different way to handle # this. For example, we can pass the message to # the active receiver and factor into it's # retry/robust logic. But that is for later. frontend.send_multipart( [address, "receiver-init-error", msg] ) if start_nr: nr = Receiver(address, msg) nr.daemon = True nr.start() logger.debug("New Receiver(%s) started." % address) self.local_receivers[address] = nr continue except Exception as e: msg = ( "Exception while starting the " "new receiver for %s: %s" % (address, e.__str__()) ) logger.error(msg) frontend.send_multipart([address, "receiver-init-error", msg]) else: # do we hit hwm? is the dealer still connected? backend.send_multipart([address, command, msg]) elif backend in socks and socks[backend] == zmq.POLLIN: address, command, msg = backend.recv_multipart() if command == "new-send": rid = int(msg) logger.debug("new-send request received for %d" % rid) rcommand = "ERROR" try: replica = Replica.objects.get(id=rid) if replica.enabled: self._process_send(replica) msg = ( "A new Sender started successfully for " "Replication Task(%d)." % rid ) rcommand = "SUCCESS" else: msg = ( "Failed to start a new Sender for " "Replication " "Task(%d) because it is disabled." % rid ) except Exception as e: msg = ( "Failed to start a new Sender for Replication " "Task(%d). Exception: %s" % (rid, e.__str__()) ) logger.error(msg) finally: backend.send_multipart([address, rcommand, str(msg)]) elif address in self.remote_senders: if command in ( "receiver-ready", "receiver-error", "btrfs-recv-finished", ): # noqa E501 logger.debug("Identitiy: %s command: %s" % (address, command)) backend.send_multipart([address, b"ACK", ""]) # a new receiver has started. reply to the sender that # must be waiting frontend.send_multipart([address, command, msg]) else: iterations -= 1 if iterations == 0: iterations = 10 self._prune_senders() self._delete_receivers() cur_time = time.time() if ( self.trail_prune_time is None or (cur_time - self.trail_prune_time) > 3600 ): # prune send/receive trails every hour or so. self.trail_prune_time = cur_time map(self.prune_replica_trail, Replica.objects.filter()) map(self.prune_receive_trail, ReplicaShare.objects.filter()) logger.debug("Replica trails are truncated successfully.") if os.getppid() != self.ppid: logger.error("Parent exited. Aborting.") ctx.destroy() # do some cleanup of senders before quitting? break
def run(self): logger.debug('Id: %s. Starting a new Receiver for meta: %s' % (self.identity, self.meta)) self.msg = ('Top level exception in receiver') latest_snap = None with self._clean_exit_handler(): self.law = APIWrapper() self.poll = zmq.Poller() self.dealer = self.ctx.socket(zmq.DEALER) self.dealer.setsockopt_string(zmq.IDENTITY, u'%s' % self.identity) self.dealer.set_hwm(10) self.dealer.connect('ipc://%s' % settings.REPLICATION.get('ipc_socket')) self.poll.register(self.dealer, zmq.POLLIN) self.ack = True self.msg = ('Failed to get the sender ip for appliance: %s' % self.sender_id) self.sender_ip = Appliance.objects.get(uuid=self.sender_id).ip if (not self.incremental): self.msg = ('Failed to verify/create share: %s.' % self.sname) self.create_share(self.sname, self.dest_pool) self.msg = ('Failed to create the replica metadata object ' 'for share: %s.' % self.sname) data = { 'share': self.sname, 'appliance': self.sender_ip, 'src_share': self.src_share, } self.rid = self.create_rshare(data) else: self.msg = ('Failed to retreive the replica metadata ' 'object for share: %s.' % self.sname) rso = ReplicaShare.objects.get(share=self.sname) self.rid = rso.id # Find and send the current snapshot to the sender. This will # be used as the start by btrfs-send diff. self.msg = ('Failed to verify latest replication snapshot ' 'on the system.') latest_snap = self._latest_snap(rso) self.msg = ('Failed to create receive trail for rid: %d' % self.rid) data = { 'snap_name': self.snap_name, } self.rtid = self.create_receive_trail(self.rid, data) # delete the share, move the oldest snap to share self.msg = ('Failed to promote the oldest Snapshot to Share.') oldest_snap = get_oldest_snap(self.snap_dir, self.num_retain_snaps, regex='_replication_') if (oldest_snap is not None): self.update_repclone(self.sname, oldest_snap) self.refresh_share_state() self.refresh_snapshot_state() self.msg = ('Failed to prune old Snapshots') self._delete_old_snaps(self.sname, self.snap_dir, self.num_retain_snaps + 1) # TODO: The following should be re-instantiated once we have a # TODO: working method for doing so. see validate_src_share. # self.msg = ('Failed to validate the source share(%s) on ' # 'sender(uuid: %s ' # ') Did the ip of the sender change?' % # (self.src_share, self.sender_id)) # self.validate_src_share(self.sender_id, self.src_share) sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.dest_pool, self.sname)) if (not is_subvol(sub_vol)): self.msg = ('Failed to create parent subvolume %s' % sub_vol) run_command([BTRFS, 'subvolume', 'create', sub_vol]) self.msg = ('Failed to create snapshot directory: %s' % self.snap_dir) run_command(['/usr/bin/mkdir', '-p', self.snap_dir]) snap_fp = ('%s/%s' % (self.snap_dir, self.snap_name)) # If the snapshot already exists, presumably from the previous # attempt and the sender tries to send the same, reply back with # snap_exists and do not start the btrfs-receive if (is_subvol(snap_fp)): logger.debug('Id: %s. Snapshot to be sent(%s) already ' 'exists. Not starting a new receive process' % (self.identity, snap_fp)) self._send_recv('snap-exists') self._sys_exit(0) cmd = [BTRFS, 'receive', self.snap_dir] self.msg = ('Failed to start the low level btrfs receive ' 'command(%s). Aborting.' % cmd) self.rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) self.msg = ('Failed to send receiver-ready') rcommand, rmsg = self._send_recv('receiver-ready', latest_snap or '') if (rcommand is None): logger.error('Id: %s. No response from the broker for ' 'receiver-ready command. Aborting.' % self.identity) self._sys_exit(3) term_commands = ( 'btrfs-send-init-error', 'btrfs-send-unexpected-termination-error', 'btrfs-send-nonzero-termination-error', ) num_tries = 10 poll_interval = 6000 # 6 seconds num_msgs = 0 t0 = time.time() while (True): socks = dict(self.poll.poll(poll_interval)) if (socks.get(self.dealer) == zmq.POLLIN): # reset to wait upto 60(poll_interval x num_tries # milliseconds) for every message num_tries = 10 command, message = self.dealer.recv_multipart() if (command == 'btrfs-send-stream-finished'): # this command concludes fsdata transfer. After this, # btrfs-recev process should be # terminated(.communicate). if (self.rp.poll() is None): self.msg = ('Failed to terminate btrfs-recv ' 'command') out, err = self.rp.communicate() out = out.split('\n') err = err.split('\n') logger.debug('Id: %s. Terminated btrfs-recv. ' 'cmd = %s out = %s err: %s rc: %s' % (self.identity, cmd, out, err, self.rp.returncode)) if (self.rp.returncode != 0): self.msg = ('btrfs-recv exited with unexpected ' 'exitcode(%s). ' % self.rp.returncode) raise Exception(self.msg) data = { 'status': 'succeeded', 'kb_received': self.total_bytes_received / 1024, } self.msg = ('Failed to update receive trail for ' 'rtid: %d' % self.rtid) self.update_receive_trail(self.rtid, data) self._send_recv('btrfs-recv-finished') self.refresh_share_state() self.refresh_snapshot_state() dsize, drate = self.size_report( self.total_bytes_received, t0) logger.debug('Id: %s. Receive complete. Total data ' 'transferred: %s. Rate: %s/sec.' % (self.identity, dsize, drate)) self._sys_exit(0) if (command in term_commands): self.msg = ('Terminal command(%s) received from the ' 'sender. Aborting.' % command) raise Exception(self.msg) if (self.rp.poll() is None): self.rp.stdin.write(message) self.rp.stdin.flush() # @todo: implement advanced credit request system. self.dealer.send_multipart([b'send-more', '']) num_msgs += 1 self.total_bytes_received += len(message) if (num_msgs == 1000): num_msgs = 0 data = { 'status': 'pending', 'kb_received': self.total_bytes_received / 1024, } self.update_receive_trail(self.rtid, data) dsize, drate = self.size_report( self.total_bytes_received, t0) logger.debug('Id: %s. Receiver alive. Data ' 'transferred: %s. Rate: %s/sec.' % (self.identity, dsize, drate)) else: out, err = self.rp.communicate() out = out.split('\n') err = err.split('\n') logger.error('Id: %s. btrfs-recv died unexpectedly. ' 'cmd: %s out: %s. err: %s' % (self.identity, cmd, out, err)) msg = ( 'Low level system error from btrfs receive ' 'command. cmd: %s out: %s err: %s for rtid: %s' % (cmd, out, err, self.rtid)) data = { 'status': 'failed', 'error': msg, } self.msg = ('Failed to update receive trail for ' 'rtid: %d.' % self.rtid) self.update_receive_trail(self.rtid, data) self.msg = msg raise Exception(self.msg) else: num_tries -= 1 msg = ('No response received from the broker. ' 'remaining tries: %d' % num_tries) logger.error('Id: %s. %s' % (self.identity, msg)) if (num_tries == 0): self.msg = ('%s. Terminating the receiver.' % msg) raise Exception(self.msg)
def run(self): self.msg = ('Top level exception in sender: %s' % self.identity) with self._clean_exit_handler(): self.law = APIWrapper() self.poll = zmq.Poller() self._init_greeting() # create a new replica trail if it's the very first time # or if the last one succeeded self.msg = ('Failed to create local replica trail for snap_name:' ' %s. Aborting.' % self.snap_name) self.rt2 = self.create_replica_trail(self.replica.id, self.snap_name) self.rt2_id = self.rt2['id'] # prune old snapshots. self.update_trail = True self.msg = ('Failed to prune old snapshots') share_path = ( '%s%s/.snapshots/%s' % (settings.MNT_PT, self.replica.pool, self.replica.share)) self._delete_old_snaps(share_path) # Refresh replica trail. if (self.rt is not None): self.rt = self._refresh_rt() # create a snapshot only if it's not already from a previous # failed attempt. self.msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name) self.create_snapshot(self.replica.share, self.snap_name) retries_left = 10 poll_interval = 6000 # 6 seconds while (True): socks = dict(self.poll.poll(poll_interval)) if (socks.get(self.send_req) == zmq.POLLIN): # not really necessary because we just want one reply for # now. retries_left = 10 command, reply = self.send_req.recv_multipart() if (command == 'receiver-ready'): if (self.rt is not None): self.rlatest_snap = reply self.rt = self._refresh_rt() logger.debug('Id: %s. command(%s) and message(%s) ' 'received. Proceeding to send fsdata.' % (self.identity, command, reply)) break else: if (command in 'receiver-init-error'): self.msg = ('%s received for %s. extended reply: ' '%s. Aborting.' % (command, self.identity, reply)) elif (command == 'snap-exists'): logger.debug('Id: %s. %s received. Not sending ' 'fsdata' % (self.identity, command)) data = { 'status': 'succeeded', 'error': 'snapshot already exists on the receiver', } # noqa E501 self.msg = ('Failed to update replica status for ' '%s' % self.snap_id) self.update_replica_status(self.rt2_id, data) self._sys_exit(0) else: self.msg = ('unexpected reply(%s) for %s. ' 'extended reply: %s. Aborting' % (command, self.identity, reply)) raise Exception(self.msg) else: retries_left -= 1 logger.debug('Id: %s. No response from receiver. Number ' 'of retry attempts left: %d' % (self.identity, retries_left)) if (retries_left == 0): self.msg = ('Receiver(%s:%d) is unreachable. ' 'Aborting.' % (self.receiver_ip, self.receiver_port)) raise Exception(self.msg) self.send_req.setsockopt(zmq.LINGER, 0) self.send_req.close() self.poll.unregister(self.send_req) self._init_greeting() snap_path = ('%s%s/.snapshots/%s/%s' % (settings.MNT_PT, self.replica.pool, self.replica.share, self.snap_name)) cmd = [BTRFS, 'send', snap_path] if (self.rt is not None): prev_snap = ('%s%s/.snapshots/%s/%s' % (settings.MNT_PT, self.replica.pool, self.replica.share, self.rt.snap_name)) logger.info('Id: %s. Sending incremental replica between ' '%s -- %s' % (self.identity, prev_snap, snap_path)) cmd = [BTRFS, 'send', '-p', prev_snap, snap_path] else: logger.info('Id: %s. Sending full replica: %s' % (self.identity, snap_path)) try: self.sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) fcntl.fcntl(self.sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK) except Exception as e: self.msg = ('Failed to start the low level btrfs send ' 'command(%s). Aborting. Exception: ' % (cmd, e.__str__())) logger.error('Id: %s. %s' % (self.identity, self.msg)) self._send_recv('btrfs-send-init-error') self._sys_exit(3) alive = True num_msgs = 0 t0 = time.time() while (alive): try: if (self.sp.poll() is not None): logger.debug( 'Id: %s. send process finished ' 'for %s. rc: %d. stderr: %s' % (self.identity, self.snap_id, self.sp.returncode, self.sp.stderr.read())) alive = False fs_data = self.sp.stdout.read() except IOError: continue except Exception as e: self.msg = ('Exception occured while reading low ' 'level btrfs ' 'send data for %s. Aborting.' % self.snap_id) if (alive): self.sp.terminate() self.update_trail = True self._send_recv('btrfs-send-unexpected-termination-error') self._sys_exit(3) self.msg = ('Failed to send fsdata to the receiver for %s. ' 'Aborting.' % (self.snap_id)) self.update_trail = True command, message = self._send_recv('', fs_data) self.total_bytes_sent += len(fs_data) num_msgs += 1 if (num_msgs == 1000): num_msgs = 0 dsize, drate = self.size_report(self.total_bytes_sent, t0) logger.debug('Id: %s Sender alive. Data transferred: ' '%s. Rate: %s/sec.' % (self.identity, dsize, drate)) if (command is None or command == 'receiver-error'): # command is None when the remote side vanishes. self.msg = ('Got null or error command(%s) message(%s) ' 'from the Receiver while' ' transmitting fsdata. Aborting.' % (command, message)) raise Exception(message) if (not alive): if (self.sp.returncode != 0): # do we mark failed? command, message = self._send_recv( 'btrfs-send-nonzero-termination-error') else: command, message = self._send_recv( 'btrfs-send-stream-finished') if (os.getppid() != self.ppid): logger.error('Id: %s. Scheduler exited. Sender for %s ' 'cannot go on. ' 'Aborting.' % (self.identity, self.snap_id)) self._sys_exit(3) data = { 'status': 'succeeded', 'kb_sent': self.total_bytes_sent / 1024, } self.msg = ('Failed to update final replica status for %s' '. Aborting.' % self.snap_id) self.update_replica_status(self.rt2_id, data) dsize, drate = self.size_report(self.total_bytes_sent, t0) logger.debug('Id: %s. Send complete. Total data transferred: %s.' ' Rate: %s/sec.' % (self.identity, dsize, drate)) self._sys_exit(0)
def run(self): logger.debug("Id: %s. Starting a new Receiver for meta: %s" % (self.identity, self.meta)) self.msg = "Top level exception in receiver" latest_snap = None with self._clean_exit_handler(): self.law = APIWrapper() self.poll = zmq.Poller() self.dealer = self.ctx.socket(zmq.DEALER) self.dealer.setsockopt_string(zmq.IDENTITY, u"%s" % self.identity) self.dealer.set_hwm(10) self.dealer.connect("ipc://%s" % settings.REPLICATION.get("ipc_socket")) self.poll.register(self.dealer, zmq.POLLIN) self.ack = True self.msg = "Failed to get the sender ip for appliance: %s" % self.sender_id self.sender_ip = Appliance.objects.get(uuid=self.sender_id).ip if not self.incremental: self.msg = "Failed to verify/create share: %s." % self.sname self.create_share(self.sname, self.dest_pool) self.msg = ("Failed to create the replica metadata object " "for share: %s." % self.sname) data = { "share": self.sname, "appliance": self.sender_ip, "src_share": self.src_share, } self.rid = self.create_rshare(data) else: self.msg = ("Failed to retreive the replica metadata " "object for share: %s." % self.sname) rso = ReplicaShare.objects.get(share=self.sname) self.rid = rso.id # Find and send the current snapshot to the sender. This will # be used as the start by btrfs-send diff. self.msg = ( "Failed to verify latest replication snapshot on the system." ) latest_snap = self._latest_snap(rso) self.msg = "Failed to create receive trail for rid: %d" % self.rid data = { "snap_name": self.snap_name, } self.rtid = self.create_receive_trail(self.rid, data) # delete the share, move the oldest snap to share self.msg = "Failed to promote the oldest Snapshot to Share." oldest_snap = get_oldest_snap(self.snap_dir, self.num_retain_snaps, regex="_replication_") if oldest_snap is not None: self.update_repclone(self.sname, oldest_snap) self.refresh_share_state() self.refresh_snapshot_state() self.msg = "Failed to prune old Snapshots" self._delete_old_snaps(self.sname, self.snap_dir, self.num_retain_snaps + 1) # TODO: The following should be re-instantiated once we have a # TODO: working method for doing so. see validate_src_share. # self.msg = ('Failed to validate the source share(%s) on ' # 'sender(uuid: %s ' # ') Did the ip of the sender change?' % # (self.src_share, self.sender_id)) # self.validate_src_share(self.sender_id, self.src_share) sub_vol = "%s%s/%s" % (settings.MNT_PT, self.dest_pool, self.sname) if not is_subvol(sub_vol): self.msg = "Failed to create parent subvolume %s" % sub_vol run_command([BTRFS, "subvolume", "create", sub_vol]) self.msg = "Failed to create snapshot directory: %s" % self.snap_dir run_command(["/usr/bin/mkdir", "-p", self.snap_dir]) snap_fp = "%s/%s" % (self.snap_dir, self.snap_name) # If the snapshot already exists, presumably from the previous # attempt and the sender tries to send the same, reply back with # snap_exists and do not start the btrfs-receive if is_subvol(snap_fp): logger.debug("Id: %s. Snapshot to be sent(%s) already " "exists. Not starting a new receive process" % (self.identity, snap_fp)) self._send_recv("snap-exists") self._sys_exit(0) cmd = [BTRFS, "receive", self.snap_dir] self.msg = ("Failed to start the low level btrfs receive " "command(%s). Aborting." % cmd) self.rp = subprocess.Popen( cmd, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) self.msg = "Failed to send receiver-ready" rcommand, rmsg = self._send_recv("receiver-ready", latest_snap or "") if rcommand is None: logger.error("Id: %s. No response from the broker for " "receiver-ready command. Aborting." % self.identity) self._sys_exit(3) term_commands = ( "btrfs-send-init-error", "btrfs-send-unexpected-termination-error", "btrfs-send-nonzero-termination-error", ) num_tries = 10 poll_interval = 6000 # 6 seconds num_msgs = 0 t0 = time.time() while True: socks = dict(self.poll.poll(poll_interval)) if socks.get(self.dealer) == zmq.POLLIN: # reset to wait upto 60(poll_interval x num_tries # milliseconds) for every message num_tries = 10 command, message = self.dealer.recv_multipart() if command == "btrfs-send-stream-finished": # this command concludes fsdata transfer. After this, # btrfs-recev process should be # terminated(.communicate). if self.rp.poll() is None: self.msg = "Failed to terminate btrfs-recv command" out, err = self.rp.communicate() out = out.split("\n") err = err.split("\n") logger.debug("Id: %s. Terminated btrfs-recv. " "cmd = %s out = %s err: %s rc: %s" % (self.identity, cmd, out, err, self.rp.returncode)) if self.rp.returncode != 0: self.msg = ("btrfs-recv exited with unexpected " "exitcode(%s). " % self.rp.returncode) raise Exception(self.msg) data = { "status": "succeeded", "kb_received": self.total_bytes_received / 1024, } self.msg = ( "Failed to update receive trail for rtid: %d" % self.rtid) self.update_receive_trail(self.rtid, data) self._send_recv("btrfs-recv-finished") self.refresh_share_state() self.refresh_snapshot_state() dsize, drate = self.size_report( self.total_bytes_received, t0) logger.debug("Id: %s. Receive complete. Total data " "transferred: %s. Rate: %s/sec." % (self.identity, dsize, drate)) self._sys_exit(0) if command in term_commands: self.msg = ("Terminal command(%s) received from the " "sender. Aborting." % command) raise Exception(self.msg) if self.rp.poll() is None: self.rp.stdin.write(message) self.rp.stdin.flush() # @todo: implement advanced credit request system. self.dealer.send_multipart([b"send-more", ""]) num_msgs += 1 self.total_bytes_received += len(message) if num_msgs == 1000: num_msgs = 0 data = { "status": "pending", "kb_received": self.total_bytes_received / 1024, } self.update_receive_trail(self.rtid, data) dsize, drate = self.size_report( self.total_bytes_received, t0) logger.debug("Id: %s. Receiver alive. Data " "transferred: %s. Rate: %s/sec." % (self.identity, dsize, drate)) else: out, err = self.rp.communicate() out = out.split("\n") err = err.split("\n") logger.error("Id: %s. btrfs-recv died unexpectedly. " "cmd: %s out: %s. err: %s" % (self.identity, cmd, out, err)) msg = ( "Low level system error from btrfs receive " "command. cmd: %s out: %s err: %s for rtid: %s" % (cmd, out, err, self.rtid)) data = { "status": "failed", "error": msg, } self.msg = ("Failed to update receive trail for " "rtid: %d." % self.rtid) self.update_receive_trail(self.rtid, data) self.msg = msg raise Exception(self.msg) else: num_tries -= 1 msg = ("No response received from the broker. " "remaining tries: %d" % num_tries) logger.error("Id: %s. %s" % (self.identity, msg)) if num_tries == 0: self.msg = "%s. Terminating the receiver." % msg raise Exception(self.msg)
def run(self): self.msg = "Top level exception in sender: %s" % self.identity with self._clean_exit_handler(): self.law = APIWrapper() self.poll = zmq.Poller() self._init_greeting() # create a new replica trail if it's the very first time # or if the last one succeeded self.msg = ("Failed to create local replica trail for snap_name:" " %s. Aborting." % self.snap_name) self.rt2 = self.create_replica_trail(self.replica.id, self.snap_name) self.rt2_id = self.rt2["id"] # prune old snapshots. self.update_trail = True self.msg = "Failed to prune old snapshots" share_path = "%s%s/.snapshots/%s" % ( settings.MNT_PT, self.replica.pool, self.replica.share, ) self._delete_old_snaps(share_path) # Refresh replica trail. if self.rt is not None: self.rt = self._refresh_rt() # create a snapshot only if it's not already from a previous # failed attempt. # TODO: If one does exist we fail which seems harsh as we may be # TODO: able to pickup where we left of depending on the failure. self.msg = "Failed to create snapshot: %s. Aborting." % self.snap_name self.create_snapshot(self.replica.share, self.snap_name) retries_left = settings.REPLICATION.get("max_send_attempts") poll_interval = 6000 # 6 seconds while True: socks = dict(self.poll.poll(poll_interval)) if socks.get(self.send_req) == zmq.POLLIN: # not really necessary because we just want one reply for # now. retries_left = settings.REPLICATION.get( "max_send_attempts") command, reply = self.send_req.recv_multipart() if command == "receiver-ready": if self.rt is not None: self.rlatest_snap = reply self.rt = self._refresh_rt() logger.debug("Id: %s. command(%s) and message(%s) " "received. Proceeding to send fsdata." % (self.identity, command, reply)) break else: if command in "receiver-init-error": self.msg = ("%s received for %s. extended reply: " "%s. Aborting." % (command, self.identity, reply)) elif command == "snap-exists": logger.debug("Id: %s. %s received. Not sending " "fsdata" % (self.identity, command)) data = { "status": "succeeded", "error": "snapshot already exists on the receiver", } # noqa E501 self.msg = ("Failed to update replica status for " "%s" % self.snap_id) self.update_replica_status(self.rt2_id, data) self._sys_exit(0) else: self.msg = ("unexpected reply(%s) for %s. " "extended reply: %s. Aborting" % (command, self.identity, reply)) raise Exception(self.msg) else: retries_left -= 1 logger.debug("Id: %s. No response from receiver. Number " "of retry attempts left: %d" % (self.identity, retries_left)) if retries_left == 0: self.msg = "Receiver(%s:%d) is unreachable. Aborting." % ( self.receiver_ip, self.receiver_port, ) raise Exception(self.msg) self.send_req.setsockopt(zmq.LINGER, 0) self.send_req.close() self.poll.unregister(self.send_req) self._init_greeting() snap_path = "%s%s/.snapshots/%s/%s" % ( settings.MNT_PT, self.replica.pool, self.replica.share, self.snap_name, ) cmd = [BTRFS, "send", snap_path] if self.rt is not None: prev_snap = "%s%s/.snapshots/%s/%s" % ( settings.MNT_PT, self.replica.pool, self.replica.share, self.rt.snap_name, ) logger.info("Id: %s. Sending incremental replica between " "%s -- %s" % (self.identity, prev_snap, snap_path)) cmd = [BTRFS, "send", "-p", prev_snap, snap_path] else: logger.info("Id: %s. Sending full replica: %s" % (self.identity, snap_path)) try: self.sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) fcntl.fcntl(self.sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK) except Exception as e: self.msg = ("Failed to start the low level btrfs send " "command(%s). Aborting. Exception: " % (cmd, e.__str__())) logger.error("Id: %s. %s" % (self.identity, self.msg)) self._send_recv("btrfs-send-init-error") self._sys_exit(3) alive = True num_msgs = 0 t0 = time.time() while alive: try: if self.sp.poll() is not None: logger.debug("Id: %s. send process finished " "for %s. rc: %d. stderr: %s" % ( self.identity, self.snap_id, self.sp.returncode, self.sp.stderr.read(), )) alive = False fs_data = self.sp.stdout.read() except IOError: continue except Exception as e: self.msg = ("Exception occurred while reading low " "level btrfs " "send data for %s. Aborting." % self.snap_id) if alive: self.sp.terminate() self.update_trail = True self._send_recv("btrfs-send-unexpected-termination-error") self._sys_exit(3) self.msg = ("Failed to send fsdata to the receiver for %s. " "Aborting." % (self.snap_id)) self.update_trail = True command, message = self._send_recv("", fs_data) self.total_bytes_sent += len(fs_data) num_msgs += 1 if num_msgs == 1000: num_msgs = 0 dsize, drate = self.size_report(self.total_bytes_sent, t0) logger.debug("Id: %s Sender alive. Data transferred: " "%s. Rate: %s/sec." % (self.identity, dsize, drate)) if command is None or command == "receiver-error": # command is None when the remote side vanishes. self.msg = ("Got null or error command(%s) message(%s) " "from the Receiver while" " transmitting fsdata. Aborting." % (command, message)) raise Exception(message) if not alive: if self.sp.returncode != 0: # do we mark failed? command, message = self._send_recv( "btrfs-send-nonzero-termination-error") else: command, message = self._send_recv( "btrfs-send-stream-finished") if os.getppid() != self.ppid: logger.error("Id: %s. Scheduler exited. Sender for %s " "cannot go on. " "Aborting." % (self.identity, self.snap_id)) self._sys_exit(3) data = { "status": "succeeded", "kb_sent": self.total_bytes_sent / 1024, } self.msg = ("Failed to update final replica status for %s" ". Aborting." % self.snap_id) self.update_replica_status(self.rt2_id, data) dsize, drate = self.size_report(self.total_bytes_sent, t0) logger.debug("Id: %s. Send complete. Total data transferred: %s." " Rate: %s/sec." % (self.identity, dsize, drate)) self._sys_exit(0)