def _delete_old_snaps(self, share_name, share_path, num_retain):
    """Delete old replication snapshots until nothing is left to prune.

    Repeatedly asks ``get_oldest_snap()`` for the oldest snapshot under
    ``share_path`` whose name matches ``_replication_`` and removes it with
    ``self.delete_snapshot()``. Pruning stops as soon as
    ``get_oldest_snap()`` returns ``None`` or a delete reports failure
    (falsy return value).

    This is a loop rather than self-recursion: Python does not eliminate
    tail calls, so one stack frame per deleted snapshot could exceed the
    interpreter's recursion limit when many snapshots have piled up.

    :param share_name: share name handed to ``self.delete_snapshot()``.
    :param share_path: directory handed to ``get_oldest_snap()``.
    :param num_retain: retention count handed to ``get_oldest_snap()``.
    :return: always ``None``.
    """
    while True:
        oldest_snap = get_oldest_snap(share_path, num_retain,
                                      regex='_replication_')
        if oldest_snap is None:
            # Nothing (more) to prune.
            return None
        if not self.delete_snapshot(share_name, oldest_snap):
            # Delete was not confirmed; stop instead of retrying.
            return None
def _delete_old_snaps(self, share_path):
    """Delete old replication snapshots until nothing is left to prune.

    Repeatedly asks ``get_oldest_snap()`` (with ``self.max_snap_retain``
    and the ``_replication_`` name filter) for the oldest snapshot under
    ``share_path`` and removes it from ``self.replica.share`` with
    ``self.delete_snapshot()``. Pruning stops as soon as
    ``get_oldest_snap()`` returns ``None`` or a delete reports failure
    (falsy return value).

    This is a loop rather than self-recursion: Python does not eliminate
    tail calls, so one stack frame per deleted snapshot could exceed the
    interpreter's recursion limit when many snapshots have piled up.

    :param share_path: directory handed to ``get_oldest_snap()``.
    :return: always ``None``.
    """
    while True:
        oldest_snap = get_oldest_snap(share_path, self.max_snap_retain,
                                      regex='_replication_')
        if oldest_snap is None:
            # Nothing (more) to prune.
            return None
        logger.debug('Id: %s. Deleting old snapshot: %s' %
                     (self.identity, oldest_snap))
        # Recorded before the delete; presumably read by the caller's
        # error handling if the delete raises (consumer not visible here).
        self.msg = ('Failed to delete snapshot: %s. Aborting.' % oldest_snap)
        if not self.delete_snapshot(self.replica.share, oldest_snap):
            # Delete was not confirmed; stop instead of retrying.
            return None
def _delete_old_snaps(self, share_path):
    """Delete old replication snapshots until nothing is left to prune.

    Repeatedly asks ``get_oldest_snap()`` (with ``self.max_snap_retain``
    and the ``_replication_`` name filter) for the oldest snapshot under
    ``share_path`` and removes it from ``self.replica.share`` with
    ``self.delete_snapshot()``. Pruning stops as soon as
    ``get_oldest_snap()`` returns ``None`` or a delete reports failure
    (falsy return value).

    This is a loop rather than self-recursion: Python does not eliminate
    tail calls, so one stack frame per deleted snapshot could exceed the
    interpreter's recursion limit when many snapshots have piled up.

    :param share_path: directory handed to ``get_oldest_snap()``.
    :return: always ``None``.
    """
    while True:
        oldest_snap = get_oldest_snap(
            share_path, self.max_snap_retain, regex="_replication_"
        )
        if oldest_snap is None:
            # Nothing (more) to prune.
            return None
        logger.debug(
            "Id: %s. Deleting old snapshot: %s" % (self.identity, oldest_snap)
        )
        # Recorded before the delete; presumably read by the caller's
        # error handling if the delete raises (consumer not visible here).
        self.msg = "Failed to delete snapshot: %s. Aborting." % oldest_snap
        if not self.delete_snapshot(self.replica.share, oldest_snap):
            # Delete was not confirmed; stop instead of retrying.
            return None
self._sys_exit(3) msg = ( "Timeout occured(60 seconds) while waiting for final " "send confirmation from the receiver(%s) for %s. Aborting." % (self.receiver_ip, self.snap_id) ) with self._update_trail_and_quit(msg): ack = self._process_q() end_ts = datetime.utcnow().replace(tzinfo=utc) data = {"status": "succeeded", "kb_sent": self.kb_sent / 1024, "end_ts": end_ts} if ack["msg"] == "receive_error": msg = "Receiver(%s) returned a processing error for " " %s. Check it for more information." % ( self.receiver_ip, self.snap_id, ) data["status"] = "failed" data["error"] = msg data["send_failed"] = end_ts else: share_path = "%s%s/.snapshots/%s" % (settings.MNT_PT, self.replica.pool, self.replica.share) oldest_snap = get_oldest_snap(share_path, 3) if oldest_snap is not None: msg = "Failed to delete snapshot: %s. Aborting." % oldest_snap with self._clean_exit_handler(msg): delete_snapshot(self.replica.share, oldest_snap, logger) msg = "Failed to update final replica status for %s" ". Aborting." % self.snap_id with self._clean_exit_handler(msg): update_replica_status(self.rt2_id, data, logger)
% (self.receiver_ip, self.snap_id)) with self._update_trail_and_quit(msg): ack = self._process_q() end_ts = datetime.utcnow().replace(tzinfo=utc).strftime(settings.SNAP_TS_FORMAT) data = {'status': 'succeeded', 'kb_sent': self.kb_sent / 1024, 'end_ts': end_ts, } if (ack['msg'] == 'receive_error'): msg = ('Receiver(%s) returned a processing error for ' ' %s. Check it for more information.' % (self.receiver_ip, self.snap_id)) data['status'] = 'failed' data['error'] = msg data['send_failed'] = end_ts else: share_path = ('%s%s/.snapshots/%s' % (settings.MNT_PT, self.replica.pool, self.replica.share)) oldest_snap = get_oldest_snap(share_path, self.num_retain_snaps) if (oldest_snap is not None): msg = ('Failed to delete snapshot: %s. Aborting.' % oldest_snap) with self._clean_exit_handler(msg): delete_snapshot(self.replica.share, oldest_snap, logger) msg = ('Failed to update final replica status for %s' '. Aborting.' % self.snap_id) with self._clean_exit_handler(msg): update_replica_status(self.rt2_id, data, logger)
def run(self):
    """Receive a btrfs send stream from the sender and feed ``btrfs receive``.

    Sequence:

    1. Resolve the sender ip from ``self.meta['uuid']``.
    2. Connect a SUB socket (data; 100 ms receive timeout; subscribed to
       ``self.meta['id']``) and a PUSH socket (acks) to the sender.
    3. Full transfer: create the destination share and the replica
       metadata object. Incremental transfer: look up the replica id.
    4. Make sure the parent subvolume exists, send ``snap_exists`` if the
       target snapshot is already a subvolume, start ``btrfs receive`` and
       send ``begin_ok``.
    5. Receive loop: top up the sender's credit when it drops below 5,
       create the snapshot record and receive trail on the first message,
       pipe payloads into ``btrfs receive`` and finish on
       ``END_SUCCESS`` / ``END_FAIL``. After more than 600 consecutive
       receive timeouts ``self._sys_exit(3)`` is called.

    Every fallible step runs inside ``self._clean_exit_handler(msg)``. Its
    implementation is not visible here; presumably it reports ``msg`` and
    terminates the receiver on failure -- TODO confirm.
    """
    msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
           (self.meta['uuid'], self.meta))
    with self._clean_exit_handler(msg):
        self.sender_ip = get_sender_ip(self.meta['uuid'], logger)

    msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
           '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta))
    with self._clean_exit_handler(msg):
        # @todo: add validation
        recv_sub = self.ctx.socket(zmq.SUB)
        recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
        # Receive timeout in milliseconds; expiry raises zmq.error.Again,
        # which the loop below counts.
        recv_sub.RCVTIMEO = 100
        recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

    msg = ('Failed to connect to the sender(%s) on '
           'meta_port(%d). meta: %s. Aborting.' %
           (self.sender_ip, self.meta_port, self.meta))
    with self._clean_exit_handler(msg):
        self.meta_push = self.ctx.socket(zmq.PUSH)
        self.meta_push.connect('tcp://%s:%d' % (self.sender_ip,
                                                self.meta_port))

    sname = ('%s_%s' % (self.sender_id, self.src_share))
    if (not self.incremental):
        msg = ('Failed to verify/create share: %s. meta: %s. '
               'Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            create_share(sname, self.dest_pool, logger)

        msg = ('Failed to create the replica metadata object '
               'for share: %s. meta: %s. Aborting.' %
               (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            data = {'share': sname,
                    'appliance': self.sender_ip,
                    'src_share': self.src_share,
                    'data_port': self.data_port,
                    'meta_port': self.meta_port, }
            self.rid = create_rshare(data, logger)
    else:
        msg = ('Failed to retreive the replica metadata object for '
               'share: %s. meta: %s. Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg):
            self.rid = rshare_id(sname, logger)

    sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.meta['pool'], sname))
    if (not is_subvol(sub_vol)):
        msg = ('Failed to create parent subvolume %s' % sub_vol)
        with self._clean_exit_handler(msg, ack=True):
            run_command([BTRFS, 'subvolume', 'create', sub_vol])

    snap_fp = ('%s/%s' % (sub_vol, self.snap_name))
    # Give this step its own message; it used to report whatever message
    # the previous (unrelated) step had left behind.
    msg = ('Failed to check for/report an existing snapshot(%s). '
           'meta: %s. Aborting.' % (snap_fp, self.meta))
    with self._clean_exit_handler(msg):
        if (is_subvol(snap_fp)):
            ack = {'msg': 'snap_exists',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)
            # NOTE(review): execution continues and btrfs receive is still
            # started after snap_exists is sent -- confirm this is intended.

    cmd = [BTRFS, 'receive', sub_vol]
    msg = ('Failed to start the low level btrfs receive command(%s)'
           '. Aborting.' % (cmd))
    with self._clean_exit_handler(msg, ack=True):
        rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

    msg = ('Failed to send begin_ok to the sender for meta: %s' %
           self.meta)
    with self._clean_exit_handler(msg):
        ack = {'msg': 'begin_ok',
               'id': self.meta['id'], }
        self.meta_push.send_json(ack)

    recv_timeout_counter = 0
    credit = settings.DEFAULT_SEND_CREDIT
    check_credit = True
    while True:
        if (check_credit is True and credit < 5):
            # Running low: grant the sender another batch of messages.
            ack = {'msg': 'send_more',
                   'id': self.meta['id'],
                   'credit': settings.DEFAULT_SEND_CREDIT, }
            self.meta_push.send_json(ack)
            credit = credit + settings.DEFAULT_SEND_CREDIT
            logger.debug('%d KB received for %s' %
                         (int(self.kb_received / 1024), sname))

        try:
            recv_data = recv_sub.recv()
            # Drop the subscription prefix (the meta id) from the payload.
            recv_data = recv_data[len(self.meta['id']):]
            credit = credit - 1
            recv_timeout_counter = 0
            self.kb_received = self.kb_received + len(recv_data)

            if (self.rtid is None):
                # First message of this transfer: register the snapshot
                # and open a receive trail for it.
                msg = ('Failed to create snapshot: %s. Aborting.' %
                       self.snap_name)
                # create a snapshot only if it's not already from a
                # previous failed attempt
                with self._clean_exit_handler(msg, ack=True):
                    create_snapshot(sname, self.snap_name, logger,
                                    snap_type='receiver')

                data = {'snap_name': self.snap_name}
                msg = ('Failed to create receive trail for rid: %d'
                       '. meta: %s' % (self.rid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    self.rtid = create_receive_trail(self.rid, data,
                                                     logger)

            if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                check_credit = False
                ts = datetime.utcnow().replace(tzinfo=utc)
                data = {'kb_received': self.kb_received / 1024, }
                if (recv_data == 'END_SUCCESS'):
                    data['receive_succeeded'] = ts.strftime(
                        settings.SNAP_TS_FORMAT)
                    # delete the share, move the oldest snap to share
                    oldest_snap = get_oldest_snap(sub_vol, 3)
                    if (oldest_snap is not None):
                        snap_path = ('%s/%s' % (sub_vol, oldest_snap))
                        share_path = ('%s%s/%s' %
                                      (settings.MNT_PT, self.dest_pool,
                                       sname))
                        msg = ('Failed to promote the oldest Snapshot(%s) '
                               'to Share(%s)' % (snap_path, share_path))
                        try:
                            pool = Pool.objects.get(name=self.dest_pool)
                            remove_share(pool, sname)
                            set_property(snap_path, 'ro', 'false',
                                         mount=False)
                            run_command(['/usr/bin/rm', '-rf', share_path],
                                        throw=False)
                            shutil.move(snap_path, share_path)
                            set_property(share_path, 'ro', 'true',
                                         mount=False)
                            delete_snapshot(sname, oldest_snap, logger)
                        except Exception:
                            # Promotion is best effort: log (message plus
                            # traceback) and still record the receive.
                            logger.exception(msg)
                else:
                    logger.error('END_FAIL received for meta: %s. '
                                 'Terminating.' % self.meta)
                    rp.terminate()
                    # Formatted like the other timestamps in this method.
                    data['receive_failed'] = ts.strftime(
                        settings.SNAP_TS_FORMAT)
                    data['status'] = 'failed'

                msg = ('Failed to update receive trail for rtid: %d'
                       '. meta: %s' % (self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    update_receive_trail(self.rtid, data, logger)
                break

            if (rp.poll() is None):
                # btrfs receive is still running: forward the payload.
                rp.stdin.write(recv_data)
                rp.stdin.flush()
            else:
                logger.error('It seems the btrfs receive process died'
                             ' unexpectedly.')
                out, err = rp.communicate()
                msg = ('Low level system error from btrfs receive '
                       'command. out: %s err: %s for rtid: %s meta: %s'
                       % (out, err, self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'receive_failed':
                            ts.strftime(settings.SNAP_TS_FORMAT),
                            'status': 'failed',
                            'error': msg, }
                    update_receive_trail(self.rtid, data, logger)
                # NOTE(review): the loop is not left here; unless the
                # handler above terminates the process, later messages
                # take this branch again -- confirm.
        except zmq.error.Again:
            recv_timeout_counter = recv_timeout_counter + 1
            # 600 timeouts x 100 ms RCVTIMEO = 60 seconds of silence.
            if (recv_timeout_counter > 600):
                logger.error('Nothing received in the last 60 seconds '
                             'from the sender for meta: %s. Aborting.'
                             % self.meta)
                self._sys_exit(3)
end_ts = datetime.utcnow().replace(tzinfo=utc).strftime( settings.SNAP_TS_FORMAT) data = { 'status': 'succeeded', 'kb_sent': self.kb_sent / 1024, 'end_ts': end_ts, } if (ack['msg'] == 'receive_error'): msg = ('Receiver(%s) returned a processing error for ' ' %s. Check it for more information.' % (self.receiver_ip, self.snap_id)) data['status'] = 'failed' data['error'] = msg data['send_failed'] = end_ts else: share_path = ( '%s%s/.snapshots/%s' % (settings.MNT_PT, self.replica.pool, self.replica.share)) oldest_snap = get_oldest_snap(share_path, 3) if (oldest_snap is not None): msg = ('Failed to delete snapshot: %s. Aborting.' % oldest_snap) with self._clean_exit_handler(msg): delete_snapshot(self.replica.share, oldest_snap, logger) msg = ('Failed to update final replica status for %s' '. Aborting.' % self.snap_id) with self._clean_exit_handler(msg): update_replica_status(self.rt2_id, data, logger)
def run(self):
    """Receive a btrfs send stream from the sender and feed ``btrfs receive``.

    Sequence:

    1. Resolve the sender ip from ``self.meta['uuid']``.
    2. Connect a SUB socket (data; 100 ms receive timeout; subscribed to
       ``self.meta['id']``) and a PUSH socket (acks) to the sender.
    3. Full transfer: create the destination share and the replica
       metadata object. Incremental transfer: look up the replica id.
    4. Make sure the ``.snapshots/<share>`` parent subvolume exists, send
       ``snap_exists`` if the target snapshot is already a subvolume,
       start ``btrfs receive`` and send ``begin_ok``.
    5. Receive loop: top up the sender's credit when it drops below 5,
       create the snapshot record and receive trail on the first message,
       pipe payloads into ``btrfs receive`` and finish on
       ``END_SUCCESS`` / ``END_FAIL``. After more than 600 consecutive
       receive timeouts ``self._sys_exit(3)`` is called.

    Every fallible step runs inside ``self._clean_exit_handler(msg)``. Its
    implementation is not visible here; presumably it reports ``msg`` and
    terminates the receiver on failure -- TODO confirm.
    """
    msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
           (self.meta['uuid'], self.meta))
    with self._clean_exit_handler(msg):
        self.sender_ip = get_sender_ip(self.meta['uuid'], logger)

    msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
           '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta))
    with self._clean_exit_handler(msg):
        # @todo: add validation
        recv_sub = self.ctx.socket(zmq.SUB)
        recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
        # Receive timeout in milliseconds; expiry raises zmq.error.Again,
        # which the loop below counts.
        recv_sub.RCVTIMEO = 100
        recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

    msg = ('Failed to connect to the sender(%s) on '
           'meta_port(%d). meta: %s. Aborting.' %
           (self.sender_ip, self.meta_port, self.meta))
    with self._clean_exit_handler(msg):
        self.meta_push = self.ctx.socket(zmq.PUSH)
        self.meta_push.connect('tcp://%s:%d' % (self.sender_ip,
                                                self.meta_port))

    sname = ('%s_%s' % (self.sender_id, self.src_share))
    if (not self.incremental):
        msg = ('Failed to verify/create share: %s. meta: %s. '
               'Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            create_share(sname, self.dest_pool, logger)

        msg = ('Failed to create the replica metadata object '
               'for share: %s. meta: %s. Aborting.' %
               (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            data = {'share': sname,
                    'appliance': self.sender_ip,
                    'src_share': self.src_share,
                    'data_port': self.data_port,
                    'meta_port': self.meta_port, }
            self.rid = create_rshare(data, logger)
    else:
        msg = ('Failed to retreive the replica metadata object for '
               'share: %s. meta: %s. Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg):
            self.rid = rshare_id(sname, logger)

    sub_vol = ('%s%s/.snapshots/%s' % (settings.MNT_PT, self.meta['pool'],
                                       sname))
    if (not is_subvol(sub_vol)):
        msg = ('Failed to create parent subvolume %s' % sub_vol)
        with self._clean_exit_handler(msg, ack=True):
            run_command([BTRFS, 'subvolume', 'create', sub_vol])

    snap_fp = ('%s/%s' % (sub_vol, self.snap_name))
    # Give this step its own message; it used to report whatever message
    # the previous (unrelated) step had left behind.
    msg = ('Failed to check for/report an existing snapshot(%s). '
           'meta: %s. Aborting.' % (snap_fp, self.meta))
    with self._clean_exit_handler(msg):
        if (is_subvol(snap_fp)):
            ack = {'msg': 'snap_exists',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)
            # NOTE(review): execution continues and btrfs receive is still
            # started after snap_exists is sent -- confirm this is intended.

    cmd = [BTRFS, 'receive', sub_vol]
    msg = ('Failed to start the low level btrfs receive command(%s)'
           '. Aborting.' % (cmd))
    with self._clean_exit_handler(msg, ack=True):
        rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

    msg = ('Failed to send begin_ok to the sender for meta: %s' %
           self.meta)
    with self._clean_exit_handler(msg):
        ack = {'msg': 'begin_ok',
               'id': self.meta['id'], }
        self.meta_push.send_json(ack)

    recv_timeout_counter = 0
    credit = settings.DEFAULT_SEND_CREDIT
    check_credit = True
    while True:
        if (check_credit is True and credit < 5):
            # Running low: grant the sender another batch of messages.
            ack = {'msg': 'send_more',
                   'id': self.meta['id'],
                   'credit': settings.DEFAULT_SEND_CREDIT, }
            self.meta_push.send_json(ack)
            credit = credit + settings.DEFAULT_SEND_CREDIT
            logger.debug('%d KB received for %s' %
                         (int(self.kb_received / 1024), sname))

        try:
            recv_data = recv_sub.recv()
            # Drop the subscription prefix (the meta id) from the payload.
            recv_data = recv_data[len(self.meta['id']):]
            credit = credit - 1
            recv_timeout_counter = 0
            self.kb_received = self.kb_received + len(recv_data)

            if (self.rtid is None):
                # First message of this transfer: register the snapshot
                # and open a receive trail for it.
                msg = ('Failed to create snapshot: %s. Aborting.' %
                       self.snap_name)
                # create a snapshot only if it's not already from a
                # previous failed attempt
                with self._clean_exit_handler(msg, ack=True):
                    create_snapshot(sname, self.snap_name, logger,
                                    snap_type='receiver')

                data = {'snap_name': self.snap_name}
                msg = ('Failed to create receive trail for rid: %d'
                       '. meta: %s' % (self.rid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    self.rtid = create_receive_trail(self.rid, data,
                                                     logger)

            if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                check_credit = False
                ts = datetime.utcnow().replace(tzinfo=utc)
                data = {'kb_received': self.kb_received / 1024, }
                if (recv_data == 'END_SUCCESS'):
                    data['receive_succeeded'] = ts
                    # delete the share, move the oldest snap to share
                    oldest_snap = get_oldest_snap(sub_vol, 3)
                    if (oldest_snap is not None):
                        snap_path = ('%s/%s' % (sub_vol, oldest_snap))
                        share_path = ('%s%s/%s' %
                                      (settings.MNT_PT, self.dest_pool,
                                       sname))
                        msg = ('Failed to promote the oldest Snapshot(%s) '
                               'to Share(%s)' % (snap_path, share_path))
                        try:
                            pool = Pool.objects.get(name=self.dest_pool)
                            remove_share(pool, sname)
                            set_property(snap_path, 'ro', 'false',
                                         mount=False)
                            run_command(['/usr/bin/rm', '-rf', share_path],
                                        throw=False)
                            shutil.move(snap_path, share_path)
                            set_property(share_path, 'ro', 'true',
                                         mount=False)
                            delete_snapshot(sname, oldest_snap, logger)
                        except Exception:
                            # Promotion is best effort: log (message plus
                            # traceback) and still record the receive.
                            logger.exception(msg)
                else:
                    logger.error('END_FAIL received for meta: %s. '
                                 'Terminating.' % self.meta)
                    rp.terminate()
                    data['receive_failed'] = ts
                    data['status'] = 'failed'

                msg = ('Failed to update receive trail for rtid: %d'
                       '. meta: %s' % (self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    update_receive_trail(self.rtid, data, logger)
                break

            if (rp.poll() is None):
                # btrfs receive is still running: forward the payload.
                rp.stdin.write(recv_data)
                rp.stdin.flush()
            else:
                logger.error('It seems the btrfs receive process died'
                             ' unexpectedly.')
                out, err = rp.communicate()
                msg = ('Low level system error from btrfs receive '
                       'command. out: %s err: %s for rtid: %s meta: %s'
                       % (out, err, self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'receive_failed': ts,
                            'status': 'failed',
                            'error': msg, }
                    update_receive_trail(self.rtid, data, logger)
                # NOTE(review): the loop is not left here; unless the
                # handler above terminates the process, later messages
                # take this branch again -- confirm.
        except zmq.error.Again:
            recv_timeout_counter = recv_timeout_counter + 1
            # 600 timeouts x 100 ms RCVTIMEO = 60 seconds of silence.
            if (recv_timeout_counter > 600):
                logger.error('Nothing received in the last 60 seconds '
                             'from the sender for meta: %s. Aborting.'
                             % self.meta)
                self._sys_exit(3)
def run(self):
    """Run one replication receive: set up, start ``btrfs receive``, pump data.

    Everything after the initial log line runs inside
    ``self._clean_exit_handler()``. ``self.msg`` is re-assigned immediately
    before each fallible operation, so statement order matters; the
    handler (not visible here) presumably reports the current ``self.msg``
    when something raises -- TODO confirm.

    Steps:

    1. Create a DEALER socket identified by ``self.identity``, connect it
       to the ipc socket from ``settings.REPLICATION`` and register it
       with a poller.
    2. Look up the sender ip through ``Appliance``.
    3. Non-incremental: create the share and the replica share record.
       Incremental: load the ``ReplicaShare`` and determine the latest
       snapshot via ``self._latest_snap()``.
    4. Create a receive trail, promote the oldest snapshot (if
       ``get_oldest_snap()`` returns one) and prune old snapshots.
    5. Make sure the parent subvolume and the snapshot directory exist. If
       the snapshot is already present, send ``snap-exists`` and call
       ``self._sys_exit(0)``.
    6. Start ``btrfs receive`` on ``self.snap_dir``, send
       ``receiver-ready`` and process messages from the broker until the
       stream finishes, a terminal command arrives, ``btrfs receive``
       dies, or ten consecutive polls time out.
    """
    logger.debug('Id: %s. Starting a new Receiver for meta: %s' %
                 (self.identity, self.meta))
    self.msg = ('Top level exception in receiver')
    latest_snap = None
    with self._clean_exit_handler():
        self.law = APIWrapper()
        self.poll = zmq.Poller()
        self.dealer = self.ctx.socket(zmq.DEALER)
        self.dealer.setsockopt_string(zmq.IDENTITY, u'%s' % self.identity)
        self.dealer.set_hwm(10)
        self.dealer.connect('ipc://%s' %
                            settings.REPLICATION.get('ipc_socket'))
        self.poll.register(self.dealer, zmq.POLLIN)

        self.ack = True
        self.msg = ('Failed to get the sender ip for appliance: %s' %
                    self.sender_id)
        self.sender_ip = Appliance.objects.get(uuid=self.sender_id).ip

        if (not self.incremental):
            # Full transfer: the share and its replica record are created
            # here.
            self.msg = ('Failed to verify/create share: %s.' % self.sname)
            self.create_share(self.sname, self.dest_pool)

            self.msg = ('Failed to create the replica metadata object '
                        'for share: %s.' % self.sname)
            data = {
                'share': self.sname,
                'appliance': self.sender_ip,
                'src_share': self.src_share,
            }
            self.rid = self.create_rshare(data)
        else:
            self.msg = ('Failed to retreive the replica metadata '
                        'object for share: %s.' % self.sname)
            rso = ReplicaShare.objects.get(share=self.sname)
            self.rid = rso.id
            # Find and send the current snapshot to the sender. This will
            # be used as the start by btrfs-send diff.
            self.msg = ('Failed to verify latest replication snapshot '
                        'on the system.')
            latest_snap = self._latest_snap(rso)

        self.msg = ('Failed to create receive trail for rid: %d' % self.rid)
        data = {
            'snap_name': self.snap_name,
        }
        self.rtid = self.create_receive_trail(self.rid, data)

        # delete the share, move the oldest snap to share
        self.msg = ('Failed to promote the oldest Snapshot to Share.')
        oldest_snap = get_oldest_snap(self.snap_dir, self.num_retain_snaps,
                                      regex='_replication_')
        if (oldest_snap is not None):
            self.update_repclone(self.sname, oldest_snap)
            self.refresh_share_state()
            self.refresh_snapshot_state()

        # Pruning is called with one more than num_retain_snaps.
        self.msg = ('Failed to prune old Snapshots')
        self._delete_old_snaps(self.sname, self.snap_dir,
                               self.num_retain_snaps + 1)

        # TODO: The following should be re-instantiated once we have a
        # TODO: working method for doing so. see validate_src_share.
        # self.msg = ('Failed to validate the source share(%s) on '
        #             'sender(uuid: %s '
        #             ') Did the ip of the sender change?' %
        #             (self.src_share, self.sender_id))
        # self.validate_src_share(self.sender_id, self.src_share)

        sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.dest_pool, self.sname))
        if (not is_subvol(sub_vol)):
            self.msg = ('Failed to create parent subvolume %s' % sub_vol)
            run_command([BTRFS, 'subvolume', 'create', sub_vol])

        self.msg = ('Failed to create snapshot directory: %s' %
                    self.snap_dir)
        run_command(['/usr/bin/mkdir', '-p', self.snap_dir])
        snap_fp = ('%s/%s' % (self.snap_dir, self.snap_name))

        # If the snapshot already exists, presumably from the previous
        # attempt and the sender tries to send the same, reply back with
        # snap_exists and do not start the btrfs-receive
        if (is_subvol(snap_fp)):
            logger.debug('Id: %s. Snapshot to be sent(%s) already '
                         'exists. Not starting a new receive process' %
                         (self.identity, snap_fp))
            self._send_recv('snap-exists')
            self._sys_exit(0)

        cmd = [BTRFS, 'receive', self.snap_dir]
        self.msg = ('Failed to start the low level btrfs receive '
                    'command(%s). Aborting.' % cmd)
        self.rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

        self.msg = ('Failed to send receiver-ready')
        # An empty string is sent when there is no latest snapshot.
        rcommand, rmsg = self._send_recv('receiver-ready', latest_snap or '')
        if (rcommand is None):
            logger.error('Id: %s. No response from the broker for '
                         'receiver-ready command. Aborting.' %
                         self.identity)
            self._sys_exit(3)

        # Commands from the sender that abort this receive.
        term_commands = (
            'btrfs-send-init-error',
            'btrfs-send-unexpected-termination-error',
            'btrfs-send-nonzero-termination-error',
        )
        num_tries = 10
        poll_interval = 6000  # 6 seconds
        num_msgs = 0
        t0 = time.time()
        while (True):
            socks = dict(self.poll.poll(poll_interval))
            if (socks.get(self.dealer) == zmq.POLLIN):
                # reset to wait upto 60(poll_interval x num_tries
                # milliseconds) for every message
                num_tries = 10
                command, message = self.dealer.recv_multipart()
                if (command == 'btrfs-send-stream-finished'):
                    # this command concludes fsdata transfer. After this,
                    # btrfs-recev process should be
                    # terminated(.communicate).
                    if (self.rp.poll() is None):
                        self.msg = ('Failed to terminate btrfs-recv '
                                    'command')
                        out, err = self.rp.communicate()
                        out = out.split('\n')
                        err = err.split('\n')
                        logger.debug('Id: %s. Terminated btrfs-recv. '
                                     'cmd = %s out = %s err: %s rc: %s' %
                                     (self.identity, cmd, out, err,
                                      self.rp.returncode))
                    # A non-zero exit code fails the receive.
                    if (self.rp.returncode != 0):
                        self.msg = ('btrfs-recv exited with unexpected '
                                    'exitcode(%s). ' % self.rp.returncode)
                        raise Exception(self.msg)
                    data = {
                        'status': 'succeeded',
                        'kb_received': self.total_bytes_received / 1024,
                    }
                    self.msg = ('Failed to update receive trail for '
                                'rtid: %d' % self.rtid)
                    self.update_receive_trail(self.rtid, data)

                    self._send_recv('btrfs-recv-finished')
                    self.refresh_share_state()
                    self.refresh_snapshot_state()

                    dsize, drate = self.size_report(
                        self.total_bytes_received, t0)
                    logger.debug('Id: %s. Receive complete. Total data '
                                 'transferred: %s. Rate: %s/sec.' %
                                 (self.identity, dsize, drate))
                    self._sys_exit(0)

                if (command in term_commands):
                    self.msg = ('Terminal command(%s) received from the '
                                'sender. Aborting.' % command)
                    raise Exception(self.msg)

                if (self.rp.poll() is None):
                    # btrfs receive is alive: forward the payload and ask
                    # the sender for the next message.
                    self.rp.stdin.write(message)
                    self.rp.stdin.flush()
                    # @todo: implement advanced credit request system.
                    self.dealer.send_multipart([b'send-more', ''])
                    num_msgs += 1
                    self.total_bytes_received += len(message)
                    # Every 1000 messages: persist progress and log the
                    # transfer rate.
                    if (num_msgs == 1000):
                        num_msgs = 0
                        data = {
                            'status': 'pending',
                            'kb_received': self.total_bytes_received / 1024,
                        }
                        self.update_receive_trail(self.rtid, data)

                        dsize, drate = self.size_report(
                            self.total_bytes_received, t0)
                        logger.debug('Id: %s. Receiver alive. Data '
                                     'transferred: %s. Rate: %s/sec.' %
                                     (self.identity, dsize, drate))
                else:
                    # btrfs receive exited while data was still arriving.
                    out, err = self.rp.communicate()
                    out = out.split('\n')
                    err = err.split('\n')
                    logger.error('Id: %s. btrfs-recv died unexpectedly. '
                                 'cmd: %s out: %s. err: %s' %
                                 (self.identity, cmd, out, err))
                    msg = (
                        'Low level system error from btrfs receive '
                        'command. cmd: %s out: %s err: %s for rtid: %s'
                        % (cmd, out, err, self.rtid))
                    data = {
                        'status': 'failed',
                        'error': msg,
                    }
                    self.msg = ('Failed to update receive trail for '
                                'rtid: %d.' % self.rtid)
                    self.update_receive_trail(self.rtid, data)
                    self.msg = msg
                    raise Exception(self.msg)
            else:
                # Poll timed out without a message from the broker.
                num_tries -= 1
                msg = ('No response received from the broker. '
                       'remaining tries: %d' % num_tries)
                logger.error('Id: %s. %s' % (self.identity, msg))
                if (num_tries == 0):
                    self.msg = ('%s. Terminating the receiver.' % msg)
                    raise Exception(self.msg)
def run(self):
    """Run one replication receive: set up, start ``btrfs receive``, pump data.

    Everything after the initial log line runs inside
    ``self._clean_exit_handler()``. ``self.msg`` is re-assigned immediately
    before each fallible operation, so statement order matters; the
    handler (not visible here) presumably reports the current ``self.msg``
    when something raises -- TODO confirm.

    Steps:

    1. Create a DEALER socket identified by ``self.identity``, connect it
       to the ipc socket from ``settings.REPLICATION`` and register it
       with a poller.
    2. Look up the sender ip through ``Appliance``.
    3. Non-incremental: create the share and the replica share record.
       Incremental: load the ``ReplicaShare`` and determine the latest
       snapshot via ``self._latest_snap()``.
    4. Create a receive trail, promote the oldest snapshot (if
       ``get_oldest_snap()`` returns one) and prune old snapshots.
    5. Make sure the parent subvolume and the snapshot directory exist. If
       the snapshot is already present, send ``snap-exists`` and call
       ``self._sys_exit(0)``.
    6. Start ``btrfs receive`` on ``self.snap_dir``, send
       ``receiver-ready`` and process messages from the broker until the
       stream finishes, a terminal command arrives, ``btrfs receive``
       dies, or ten consecutive polls time out.
    """
    logger.debug("Id: %s. Starting a new Receiver for meta: %s" % (self.identity, self.meta))
    self.msg = "Top level exception in receiver"
    latest_snap = None
    with self._clean_exit_handler():
        self.law = APIWrapper()
        self.poll = zmq.Poller()
        self.dealer = self.ctx.socket(zmq.DEALER)
        self.dealer.setsockopt_string(zmq.IDENTITY, u"%s" % self.identity)
        self.dealer.set_hwm(10)
        self.dealer.connect("ipc://%s" % settings.REPLICATION.get("ipc_socket"))
        self.poll.register(self.dealer, zmq.POLLIN)

        self.ack = True
        self.msg = "Failed to get the sender ip for appliance: %s" % self.sender_id
        self.sender_ip = Appliance.objects.get(uuid=self.sender_id).ip

        if not self.incremental:
            # Full transfer: the share and its replica record are created
            # here.
            self.msg = "Failed to verify/create share: %s." % self.sname
            self.create_share(self.sname, self.dest_pool)

            self.msg = ("Failed to create the replica metadata object "
                        "for share: %s." % self.sname)
            data = {
                "share": self.sname,
                "appliance": self.sender_ip,
                "src_share": self.src_share,
            }
            self.rid = self.create_rshare(data)
        else:
            self.msg = ("Failed to retreive the replica metadata "
                        "object for share: %s." % self.sname)
            rso = ReplicaShare.objects.get(share=self.sname)
            self.rid = rso.id
            # Find and send the current snapshot to the sender. This will
            # be used as the start by btrfs-send diff.
            self.msg = (
                "Failed to verify latest replication snapshot on the system."
            )
            latest_snap = self._latest_snap(rso)

        self.msg = "Failed to create receive trail for rid: %d" % self.rid
        data = {
            "snap_name": self.snap_name,
        }
        self.rtid = self.create_receive_trail(self.rid, data)

        # delete the share, move the oldest snap to share
        self.msg = "Failed to promote the oldest Snapshot to Share."
        oldest_snap = get_oldest_snap(self.snap_dir, self.num_retain_snaps, regex="_replication_")
        if oldest_snap is not None:
            self.update_repclone(self.sname, oldest_snap)
            self.refresh_share_state()
            self.refresh_snapshot_state()

        # Pruning is called with one more than num_retain_snaps.
        self.msg = "Failed to prune old Snapshots"
        self._delete_old_snaps(self.sname, self.snap_dir, self.num_retain_snaps + 1)

        # TODO: The following should be re-instantiated once we have a
        # TODO: working method for doing so. see validate_src_share.
        # self.msg = ('Failed to validate the source share(%s) on '
        #             'sender(uuid: %s '
        #             ') Did the ip of the sender change?' %
        #             (self.src_share, self.sender_id))
        # self.validate_src_share(self.sender_id, self.src_share)

        sub_vol = "%s%s/%s" % (settings.MNT_PT, self.dest_pool, self.sname)
        if not is_subvol(sub_vol):
            self.msg = "Failed to create parent subvolume %s" % sub_vol
            run_command([BTRFS, "subvolume", "create", sub_vol])

        self.msg = "Failed to create snapshot directory: %s" % self.snap_dir
        run_command(["/usr/bin/mkdir", "-p", self.snap_dir])
        snap_fp = "%s/%s" % (self.snap_dir, self.snap_name)

        # If the snapshot already exists, presumably from the previous
        # attempt and the sender tries to send the same, reply back with
        # snap_exists and do not start the btrfs-receive
        if is_subvol(snap_fp):
            logger.debug("Id: %s. Snapshot to be sent(%s) already "
                         "exists. Not starting a new receive process"
                         % (self.identity, snap_fp))
            self._send_recv("snap-exists")
            self._sys_exit(0)

        cmd = [BTRFS, "receive", self.snap_dir]
        self.msg = ("Failed to start the low level btrfs receive "
                    "command(%s). Aborting." % cmd)
        self.rp = subprocess.Popen(
            cmd,
            shell=False,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        self.msg = "Failed to send receiver-ready"
        # An empty string is sent when there is no latest snapshot.
        rcommand, rmsg = self._send_recv("receiver-ready", latest_snap or "")
        if rcommand is None:
            logger.error("Id: %s. No response from the broker for "
                         "receiver-ready command. Aborting."
                         % self.identity)
            self._sys_exit(3)

        # Commands from the sender that abort this receive.
        term_commands = (
            "btrfs-send-init-error",
            "btrfs-send-unexpected-termination-error",
            "btrfs-send-nonzero-termination-error",
        )
        num_tries = 10
        poll_interval = 6000  # 6 seconds
        num_msgs = 0
        t0 = time.time()
        while True:
            socks = dict(self.poll.poll(poll_interval))
            if socks.get(self.dealer) == zmq.POLLIN:
                # reset to wait upto 60(poll_interval x num_tries
                # milliseconds) for every message
                num_tries = 10
                command, message = self.dealer.recv_multipart()
                if command == "btrfs-send-stream-finished":
                    # this command concludes fsdata transfer. After this,
                    # btrfs-recev process should be
                    # terminated(.communicate).
                    if self.rp.poll() is None:
                        self.msg = "Failed to terminate btrfs-recv command"
                        out, err = self.rp.communicate()
                        out = out.split("\n")
                        err = err.split("\n")
                        logger.debug("Id: %s. Terminated btrfs-recv. "
                                     "cmd = %s out = %s err: %s rc: %s"
                                     % (self.identity, cmd, out, err, self.rp.returncode))
                    # A non-zero exit code fails the receive.
                    if self.rp.returncode != 0:
                        self.msg = ("btrfs-recv exited with unexpected "
                                    "exitcode(%s). " % self.rp.returncode)
                        raise Exception(self.msg)
                    data = {
                        "status": "succeeded",
                        "kb_received": self.total_bytes_received / 1024,
                    }
                    self.msg = (
                        "Failed to update receive trail for rtid: %d" % self.rtid)
                    self.update_receive_trail(self.rtid, data)

                    self._send_recv("btrfs-recv-finished")
                    self.refresh_share_state()
                    self.refresh_snapshot_state()

                    dsize, drate = self.size_report(
                        self.total_bytes_received, t0)
                    logger.debug("Id: %s. Receive complete. Total data "
                                 "transferred: %s. Rate: %s/sec."
                                 % (self.identity, dsize, drate))
                    self._sys_exit(0)

                if command in term_commands:
                    self.msg = ("Terminal command(%s) received from the "
                                "sender. Aborting." % command)
                    raise Exception(self.msg)

                if self.rp.poll() is None:
                    # btrfs receive is alive: forward the payload and ask
                    # the sender for the next message.
                    self.rp.stdin.write(message)
                    self.rp.stdin.flush()
                    # @todo: implement advanced credit request system.
                    self.dealer.send_multipart([b"send-more", ""])
                    num_msgs += 1
                    self.total_bytes_received += len(message)
                    # Every 1000 messages: persist progress and log the
                    # transfer rate.
                    if num_msgs == 1000:
                        num_msgs = 0
                        data = {
                            "status": "pending",
                            "kb_received": self.total_bytes_received / 1024,
                        }
                        self.update_receive_trail(self.rtid, data)

                        dsize, drate = self.size_report(
                            self.total_bytes_received, t0)
                        logger.debug("Id: %s. Receiver alive. Data "
                                     "transferred: %s. Rate: %s/sec."
                                     % (self.identity, dsize, drate))
                else:
                    # btrfs receive exited while data was still arriving.
                    out, err = self.rp.communicate()
                    out = out.split("\n")
                    err = err.split("\n")
                    logger.error("Id: %s. btrfs-recv died unexpectedly. "
                                 "cmd: %s out: %s. err: %s"
                                 % (self.identity, cmd, out, err))
                    msg = (
                        "Low level system error from btrfs receive "
                        "command. cmd: %s out: %s err: %s for rtid: %s"
                        % (cmd, out, err, self.rtid))
                    data = {
                        "status": "failed",
                        "error": msg,
                    }
                    self.msg = ("Failed to update receive trail for "
                                "rtid: %d." % self.rtid)
                    self.update_receive_trail(self.rtid, data)
                    self.msg = msg
                    raise Exception(self.msg)
            else:
                # Poll timed out without a message from the broker.
                num_tries -= 1
                msg = ("No response received from the broker. "
                       "remaining tries: %d" % num_tries)
                logger.error("Id: %s. %s" % (self.identity, msg))
                if num_tries == 0:
                    self.msg = "%s. Terminating the receiver." % msg
                    raise Exception(self.msg)