def _update_trail_and_quit(self, msg): try: yield except Exception, e: logger.error(msg) logger.exception(e) try: data = {"status": "failed", "error": msg, "end_ts": datetime.utcnow().replace(tzinfo=utc)} update_replica_status(self.rt2_id, data, logger) except Exception, e: logger.error("Exception occured in cleanup handler") logger.exception(e)
def _update_trail_and_quit(self, msg):
    try:
        yield
    except Exception, e:
        logger.error('%s. Exception: %s' % (msg, e.__str__()))
        try:
            data = {'status': 'failed',
                    'error': msg,
                    'end_ts': datetime.utcnow().replace(
                        tzinfo=utc).strftime(settings.SNAP_TS_FORMAT), }
            update_replica_status(self.rt2_id, data, logger)
        except Exception, e:
            logger.error('Exception occurred in cleanup handler: %s'
                         % e.__str__())
def _update_trail_and_quit(self, msg):
    try:
        yield
    except Exception, e:
        logger.error(msg)
        logger.exception(e)
        try:
            data = {'status': 'failed',
                    'error': msg,
                    'end_ts': datetime.utcnow().replace(tzinfo=utc), }
            update_replica_status(self.rt2_id, data, logger)
        except Exception, e:
            logger.error('Exception occurred in cleanup handler')
            logger.exception(e)
def _update_trail_and_quit(self, msg):
    try:
        yield
    except Exception, e:
        logger.error(msg)
        logger.exception(e)
        try:
            data = {
                'status': 'failed',
                'error': msg,
                'end_ts': datetime.utcnow().replace(tzinfo=utc),
            }
            update_replica_status(self.rt2_id, data, logger)
        except Exception, e:
            logger.error('Exception occurred in cleanup handler')
            logger.exception(e)
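# The bare `yield` above implies these handlers are generator-based context
# managers, presumably decorated with contextlib.contextmanager and entered
# via `with`, as the Sender.run() variants below do. A minimal, self-contained
# sketch of that pattern -- DemoSender and its failure handling are
# illustrative stand-ins, not the real Sender:

from contextlib import contextmanager


class DemoSender(object):

    @contextmanager
    def _update_trail_and_quit(self, msg):
        # Run the wrapped block; on failure, report the supplied message
        # instead of propagating the error.
        try:
            yield
        except Exception as e:
            print('%s Exception: %s' % (msg, e))

    def run(self):
        with self._update_trail_and_quit('Failed to reach the receiver.'):
            raise IOError('network unreachable')  # handled by the manager


# DemoSender().run() prints the failure message and returns normally.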
def run(self):
    while True:
        try:
            self.rep_ip = self._replication_interface()
            self.uuid = self._my_uuid()
            break
        except:
            msg = ('Failed to get replication interface or uuid. '
                   'Aborting.')
            return logger.error(msg)

    ctx = zmq.Context()
    # fs diffs are sent via this publisher.
    rep_pub = ctx.socket(zmq.PUB)
    rep_pub.bind('tcp://%s:%d' % (self.rep_ip, self.data_port))

    # synchronization messages are received in this pull socket
    meta_pull = ctx.socket(zmq.PULL)
    meta_pull.RCVTIMEO = 100
    meta_pull.bind('tcp://%s:%d' % (self.rep_ip, self.meta_port))

    total_sleep = 0
    while True:
        if (os.getppid() != self.ppid):
            logger.error('Parent exited. Aborting.')
            break

        while (not self.pubq.empty()):
            msg = self.pubq.get()
            rep_pub.send(msg)

        # check for any recv's coming
        num_msgs = 0
        while (num_msgs < 1000):
            try:
                self.recv_meta = meta_pull.recv_json()
                num_msgs = num_msgs + 1
                snap_id = self.recv_meta['id']
                if (self.recv_meta['msg'] == 'begin'):
                    rw = Receiver(self.recv_meta)
                    self.receivers[snap_id] = rw
                    rw.start()
                elif (snap_id not in self.senders):
                    logger.error('Unknown snap_id(%s) received. Ignoring'
                                 % snap_id)
                else:
                    self.senders[snap_id].q.put(self.recv_meta)
            except zmq.error.Again:
                break

        self._prune_workers((self.receivers, self.senders))

        if (int(time.time()) - self.prune_time > 3600):
            self.prune_time = int(time.time())
            for rs in ReplicaShare.objects.all():
                prune_receive_trail(rs.id, logger)
            for r in Replica.objects.all():
                prune_replica_trail(r.id, logger)

        if (total_sleep >= 60 and len(self.senders) < 50):
            try:
                for r in get_replicas(logger):
                    rt = get_replica_trail(r.id, logger)
                    now = datetime.utcnow().replace(second=0,
                                                    microsecond=0,
                                                    tzinfo=utc)
                    sw = None
                    snap_name = 'replication'
                    rt2 = ReplicaTrail.objects.filter().order_by('-id')
                    if (len(rt2) != 0):
                        snap_name = ('%s_%d' % (snap_name, rt2[0].id + 1))
                    else:
                        snap_name = ('%s_1' % snap_name)
                    snap_id = ('%s_%s_%s_%s' %
                               (self.uuid, r.pool, r.share, snap_name))
                    if (len(rt) == 0):
                        logger.debug('new sender for snap: %s' % snap_id)
                        sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                    snap_name, self.meta_port,
                                    self.data_port, r.meta_port,
                                    self.uuid, snap_id)
                    elif (rt[0].status == 'succeeded'):
                        if (((now - rt[0].end_ts).total_seconds() >
                             (r.frequency * 60))):
                            logger.debug('incremental sender for snap: %s'
                                         % snap_id)
                            sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                        snap_name, self.meta_port,
                                        self.data_port, r.meta_port,
                                        self.uuid, snap_id, rt[0])
                        else:
                            continue
                    elif (rt[0].status == 'pending'):
                        prev_snap_id = ('%s_%s_%s_%s' %
                                        (self.uuid, r.pool, r.share,
                                         rt[0].snap_name))
                        if (prev_snap_id in self.senders):
                            logger.debug('send process ongoing for snap: '
                                         '%s' % snap_id)
                            continue
                        logger.debug('%s not found in senders. Previous '
                                     'sender must have Aborted. Marking '
                                     'it as failed' % prev_snap_id)
                        msg = ('Sender process Aborted. See logs for '
                               'more information')
                        data = {'status': 'failed',
                                'end_ts': now.strftime(
                                    settings.SNAP_TS_FORMAT),
                                'error': msg,
                                'send_failed': now, }
                        update_replica_status(rt[0].id, data, logger)
                        continue
                    elif (rt[0].status == 'failed'):
                        snap_name = rt[0].snap_name
                        # if num_failed attempts > 10, disable the replica
                        num_tries = 0
                        for rto in rt:
                            if (rto.status != 'failed' or
                                    num_tries >= self.MAX_ATTEMPTS or
                                    rto.end_ts < r.ts):
                                break
                            num_tries = num_tries + 1
                        if (num_tries >= self.MAX_ATTEMPTS):
                            logger.info('Maximum attempts(%d) reached '
                                        'for snap: %s. Disabling the '
                                        'replica.' %
                                        (self.MAX_ATTEMPTS, snap_id))
                            disable_replica(r.id, logger)
                            continue
                        logger.info('previous backup failed for snap: '
                                    '%s. Starting a new one. Attempt '
                                    '%d/%d.' % (snap_id, num_tries,
                                                self.MAX_ATTEMPTS))
                        prev_rt = None
                        for rto in rt:
                            if (rto.status == 'succeeded'):
                                prev_rt = rto
                                break
                        sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                    snap_name, self.meta_port,
                                    self.data_port, r.meta_port,
                                    self.uuid, snap_id, prev_rt)
                    else:
                        logger.error('unknown replica trail status: %s. '
                                     'ignoring snap: %s' %
                                     (rt[0].status, snap_id))
                        continue
                    self.senders[snap_id] = sw
                    sw.daemon = True
                    sw.start()
                total_sleep = 0
            except DatabaseError, e:
                e_msg = ('Error getting the list of enabled replica '
                         'tasks. Moving on')
                logger.error(e_msg)
                logger.exception(e)
        time.sleep(1)
        total_sleep = total_sleep + 1
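# The counterpart endpoints are not shown in this excerpt. Presumably a remote
# appliance connects a PUSH socket to meta_port to deliver the JSON control
# messages consumed by meta_pull, and a SUB socket to data_port to receive
# what rep_pub publishes. A minimal sketch under those assumptions -- the host,
# port numbers, and snap_id topic value are made-up placeholders:

import zmq

ctx = zmq.Context()

# Control channel: push JSON metadata to the broker's PULL socket.
meta_push = ctx.socket(zmq.PUSH)
meta_push.connect('tcp://192.168.1.10:10002')  # broker meta_port (example)
meta_push.send_json({'id': 'uuid_pool_share_snap', 'msg': 'begin'})

# Data channel: subscribe to the fs-diff stream published by rep_pub.
# Published messages are prefixed with the snap_id, so it doubles as the topic.
sub = ctx.socket(zmq.SUB)
sub.connect('tcp://192.168.1.10:10001')  # broker data_port (example)
sub.setsockopt(zmq.SUBSCRIBE, 'uuid_pool_share_snap')
chunk = sub.recv()  # blocks until the broker publishes for this snap_id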
def run(self): msg = "Failed to connect to receiver(%s) on meta port" "(%d) for snap_name: %s. Aborting." % ( self.receiver_ip, self.rmeta_port, self.snap_name, ) with self._clean_exit_handler(msg): meta_push = self.ctx.socket(zmq.PUSH) meta_push.connect("tcp://%s:%d" % (self.receiver_ip, self.rmeta_port)) # 1. create a new replica trail if it's the very first time # of if the last one succeeded msg = "Failed to create local replica trail for snap_name:" " %s. Aborting." % self.snap_name with self._clean_exit_handler(msg): self.rt2 = create_replica_trail(self.replica.id, self.snap_name, logger) self.rt2_id = self.rt2["id"] # 2. create a snapshot only if it's not already from a previous # failed attempt. msg = "Failed to create snapshot: %s. Aborting." % self.snap_name with self._clean_exit_handler(msg): create_snapshot(self.replica.share, self.snap_name, logger) # let the receiver know that following diff is coming msg = ( "Failed to send initial metadata communication to the " "receiver(%s), most likely due to a network error. Aborting." % self.receiver_ip ) with self._update_trail_and_quit(msg): meta_push.send_json(self.meta_begin) msg = ( "Timeout occured(60 seconds) while waiting for OK " "from the receiver(%s) to start sending data. Aborting." % self.receiver_ip ) with self._update_trail_and_quit(msg): ack = self._process_q() if ack["msg"] == "snap_exists": data = { "status": "succeeded", "end_ts": datetime.utcnow().replace(tzinfo=utc), "error": "snapshot already exists on the receiver", } msg = "Failed to update replica status for snap_name: %s. " "Aborting." % self.snap_name with self._clean_exit_handler(msg): update_replica_status(self.rt2_id, data, logger) self._sys_exit(0) snap_path = "%s%s/.snapshots/%s/%s" % (settings.MNT_PT, self.replica.pool, self.replica.share, self.snap_name) cmd = [BTRFS, "send", snap_path] if self.rt is not None: prev_snap = "%s%s/.snapshots/%s/%s" % ( settings.MNT_PT, self.replica.pool, self.replica.share, self.rt.snap_name, ) logger.info("Sending incremental replica between %s -- %s" % (prev_snap, snap_path)) cmd = [BTRFS, "send", "-p", prev_snap, snap_path] else: logger.info("Sending full replica: %s" % snap_path) try: sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK) except Exception, e: msg = "Failed to start the low level btrfs send " "command(%s). Aborting" % cmd logger.error(msg) logger.exception(e) with self._update_trail_and_quit(msg): self.pub.put("%sEND_FAIL" % self.snap_id) self._sys_exit(3)
            self._sys_exit(3)

    msg = (
        "Timeout occurred(60 seconds) while waiting for final "
        "send confirmation from the receiver(%s) for %s. Aborting."
        % (self.receiver_ip, self.snap_id)
    )
    with self._update_trail_and_quit(msg):
        ack = self._process_q()

    end_ts = datetime.utcnow().replace(tzinfo=utc)
    data = {"status": "succeeded", "kb_sent": self.kb_sent / 1024, "end_ts": end_ts}
    if ack["msg"] == "receive_error":
        msg = "Receiver(%s) returned a processing error for " " %s. Check it for more information." % (
            self.receiver_ip,
            self.snap_id,
        )
        data["status"] = "failed"
        data["error"] = msg
        data["send_failed"] = end_ts
    else:
        share_path = "%s%s/.snapshots/%s" % (settings.MNT_PT, self.replica.pool, self.replica.share)
        oldest_snap = get_oldest_snap(share_path, 3)
        if oldest_snap is not None:
            msg = "Failed to delete snapshot: %s. Aborting." % oldest_snap
            with self._clean_exit_handler(msg):
                delete_snapshot(self.replica.share, oldest_snap, logger)

    msg = "Failed to update final replica status for %s" ". Aborting." % self.snap_id
    with self._clean_exit_handler(msg):
        update_replica_status(self.rt2_id, data, logger)
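# The chunk-forwarding loop that runs between starting `btrfs send` and the
# final acknowledgement above is not included in this excerpt. A self-contained
# sketch (not the Sender code itself) of the non-blocking read pattern set up
# with fcntl/O_NONBLOCK: poll the child process, forward whatever stdout data
# is available, and stop once the process has exited and the pipe is drained.
# The command, the forward callable, and the byte counting are illustrative.

import fcntl
import os
import subprocess


def stream_command_output(cmd, forward):
    sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
    # Non-blocking reads: with Python 2 file objects, read() raises
    # IOError(EAGAIN) when no data is ready and returns '' at EOF.
    fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
    bytes_sent = 0
    while True:
        try:
            data = sp.stdout.read()
        except IOError:
            continue
        if data:
            bytes_sent += len(data)
            forward(data)
        elif sp.poll() is not None:
            # process exited and the pipe is drained
            return sp.returncode, bytes_sent


# Example: forward each chunk to a queue, much as the Sender hands data to
# its publisher queue.
# rc, sent = stream_command_output(['/bin/cat', '/etc/hostname'], my_queue.put)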
def run(self):
    msg = ('Failed to connect to receiver(%s) on meta port'
           '(%d) for snap_name: %s. Aborting.' %
           (self.receiver_ip, self.rmeta_port, self.snap_name))
    with self._clean_exit_handler(msg):
        meta_push = self.ctx.socket(zmq.PUSH)
        meta_push.connect('tcp://%s:%d' % (self.receiver_ip,
                                           self.rmeta_port))

    # 1. create a new replica trail if it's the very first time
    # or if the last one succeeded
    msg = ('Failed to create local replica trail for snap_name:'
           ' %s. Aborting.' % self.snap_name)
    with self._clean_exit_handler(msg):
        self.rt2 = create_replica_trail(self.replica.id, self.snap_name,
                                        logger)
        self.rt2_id = self.rt2['id']

    # 2. create a snapshot only if it's not already from a previous
    # failed attempt.
    msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name)
    with self._clean_exit_handler(msg):
        create_snapshot(self.replica.share, self.snap_name, logger)

    # let the receiver know that following diff is coming
    msg = ('Failed to send initial metadata communication to the '
           'receiver(%s), most likely due to a network error. Aborting.'
           % self.receiver_ip)
    with self._update_trail_and_quit(msg):
        meta_push.send_json(self.meta_begin)

    msg = ('Timeout occurred(60 seconds) while waiting for OK '
           'from the receiver(%s) to start sending data. Aborting.'
           % self.receiver_ip)
    with self._update_trail_and_quit(msg):
        ack = self._process_q()

    if (ack['msg'] == 'snap_exists'):
        data = {'status': 'succeeded',
                'end_ts': datetime.utcnow().replace(
                    tzinfo=utc).strftime(settings.SNAP_TS_FORMAT),
                'error': 'snapshot already exists on the receiver', }
        msg = ('Failed to update replica status for snap_name: %s. '
               'Aborting.' % self.snap_name)
        with self._clean_exit_handler(msg):
            update_replica_status(self.rt2_id, data, logger)
        self._sys_exit(0)

    snap_path = ('%s%s/.snapshots/%s/%s' %
                 (settings.MNT_PT, self.replica.pool, self.replica.share,
                  self.snap_name))
    cmd = [BTRFS, 'send', snap_path]
    if (self.rt is not None):
        prev_snap = ('%s%s/.snapshots/%s/%s' %
                     (settings.MNT_PT, self.replica.pool,
                      self.replica.share, self.rt.snap_name))
        logger.info('Sending incremental replica between %s -- %s' %
                    (prev_snap, snap_path))
        cmd = [BTRFS, 'send', '-p', prev_snap, snap_path]
    else:
        logger.info('Sending full replica: %s' % snap_path)

    try:
        sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
    except Exception, e:
        msg = ('Failed to start the low level btrfs send '
               'command(%s). Aborting. Exception: %s' %
               (cmd, e.__str__()))
        logger.error(msg)
        with self._update_trail_and_quit(msg):
            self.pub.put('%sEND_FAIL' % self.snap_id)
        self._sys_exit(3)
def run(self):
    msg = ('Failed to connect to receiver(%s) on meta port'
           '(%d) for snap_name: %s. Aborting.' %
           (self.receiver_ip, self.rmeta_port, self.snap_name))
    with self._clean_exit_handler(msg):
        meta_push = self.ctx.socket(zmq.PUSH)
        meta_push.connect('tcp://%s:%d' % (self.receiver_ip,
                                           self.rmeta_port))

    # 1. create a new replica trail if it's the very first time
    # or if the last one succeeded
    msg = ('Failed to create local replica trail for snap_name:'
           ' %s. Aborting.' % self.snap_name)
    with self._clean_exit_handler(msg):
        self.rt2 = create_replica_trail(self.replica.id, self.snap_name,
                                        logger)
        self.rt2_id = self.rt2['id']

    # 2. create a snapshot only if it's not already from a previous
    # failed attempt.
    msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name)
    with self._clean_exit_handler(msg):
        create_snapshot(self.replica.share, self.snap_name, logger)

    # let the receiver know that following diff is coming
    msg = ('Failed to send initial metadata communication to the '
           'receiver(%s), most likely due to a network error. Aborting.'
           % self.receiver_ip)
    with self._update_trail_and_quit(msg):
        meta_push.send_json(self.meta_begin)

    msg = ('Timeout occurred(60 seconds) while waiting for OK '
           'from the receiver(%s) to start sending data. Aborting.'
           % self.receiver_ip)
    with self._update_trail_and_quit(msg):
        ack = self._process_q()

    if (ack['msg'] == 'snap_exists'):
        data = {
            'status': 'succeeded',
            'end_ts': datetime.utcnow().replace(tzinfo=utc).strftime(
                settings.SNAP_TS_FORMAT),
            'error': 'snapshot already exists on the receiver',
        }
        msg = ('Failed to update replica status for snap_name: %s. '
               'Aborting.' % self.snap_name)
        with self._clean_exit_handler(msg):
            update_replica_status(self.rt2_id, data, logger)
        self._sys_exit(0)

    snap_path = ('%s%s/.snapshots/%s/%s' %
                 (settings.MNT_PT, self.replica.pool, self.replica.share,
                  self.snap_name))
    cmd = [BTRFS, 'send', snap_path]
    if (self.rt is not None):
        prev_snap = ('%s%s/.snapshots/%s/%s' %
                     (settings.MNT_PT, self.replica.pool,
                      self.replica.share, self.rt.snap_name))
        logger.info('Sending incremental replica between %s -- %s' %
                    (prev_snap, snap_path))
        cmd = [BTRFS, 'send', '-p', prev_snap, snap_path]
    else:
        logger.info('Sending full replica: %s' % snap_path)

    try:
        sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
    except Exception, e:
        msg = ('Failed to start the low level btrfs send '
               'command(%s). Aborting' % cmd)
        logger.error(msg)
        logger.exception(e)
        with self._update_trail_and_quit(msg):
            self.pub.put('%sEND_FAIL' % self.snap_id)
        self._sys_exit(3)
    end_ts = datetime.utcnow().replace(tzinfo=utc).strftime(
        settings.SNAP_TS_FORMAT)
    data = {
        'status': 'succeeded',
        'kb_sent': self.kb_sent / 1024,
        'end_ts': end_ts,
    }
    if (ack['msg'] == 'receive_error'):
        msg = ('Receiver(%s) returned a processing error for '
               ' %s. Check it for more information.' %
               (self.receiver_ip, self.snap_id))
        data['status'] = 'failed'
        data['error'] = msg
        data['send_failed'] = end_ts
    else:
        share_path = ('%s%s/.snapshots/%s' %
                      (settings.MNT_PT, self.replica.pool,
                       self.replica.share))
        oldest_snap = get_oldest_snap(share_path, 3)
        if (oldest_snap is not None):
            msg = ('Failed to delete snapshot: %s. Aborting.' %
                   oldest_snap)
            with self._clean_exit_handler(msg):
                delete_snapshot(self.replica.share, oldest_snap, logger)

    msg = ('Failed to update final replica status for %s'
           '. Aborting.' % self.snap_id)
    with self._clean_exit_handler(msg):
        update_replica_status(self.rt2_id, data, logger)
def run(self):
    while True:
        try:
            self.rep_ip = self._replication_interface()
            self.uuid = self._my_uuid()
            break
        except:
            msg = ('Failed to get replication interface or uuid. '
                   'Aborting.')
            return logger.error(msg)

    ctx = zmq.Context()
    # fs diffs are sent via this publisher.
    rep_pub = ctx.socket(zmq.PUB)
    rep_pub.bind('tcp://%s:%d' % (self.rep_ip, self.data_port))

    # synchronization messages are received in this pull socket
    meta_pull = ctx.socket(zmq.PULL)
    meta_pull.RCVTIMEO = 100
    meta_pull.bind('tcp://%s:%d' % (self.rep_ip, self.meta_port))

    total_sleep = 0
    while True:
        if (os.getppid() != self.ppid):
            logger.error('Parent exited. Aborting.')
            break

        while (not self.pubq.empty()):
            msg = self.pubq.get()
            rep_pub.send(msg)

        # check for any recv's coming
        try:
            self.recv_meta = meta_pull.recv_json()
            snap_id = self.recv_meta['id']
            logger.debug('meta received: %s' % self.recv_meta)
            if (self.recv_meta['msg'] == 'begin'):
                logger.debug('begin received. meta: %s' % self.recv_meta)
                rw = Receiver(self.recv_meta, Queue())
                self.receivers[snap_id] = rw
                rw.start()
            elif (snap_id not in self.senders):
                logger.error('Unknown snap_id(%s) received. Ignoring'
                             % snap_id)
            else:
                self.senders[snap_id].q.put(self.recv_meta)
        except zmq.error.Again:
            pass

        self._prune_workers((self.receivers, self.senders))

        if (total_sleep >= 60 and len(self.senders) < 50):
            logger.debug('scanning for replicas')
            try:
                for r in Replica.objects.filter(enabled=True):
                    rt = ReplicaTrail.objects.filter(
                        replica=r).order_by('-snapshot_created')
                    now = datetime.utcnow().replace(second=0,
                                                    microsecond=0,
                                                    tzinfo=utc)
                    sw = None
                    snap_name = ('%s_replica_snap' % r.share)
                    if (len(rt) == 0):
                        snap_name = ('%s_1' % snap_name)
                        logger.debug('new sender for snap: %s' % snap_name)
                        sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                    snap_name, self.meta_port,
                                    self.data_port, r.meta_port, self.uuid)
                    elif (rt[0].status == 'succeeded'):
                        snap_name = ('%s_%d' % (snap_name, rt[0].id + 1))
                        if ((now - rt[0].end_ts).total_seconds() >
                                r.frequency):
                            logger.debug(
                                'incremental sender for snap: %s'
                                % snap_name)
                            sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                        snap_name, self.meta_port,
                                        self.data_port, r.meta_port,
                                        self.uuid, rt[0])
                        else:
                            logger.debug("it's not time yet for "
                                         'incremental sender for snap: '
                                         '%s' % snap_name)
                            continue
                    elif (rt[0].status == 'pending'):
                        prev_snap_id = ('%s_%s_%s_%s' %
                                        (self.rep_ip, r.pool, r.share,
                                         rt[0].snap_name))
                        if (prev_snap_id in self.senders):
                            logger.debug('send process ongoing for snap: '
                                         '%s' % snap_name)
                            continue
                        logger.debug('%s not found in senders. Previous '
                                     'sender must have Aborted. Marking '
                                     'it as failed' % prev_snap_id)
                        msg = ('Sender process Aborted. See logs for '
                               'more information')
                        data = {
                            'status': 'failed',
                            'end_ts': now,
                            'error': msg,
                            'send_failed': now,
                        }
                        update_replica_status(rt[0].id, data, logger)
                        continue
                    elif (rt[0].status == 'failed'):
                        snap_name = rt[0].snap_name
                        # if num_failed attempts > 10, disable the replica
                        num_tries = 0
                        MAX_ATTEMPTS = 10
                        for rto in rt:
                            if (rto.status != 'failed' or
                                    num_tries >= MAX_ATTEMPTS or
                                    rto.end_ts < r.ts):
                                break
                            num_tries = num_tries + 1
                        if (num_tries >= MAX_ATTEMPTS):
                            logger.info('Maximum attempts(%d) reached '
                                        'for snap: %s. Disabling the '
                                        'replica.' %
                                        (MAX_ATTEMPTS, snap_name))
                            disable_replica(r.id, logger)
                            continue
                        logger.info('previous backup failed for snap: '
                                    '%s. Starting a new one. Attempt '
                                    '%d/%d.' % (snap_name, num_tries,
                                                MAX_ATTEMPTS))
                        prev_rt = None
                        for rto in rt:
                            if (rto.status == 'succeeded'):
                                prev_rt = rto
                                break
                        sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                    snap_name, self.meta_port,
                                    self.data_port, r.meta_port, self.uuid,
                                    prev_rt)
                    else:
                        logger.error('unknown replica trail status: %s. '
                                     'ignoring snap: %s' %
                                     (rt[0].status, snap_name))
                        continue
                    snap_id = ('%s_%s_%s_%s' %
                               (self.rep_ip, r.pool, r.share, snap_name))
                    self.senders[snap_id] = sw
                    sw.daemon = True
                    sw.start()
                total_sleep = 0
            except DatabaseError, e:
                e_msg = ('Error getting the list of enabled replica '
                         'tasks. Moving on')
                logger.error(e_msg)
                logger.exception(e)
        time.sleep(1)
        total_sleep = total_sleep + 1
    prev_snap_id = ('%s_%s_%s_%s' %
                    (self.uuid, r.pool, r.share, rt[0].snap_name))
    if (prev_snap_id in self.senders):
        logger.debug('send process ongoing for snap: '
                     '%s' % prev_snap_id)
        continue
    logger.debug('%s not found in senders. Previous '
                 'sender must have Aborted. Marking '
                 'it as failed' % prev_snap_id)
    msg = ('Sender process Aborted. See logs for '
           'more information')
    data = {'status': 'failed',
            'end_ts': now.strftime(settings.SNAP_TS_FORMAT),
            'error': msg,
            'send_failed': now, }
    update_replica_status(rt[0].id, data, logger)
    continue
elif (rt[0].status == 'failed'):
    snap_name = rt[0].snap_name
    # if num_failed attempts > 10, disable the replica
    num_tries = 0
    for rto in rt:
        if (rto.status != 'failed' or
                num_tries >= self.MAX_ATTEMPTS or
                rto.end_ts < r.ts):
            break
        num_tries = num_tries + 1
    if (num_tries >= self.MAX_ATTEMPTS):
        logger.info('Maximum attempts(%d) reached '
                    'for snap: %s. Disabling the '
                    'replica.' %
def run(self):
    while True:
        try:
            self.rep_ip = self._replication_interface()
            self.uuid = self._my_uuid()
            break
        except:
            msg = "Failed to get replication interface or uuid. " "Aborting."
            return logger.error(msg)

    ctx = zmq.Context()
    # fs diffs are sent via this publisher.
    rep_pub = ctx.socket(zmq.PUB)
    rep_pub.bind("tcp://%s:%d" % (self.rep_ip, self.data_port))

    # synchronization messages are received in this pull socket
    meta_pull = ctx.socket(zmq.PULL)
    meta_pull.RCVTIMEO = 100
    meta_pull.bind("tcp://%s:%d" % (self.rep_ip, self.meta_port))

    total_sleep = 0
    while True:
        if os.getppid() != self.ppid:
            logger.error("Parent exited. Aborting.")
            break

        while not self.pubq.empty():
            msg = self.pubq.get()
            rep_pub.send(msg)

        # check for any recv's coming
        num_msgs = 0
        while num_msgs < 1000:
            try:
                self.recv_meta = meta_pull.recv_json()
                num_msgs = num_msgs + 1
                snap_id = self.recv_meta["id"]
                if self.recv_meta["msg"] == "begin":
                    rw = Receiver(self.recv_meta)
                    self.receivers[snap_id] = rw
                    rw.start()
                elif snap_id not in self.senders:
                    logger.error("Unknown snap_id(%s) received. Ignoring" % snap_id)
                else:
                    self.senders[snap_id].q.put(self.recv_meta)
            except zmq.error.Again:
                break

        self._prune_workers((self.receivers, self.senders))

        if int(time.time()) - self.prune_time > 3600:
            self.prune_time = int(time.time())
            for rs in ReplicaShare.objects.all():
                prune_receive_trail(rs.id, logger)
            for r in Replica.objects.all():
                prune_replica_trail(r.id, logger)

        if total_sleep >= 60 and len(self.senders) < 50:
            try:
                for r in get_replicas(logger):
                    rt = get_replica_trail(r.id, logger)
                    now = datetime.utcnow().replace(second=0, microsecond=0, tzinfo=utc)
                    sw = None
                    snap_name = "replication"
                    rt2 = ReplicaTrail.objects.filter().order_by("-id")
                    if len(rt2) != 0:
                        snap_name = "%s_%d" % (snap_name, rt2[0].id + 1)
                    else:
                        snap_name = "%s_1" % snap_name
                    snap_id = "%s_%s_%s_%s" % (self.uuid, r.pool, r.share, snap_name)
                    if len(rt) == 0:
                        logger.debug("new sender for snap: %s" % snap_id)
                        sw = Sender(
                            r,
                            self.rep_ip,
                            self.pubq,
                            Queue(),
                            snap_name,
                            self.meta_port,
                            self.data_port,
                            r.meta_port,
                            self.uuid,
                            snap_id,
                        )
                    elif rt[0].status == "succeeded":
                        if (now - rt[0].end_ts).total_seconds() > (r.frequency * 60):
                            logger.debug("incremental sender for snap: %s" % snap_id)
                            sw = Sender(
                                r,
                                self.rep_ip,
                                self.pubq,
                                Queue(),
                                snap_name,
                                self.meta_port,
                                self.data_port,
                                r.meta_port,
                                self.uuid,
                                snap_id,
                                rt[0],
                            )
                        else:
                            continue
                    elif rt[0].status == "pending":
                        prev_snap_id = "%s_%s_%s_%s" % (self.uuid, r.pool, r.share, rt[0].snap_name)
                        if prev_snap_id in self.senders:
                            logger.debug("send process ongoing for snap: " "%s" % snap_id)
                            continue
                        logger.debug(
                            "%s not found in senders. Previous "
                            "sender must have Aborted. Marking "
                            "it as failed" % prev_snap_id
                        )
                        msg = "Sender process Aborted. See logs for " "more information"
                        data = {
                            "status": "failed",
                            "end_ts": now.strftime(settings.SNAP_TS_FORMAT),
                            "error": msg,
                            "send_failed": now,
                        }
                        update_replica_status(rt[0].id, data, logger)
                        continue
                    elif rt[0].status == "failed":
                        snap_name = rt[0].snap_name
                        # if num_failed attempts > 10, disable the replica
                        num_tries = 0
                        for rto in rt:
                            if rto.status != "failed" or num_tries >= self.MAX_ATTEMPTS or rto.end_ts < r.ts:
                                break
                            num_tries = num_tries + 1
                        if num_tries >= self.MAX_ATTEMPTS:
                            logger.info(
                                "Maximum attempts(%d) reached "
                                "for snap: %s. Disabling the "
                                "replica." % (self.MAX_ATTEMPTS, snap_id)
                            )
                            disable_replica(r.id, logger)
                            continue
                        logger.info(
                            "previous backup failed for snap: "
                            "%s. Starting a new one. Attempt "
                            "%d/%d." % (snap_id, num_tries, self.MAX_ATTEMPTS)
                        )
                        prev_rt = None
                        for rto in rt:
                            if rto.status == "succeeded":
                                prev_rt = rto
                                break
                        sw = Sender(
                            r,
                            self.rep_ip,
                            self.pubq,
                            Queue(),
                            snap_name,
                            self.meta_port,
                            self.data_port,
                            r.meta_port,
                            self.uuid,
                            snap_id,
                            prev_rt,
                        )
                    else:
                        logger.error(
                            "unknown replica trail status: %s. "
                            "ignoring snap: %s" % (rt[0].status, snap_id)
                        )
                        continue
                    self.senders[snap_id] = sw
                    sw.daemon = True
                    sw.start()
                total_sleep = 0
            except DatabaseError, e:
                e_msg = "Error getting the list of enabled replica " "tasks. Moving on"
                logger.error(e_msg)
                logger.exception(e)
        time.sleep(1)
        total_sleep = total_sleep + 1