Esempio n. 1
0
    def run(self):
        while True:
            try:
                self.rep_ip = self._replication_interface()
                self.uuid = self._my_uuid()
                break
            except:
                msg = ('Failed to get replication interface or uuid. '
                       'Aborting.')
                return logger.error(msg)

        ctx = zmq.Context()
        #  fs diffs are sent via this publisher.
        rep_pub = ctx.socket(zmq.PUB)
        rep_pub.bind('tcp://%s:%d' % (self.rep_ip, self.data_port))

        #  synchronization messages are received in this pull socket
        meta_pull = ctx.socket(zmq.PULL)
        meta_pull.RCVTIMEO = 100
        meta_pull.bind('tcp://%s:%d' % (self.rep_ip, self.meta_port))

        total_sleep = 0
        while True:
            if (os.getppid() != self.ppid):
                logger.error('Parent exited. Aborting.')
                break

            while(not self.pubq.empty()):
                msg = self.pubq.get()
                rep_pub.send(msg)

            #  check for any recv's coming
            num_msgs = 0
            while (num_msgs < 1000):
                try:
                    self.recv_meta = meta_pull.recv_json()
                    num_msgs = num_msgs + 1
                    snap_id = self.recv_meta['id']
                    if (self.recv_meta['msg'] == 'begin'):
                        rw = Receiver(self.recv_meta)
                        self.receivers[snap_id] = rw
                        rw.start()
                    elif (snap_id not in self.senders):
                        logger.error('Unknown snap_id(%s) received. Ignoring'
                                     % snap_id)
                    else:
                        self.senders[snap_id].q.put(self.recv_meta)
                except zmq.error.Again:
                    break

            self._prune_workers((self.receivers, self.senders))

            if (int(time.time()) - self.prune_time > 3600):
                self.prune_time = int(time.time())
                for rs in ReplicaShare.objects.all():
                    prune_receive_trail(rs.id, logger)
                for r in Replica.objects.all():
                    prune_replica_trail(r.id, logger)

            if (total_sleep >= 60 and len(self.senders) < 50):

                try:
                    for r in get_replicas(logger):
                        rt = get_replica_trail(r.id, logger)
                        now = datetime.utcnow().replace(second=0,
                                                        microsecond=0,
                                                        tzinfo=utc)
                        sw = None
                        snap_name = 'replication'
                        rt2 = ReplicaTrail.objects.filter().order_by('-id')
                        if (len(rt2) != 0):
                            snap_name = ('%s_%d' % (snap_name, rt2[0].id + 1))
                        else:
                            snap_name = ('%s_1' % snap_name)
                        snap_id = ('%s_%s_%s_%s' %
                                   (self.uuid, r.pool, r.share, snap_name))
                        if (len(rt) == 0):
                            logger.debug('new sender for snap: %s' % snap_id)
                            sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                        snap_name, self.meta_port,
                                        self.data_port, r.meta_port, self.uuid,
                                        snap_id)
                        elif (rt[0].status == 'succeeded'):
                            if (((now - rt[0].end_ts).total_seconds() >
                                 (r.frequency * 60))):
                                logger.debug('incremental sender for snap: %s'
                                             % snap_id)
                                sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                            snap_name, self.meta_port,
                                            self.data_port, r.meta_port,
                                            self.uuid, snap_id, rt[0])
                            else:
                                continue
                        elif (rt[0].status == 'pending'):
                            prev_snap_id = ('%s_%s_%s_%s' % (self.uuid,
                                            r.pool, r.share, rt[0].snap_name))
                            if (prev_snap_id in self.senders):
                                logger.debug('send process ongoing for snap: '
                                             '%s' % snap_id)
                                continue
                            logger.debug('%s not found in senders. Previous '
                                         'sender must have Aborted. Marking '
                                         'it as failed' % prev_snap_id)
                            msg = ('Sender process Aborted. See logs for '
                                   'more information')
                            data = {'status': 'failed',
                                    'end_ts': now.strftime(settings.SNAP_TS_FORMAT),
                                    'error': msg,
                                    'send_failed': now, }
                            update_replica_status(rt[0].id, data, logger)
                            continue
                        elif (rt[0].status == 'failed'):
                            snap_name = rt[0].snap_name
                            #  if num_failed attempts > 10, disable the replica
                            num_tries = 0
                            for rto in rt:
                                if (rto.status != 'failed' or
                                    num_tries >= self.MAX_ATTEMPTS or
                                    rto.end_ts < r.ts):
                                    break
                                num_tries = num_tries + 1
                            if (num_tries >= self.MAX_ATTEMPTS):
                                logger.info('Maximum attempts(%d) reached '
                                            'for snap: %s. Disabling the '
                                            'replica.' %
                                            (self.MAX_ATTEMPTS, snap_id))
                                disable_replica(r.id, logger)
                                continue
                            logger.info('previous backup failed for snap: '
                                        '%s. Starting a new one. Attempt '
                                        '%d/%d.' % (snap_id, num_tries,
                                                    self.MAX_ATTEMPTS))
                            prev_rt = None
                            for rto in rt:
                                if (rto.status == 'succeeded'):
                                    prev_rt = rto
                                    break
                            sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                        snap_name, self.meta_port,
                                        self.data_port, r.meta_port,
                                        self.uuid, snap_id, prev_rt)
                        else:
                            logger.error('unknown replica trail status: %s. '
                                         'ignoring snap: %s' %
                                         (rt[0].status, snap_id))
                            continue
                        self.senders[snap_id] = sw
                        sw.daemon = True
                        sw.start()
                    total_sleep = 0
                except DatabaseError, e:
                    e_msg = ('Error getting the list of enabled replica '
                             'tasks. Moving on')
                    logger.error(e_msg)
                    logger.exception(e)

            time.sleep(1)
            total_sleep = total_sleep + 1
Esempio n. 2
0
    def run(self):
        while True:
            try:
                self.rep_ip = self._replication_interface()
                self.uuid = self._my_uuid()
                break
            except:
                msg = ('Failed to get replication interface or uuid. '
                       'Aborting.')
                return logger.error(msg)

        ctx = zmq.Context()
        #  fs diffs are sent via this publisher.
        rep_pub = ctx.socket(zmq.PUB)
        rep_pub.bind('tcp://%s:%d' % (self.rep_ip, self.data_port))

        #  synchronization messages are received in this pull socket
        meta_pull = ctx.socket(zmq.PULL)
        meta_pull.RCVTIMEO = 100
        meta_pull.bind('tcp://%s:%d' % (self.rep_ip, self.meta_port))

        total_sleep = 0
        while True:
            if (os.getppid() != self.ppid):
                logger.error('Parent exited. Aborting.')
                break

            while (not self.pubq.empty()):
                msg = self.pubq.get()
                rep_pub.send(msg)

            #  check for any recv's coming
            num_msgs = 0
            while (num_msgs < 1000):
                try:
                    self.recv_meta = meta_pull.recv_json()
                    num_msgs = num_msgs + 1
                    snap_id = self.recv_meta['id']
                    if (self.recv_meta['msg'] == 'begin'):
                        rw = Receiver(self.recv_meta)
                        self.receivers[snap_id] = rw
                        rw.start()
                    elif (snap_id not in self.senders):
                        logger.error('Unknown snap_id(%s) received. Ignoring' %
                                     snap_id)
                    else:
                        self.senders[snap_id].q.put(self.recv_meta)
                except zmq.error.Again:
                    break

            self._prune_workers((self.receivers, self.senders))

            if (int(time.time()) - self.prune_time > 3600):
                self.prune_time = int(time.time())
                for rs in ReplicaShare.objects.all():
                    prune_receive_trail(rs.id, logger)
                for r in Replica.objects.all():
                    prune_replica_trail(r.id, logger)

            if (total_sleep >= 60 and len(self.senders) < 50):

                try:
                    for r in get_replicas(logger):
                        rt = get_replica_trail(r.id, logger)
                        now = datetime.utcnow().replace(second=0,
                                                        microsecond=0,
                                                        tzinfo=utc)
                        sw = None
                        snap_name = 'replication'
                        rt2 = ReplicaTrail.objects.filter().order_by('-id')
                        if (len(rt2) != 0):
                            snap_name = ('%s_%d' % (snap_name, rt2[0].id + 1))
                        else:
                            snap_name = ('%s_1' % snap_name)
                        snap_id = ('%s_%s_%s_%s' %
                                   (self.uuid, r.pool, r.share, snap_name))
                        if (len(rt) == 0):
                            logger.debug('new sender for snap: %s' % snap_id)
                            sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                        snap_name, self.meta_port,
                                        self.data_port, r.meta_port, self.uuid,
                                        snap_id)
                        elif (rt[0].status == 'succeeded'):
                            if (((now - rt[0].end_ts).total_seconds() >
                                 (r.frequency * 60))):
                                logger.debug(
                                    'incremental sender for snap: %s' %
                                    snap_id)
                                sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                            snap_name, self.meta_port,
                                            self.data_port, r.meta_port,
                                            self.uuid, snap_id, rt[0])
                            else:
                                continue
                        elif (rt[0].status == 'pending'):
                            prev_snap_id = (
                                '%s_%s_%s_%s' %
                                (self.uuid, r.pool, r.share, rt[0].snap_name))
                            if (prev_snap_id in self.senders):
                                logger.debug('send process ongoing for snap: '
                                             '%s' % snap_id)
                                continue
                            logger.debug('%s not found in senders. Previous '
                                         'sender must have Aborted. Marking '
                                         'it as failed' % prev_snap_id)
                            msg = ('Sender process Aborted. See logs for '
                                   'more information')
                            data = {
                                'status': 'failed',
                                'end_ts':
                                now.strftime(settings.SNAP_TS_FORMAT),
                                'error': msg,
                                'send_failed': now,
                            }
                            update_replica_status(rt[0].id, data, logger)
                            continue
                        elif (rt[0].status == 'failed'):
                            snap_name = rt[0].snap_name
                            #  if num_failed attempts > 10, disable the replica
                            num_tries = 0
                            for rto in rt:
                                if (rto.status != 'failed'
                                        or num_tries >= self.MAX_ATTEMPTS
                                        or rto.end_ts < r.ts):
                                    break
                                num_tries = num_tries + 1
                            if (num_tries >= self.MAX_ATTEMPTS):
                                logger.info('Maximum attempts(%d) reached '
                                            'for snap: %s. Disabling the '
                                            'replica.' %
                                            (self.MAX_ATTEMPTS, snap_id))
                                disable_replica(r.id, logger)
                                continue
                            logger.info(
                                'previous backup failed for snap: '
                                '%s. Starting a new one. Attempt '
                                '%d/%d.' %
                                (snap_id, num_tries, self.MAX_ATTEMPTS))
                            prev_rt = None
                            for rto in rt:
                                if (rto.status == 'succeeded'):
                                    prev_rt = rto
                                    break
                            sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                        snap_name, self.meta_port,
                                        self.data_port, r.meta_port, self.uuid,
                                        snap_id, prev_rt)
                        else:
                            logger.error('unknown replica trail status: %s. '
                                         'ignoring snap: %s' %
                                         (rt[0].status, snap_id))
                            continue
                        self.senders[snap_id] = sw
                        sw.daemon = True
                        sw.start()
                    total_sleep = 0
                except DatabaseError, e:
                    e_msg = ('Error getting the list of enabled replica '
                             'tasks. Moving on')
                    logger.error(e_msg)
                    logger.exception(e)

            time.sleep(1)
            total_sleep = total_sleep + 1
Esempio n. 3
0
                    else:
                        self.senders[snap_id].q.put(self.recv_meta)
                except zmq.error.Again:
                    #recv_json throws this exception if nothing is received
                    #for 100 milliseconds. Break, do other stuff and come back here.
                    break

            self._prune_workers((self.receivers, self.senders))

            if (int(time.time()) - self.prune_time > self.trail_prune_interval):
                #trail objects keep accumulating and may grow to be quite large.
                #so once every trail_prune_interval seconds, deleted the old ones
                #by default the prune helper methods delete records older than 7 days
                self.prune_time = int(time.time()) #reset
                for rs in ReplicaShare.objects.all():
                    prune_receive_trail(rs.id, logger)
                for r in Replica.objects.all():
                    prune_replica_trail(r.id, logger)

            #seconds spent processing messages at once. should be counted as
            #part of sleep time so new senders are not stalled.
            total_sleep += int(time.time() - t0)
            if (total_sleep >= self.sender_check_interval and
                len(self.senders) < self.max_senders):
                total_sleep = 0 #reset
                #check to see if we can start any new senders.
                try:
                    for r in get_replicas(logger):
                        rt = get_replica_trail(r.id, logger)
                        now = datetime.utcnow().replace(second=0,
                                                        microsecond=0,
Esempio n. 4
0
    def run(self):
        while True:
            try:
                self.rep_ip = self._replication_interface()
                self.uuid = self._my_uuid()
                break
            except:
                msg = "Failed to get replication interface or uuid. " "Aborting."
                return logger.error(msg)

        ctx = zmq.Context()
        #  fs diffs are sent via this publisher.
        rep_pub = ctx.socket(zmq.PUB)
        rep_pub.bind("tcp://%s:%d" % (self.rep_ip, self.data_port))

        #  synchronization messages are received in this pull socket
        meta_pull = ctx.socket(zmq.PULL)
        meta_pull.RCVTIMEO = 100
        meta_pull.bind("tcp://%s:%d" % (self.rep_ip, self.meta_port))

        total_sleep = 0
        while True:
            if os.getppid() != self.ppid:
                logger.error("Parent exited. Aborting.")
                break

            while not self.pubq.empty():
                msg = self.pubq.get()
                rep_pub.send(msg)

            #  check for any recv's coming
            num_msgs = 0
            while num_msgs < 1000:
                try:
                    self.recv_meta = meta_pull.recv_json()
                    num_msgs = num_msgs + 1
                    snap_id = self.recv_meta["id"]
                    if self.recv_meta["msg"] == "begin":
                        rw = Receiver(self.recv_meta)
                        self.receivers[snap_id] = rw
                        rw.start()
                    elif snap_id not in self.senders:
                        logger.error("Unknown snap_id(%s) received. Ignoring" % snap_id)
                    else:
                        self.senders[snap_id].q.put(self.recv_meta)
                except zmq.error.Again:
                    break

            self._prune_workers((self.receivers, self.senders))

            if int(time.time()) - self.prune_time > 3600:
                self.prune_time = int(time.time())
                for rs in ReplicaShare.objects.all():
                    prune_receive_trail(rs.id, logger)
                for r in Replica.objects.all():
                    prune_replica_trail(r.id, logger)

            if total_sleep >= 60 and len(self.senders) < 50:

                try:
                    for r in get_replicas(logger):
                        rt = get_replica_trail(r.id, logger)
                        now = datetime.utcnow().replace(second=0, microsecond=0, tzinfo=utc)
                        sw = None
                        snap_name = "replication"
                        rt2 = ReplicaTrail.objects.filter().order_by("-id")
                        if len(rt2) != 0:
                            snap_name = "%s_%d" % (snap_name, rt2[0].id + 1)
                        else:
                            snap_name = "%s_1" % snap_name
                        snap_id = "%s_%s_%s_%s" % (self.uuid, r.pool, r.share, snap_name)
                        if len(rt) == 0:
                            logger.debug("new sender for snap: %s" % snap_id)
                            sw = Sender(
                                r,
                                self.rep_ip,
                                self.pubq,
                                Queue(),
                                snap_name,
                                self.meta_port,
                                self.data_port,
                                r.meta_port,
                                self.uuid,
                                snap_id,
                            )
                        elif rt[0].status == "succeeded":
                            if (now - rt[0].end_ts).total_seconds() > (r.frequency * 60):
                                logger.debug("incremental sender for snap: %s" % snap_id)
                                sw = Sender(
                                    r,
                                    self.rep_ip,
                                    self.pubq,
                                    Queue(),
                                    snap_name,
                                    self.meta_port,
                                    self.data_port,
                                    r.meta_port,
                                    self.uuid,
                                    snap_id,
                                    rt[0],
                                )
                            else:
                                continue
                        elif rt[0].status == "pending":
                            prev_snap_id = "%s_%s_%s_%s" % (self.uuid, r.pool, r.share, rt[0].snap_name)
                            if prev_snap_id in self.senders:
                                logger.debug("send process ongoing for snap: " "%s" % snap_id)
                                continue
                            logger.debug(
                                "%s not found in senders. Previous "
                                "sender must have Aborted. Marking "
                                "it as failed" % prev_snap_id
                            )
                            msg = "Sender process Aborted. See logs for " "more information"
                            data = {
                                "status": "failed",
                                "end_ts": now.strftime(settings.SNAP_TS_FORMAT),
                                "error": msg,
                                "send_failed": now,
                            }
                            update_replica_status(rt[0].id, data, logger)
                            continue
                        elif rt[0].status == "failed":
                            snap_name = rt[0].snap_name
                            #  if num_failed attempts > 10, disable the replica
                            num_tries = 0
                            for rto in rt:
                                if rto.status != "failed" or num_tries >= self.MAX_ATTEMPTS or rto.end_ts < r.ts:
                                    break
                                num_tries = num_tries + 1
                            if num_tries >= self.MAX_ATTEMPTS:
                                logger.info(
                                    "Maximum attempts(%d) reached "
                                    "for snap: %s. Disabling the "
                                    "replica." % (self.MAX_ATTEMPTS, snap_id)
                                )
                                disable_replica(r.id, logger)
                                continue
                            logger.info(
                                "previous backup failed for snap: "
                                "%s. Starting a new one. Attempt "
                                "%d/%d." % (snap_id, num_tries, self.MAX_ATTEMPTS)
                            )
                            prev_rt = None
                            for rto in rt:
                                if rto.status == "succeeded":
                                    prev_rt = rto
                                    break
                            sw = Sender(
                                r,
                                self.rep_ip,
                                self.pubq,
                                Queue(),
                                snap_name,
                                self.meta_port,
                                self.data_port,
                                r.meta_port,
                                self.uuid,
                                snap_id,
                                prev_rt,
                            )
                        else:
                            logger.error(
                                "unknown replica trail status: %s. " "ignoring snap: %s" % (rt[0].status, snap_id)
                            )
                            continue
                        self.senders[snap_id] = sw
                        sw.daemon = True
                        sw.start()
                    total_sleep = 0
                except DatabaseError, e:
                    e_msg = "Error getting the list of enabled replica " "tasks. Moving on"
                    logger.error(e_msg)
                    logger.exception(e)

            time.sleep(1)
            total_sleep = total_sleep + 1