Example 1 (score: 0)
class Application(BaseApplication, Monitor):
    """The admin node application.

    Connects to the primary master, mirrors the cluster state and
    partition table for neoctl clients, and optionally monitors backup
    clusters, emailing notifications about problems via SMTP.
    """

    # Class-level import: exposes smtplib.SMTP as ``self.SMTP`` so a
    # test or subclass can substitute a fake SMTP class.
    from smtplib import SMTP

    @classmethod
    def _buildOptionParser(cls):
        """Declare the command-line options of the admin node."""
        _ = cls.option_parser
        _.description = "NEO Admin node"
        cls.addCommonServerOptions('admin', '127.0.0.1:9999')

        hint = ' (the option can be repeated)'
        _ = _.group('admin')
        _.float(
            'monitor-maxlag',
            default=float(Backup.max_lag),
            help='warn if a backup cluster is too late at replicating upstream'
        )
        _('monitor-email',
          multiple=True,
          help='recipient email for notifications' + hint)
        _('monitor-backup',
          multiple=True,
          help='name of backup cluster to monitor' + hint)
        _('smtp', metavar='HOST[:PORT]', help='SMTP for email notifications')
        _.bool('smtp-tls', help='use STARTTLS')
        _('smtp-auth', metavar='USER:PASS', help='SMTP credentials')
        _.int('i',
              'nid',
              help="specify an NID to use for this process (testing purpose)")

    def __init__(self, config):
        """Initialize the admin node from a parsed configuration mapping."""
        BaseApplication.__init__(self, config.get('ssl'),
                                 config.get('dynamic_master_list'))
        for address in config['masters']:
            self.nm.createMaster(address=address)

        self.name = config['cluster']
        self.server = config['bind']

        # One Backup monitor per watched backup cluster, keyed by name.
        self.backup_dict = backup_dict = {}
        max_lag = config.get('monitor_maxlag', Backup.max_lag)
        for x in config.get('monitor_backup', ()):
            backup_dict[x] = x = Backup()
            x.max_lag = max_lag
        self.email_list = config.get('monitor_email', ())
        if self.email_list:
            # SMTP settings only matter when there is at least one
            # notification recipient.
            self.smtp_host = config.get('smtp') or 'localhost'
            self.smtp_tls = config.get('smtp_tls')
            if 'smtp_auth' in config:
                user, pwd = config['smtp_auth'].split(':', 1)
                self.smtp_login = user, pwd
            else:
                self.smtp_login = None
            # Sender address: $EMAIL, else the system user name, else none.
            email_from = os.getenv('EMAIL')
            if not email_from:
                try:
                    email_from = getpass.getuser()
                except Exception:
                    email_from = None
            self.email_from = formataddr(("NEO " + self.name, email_from))
        self.smtp_exc = None    # text of the last SMTP failure, if any
        self.smtp_retry = INF   # no email retry scheduled before this time()
        self.notifying = set()  # monitors with a pending askLastIds round

        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
        # partitions.
        self.pt = None
        self.uuid = config.get('nid')
        logging.node(self.name, self.uuid)
        self.backup_handler = BackupHandler(self)
        self.master_event_handler = MasterEventHandler(self)
        self.upstream_admin_handler = UpstreamAdminHandler(self)
        self.cluster_state = None
        self.upstream_admin = self.upstream_admin_conn = None
        self.reset()
        registerLiveDebugger(on_log=self.log)

    def close(self):
        """Drop the listening connection and close the application."""
        self.listening_conn = None
        super(Application, self).close()

    def reset(self):
        """Reset per-connection state, kept between reconnections to the
        primary master."""
        Monitor.__init__(self)
        self.asking_monitor_information = []
        self.master_conn = None
        self.master_node = None

    def log(self):
        """Dump the event manager, node manager and partition table state."""
        self.em.log()
        self.nm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        """Run the main loop, logging pre-mortem data on any failure."""
        try:
            self._run()
        except Exception:
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop."""
        if len(self.name) == 0:
            raise RuntimeError, 'cluster name must be non-empty'

        # Make a listening port.
        handler = AdminEventHandler(self)
        self.listening_conn = ListeningConnection(self, handler, self.server)

        # Reconnect to the primary master on every PrimaryFailure until
        # the cluster is stopping.
        while self.cluster_state != ClusterStates.STOPPING:
            self.connectToPrimary()
            try:
                while True:
                    self.em.poll(1)
            except PrimaryFailure:
                logging.error('primary master is down')
        self.listening_conn.close()
        # Drain remaining events before exiting.
        while not self.em.isIdle():
            self.em.poll(1)

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.

        If a primary master node is not elected or ready, repeat
        the attempt of a connection periodically.

        Note that I do not accept any connection from non-master nodes
        at this stage.
        """
        self.cluster_state = None
        # search, find, connect and identify to the primary master
        bootstrap = BootstrapManager(self,
                                     NodeTypes.ADMIN,
                                     self.server,
                                     backup=list(self.backup_dict))
        self.master_node, self.master_conn = bootstrap.getPrimaryConnection()

        # passive handler
        self.master_conn.setHandler(self.master_event_handler)
        self.master_conn.ask(Packets.AskClusterState())

    def connectToUpstreamAdmin(self):
        """(Re)connect to the upstream admin node, retrying until a
        connection attempt is not immediately closed."""
        if self.listening_conn:  # if running
            self.upstream_admin_conn = None
            while True:
                conn = ClientConnection(self, self.upstream_admin_handler,
                                        self.upstream_admin)
                if not conn.isClosed():
                    break
            self.upstream_admin_conn = conn

    def partitionTableUpdated(self):
        """Recompute down-node count and cell-state summary from the
        partition table, and publish them as monitor information."""
        pt = self.pt
        if pt:
            down_set = set()
            pt_summary = Counter()
            for offset in xrange(pt.np):
                for cell in pt.getCellList(offset):
                    node = cell.getNode()
                    if not node.isRunning():
                        down_set.add(node)
                    pt_summary.update((cell.getState(), ))
            self.updateMonitorInformation(None,
                                          down=len(down_set),
                                          pt_summary=dict(pt_summary))

    def askMonitorInformation(self, conn):
        """Queue a neoctl request for monitor information; start a
        notification round unless one is already pending."""
        asking = self.asking_monitor_information or self.notifying
        self.asking_monitor_information.append((conn, conn.getPeerId()))
        if not asking:
            self._notify(self.operational)

    def updateMonitorInformation(self, name, **kw):
        """Update attributes of a monitor (self if name is None, else the
        named backup) and propagate the change if anything differs."""
        monitor = self if name is None else self.backup_dict[name]
        # Keep only the attributes that actually changed.
        kw = {k: v for k, v in kw.iteritems() if v != getattr(monitor, k)}
        if not kw:
            return
        monitor.monitor_changed = True
        monitor.__dict__.update(kw)
        # Forward local changes to the upstream admin, if connected.
        if name is None and self.upstream_admin_conn:
            self.upstream_admin_conn.send(Packets.NotifyMonitorInformation(kw))
        if not self.notifying:
            self.em.setTimeout(None, None)
            self._notify(self.operational)

    def _notify(self, ask_ids=True):
        """Gather monitor status and deliver it: answer queued neoctl
        requests and, when configured, send an email notification.

        When *ask_ids* is true, first query last ids from the master and
        every operational backup; the method is then re-entered (via
        maybeNotify) once all answers arrived.
        """
        if ask_ids:
            self.askLastIds(self.master_conn)
            # None stands for the local cluster in the notifying set.
            self.notifying = notifying = {None}
            for name, monitor in self.backup_dict.iteritems():
                if monitor.operational:
                    monitor.askLastIds(monitor.conn)
                    notifying.add(name)
        # Wait for all answers; also bail out while connected but the
        # cluster state is still unknown (chained comparison: state is
        # None and master_conn is not None).
        if self.notifying or self.cluster_state is None is not self.master_conn:
            return
        # severity[0]: ok, severity[1]: warning, severity[2]: problem.
        severity = [], [], []
        my_severity = self.severity
        severity[my_severity].append(self.name)
        changed = set()
        if self.monitor_changed:
            self.monitor_changed = False
            changed.add(self.name)
        if self.master_conn is None:
            body = NOT_CONNECTED_MESSAGE
        else:
            upstream, body = self.formatSummary()
            body = [body]
            for name, backup in self.backup_dict.iteritems():
                body += '', name, '    ' + backup.formatSummary(upstream)[1]
                # A lagging backup counts as a warning (index 1).
                severity[backup.severity or backup.lagging].append(name)
                if backup.monitor_changed:
                    backup.monitor_changed = False
                    changed.add(name)
            body = '\n'.join(body)
        # Email when something changed, or when a previous send failed
        # and the retry delay elapsed.
        if changed or self.smtp_retry < time():
            logging.debug('monitor notification')
            email_list = self.email_list
            while email_list:  # not a loop: a breakable block, skipped
                               # entirely when no recipient is configured
                msg = MIMEText(body + (self.smtp_exc or ''))
                msg['Date'] = formatdate()
                # Compute the subject: OK / WARNING / PROBLEM, optionally
                # listing the changed clusters at that severity.
                clusters, x = severity[1:]
                while 1:
                    if x:
                        clusters = clusters + x
                        x = 'PROBLEM'
                    elif clusters:
                        x = 'WARNING'
                    else:
                        x = 'OK'
                        break
                    clusters = changed.intersection(clusters)
                    if clusters:
                        x += ' (%s)' % ', '.join(sorted(clusters))
                    break
                msg['Subject'] = 'NEO monitoring: ' + x
                msg['From'] = self.email_from
                msg['To'] = ', '.join(email_list)
                s = self.SMTP()
                try:
                    s.connect(self.smtp_host)
                    if self.smtp_tls:
                        s.starttls()
                    if self.smtp_login:
                        s.login(*self.smtp_login)
                    s.sendmail(None, email_list, msg.as_string())
                except Exception:
                    x = format_exc()
                    logging.error(x)
                    # Remember the failure so it is appended to the next
                    # report; keep the oldest one while nothing changes.
                    if changed or not self.smtp_exc:
                        self.smtp_exc = (
                            "\n\nA notification could not be sent at %s:\n\n%s"
                            % (msg['Date'], x))
                    retry = self.smtp_retry = time() + 600
                else:
                    self.smtp_exc = None
                    self.smtp_retry = INF
                    # No periodic re-notification when neither the local
                    # cluster nor any backup is operational.
                    if not (self.operational and any(
                            monitor.operational
                            for monitor in self.backup_dict.itervalues())):
                        break
                    retry = time() + 600
                finally:
                    s.close()
                self.em.setTimeout(retry, self._notify)
                break
        # Answer all queued neoctl requests.
        neoctl = self.asking_monitor_information
        if neoctl:
            # Drop the local cluster name from its severity list; it is
            # reinserted below as a leading None marker.
            del severity[my_severity][0]
            if self.smtp_exc:
                my_severity = 2
                body += self.smtp_exc
            severity[1].sort()
            severity[2].sort()
            severity[my_severity].insert(0, None)
            p = Packets.AnswerMonitorInformation(severity[1], severity[2],
                                                 body)
            for conn, msg_id in neoctl:
                try:
                    conn.send(p, msg_id)
                except ConnectionClosed:
                    pass
            del self.asking_monitor_information[:]

    def maybeNotify(self, name):
        """Mark *name*'s askLastIds answer as received; once the notifying
        set is empty, resume _notify without re-asking ids."""
        try:
            self.notifying.remove(name)
        except KeyError:
            return
        self._notify(False)

    def sendPartitionTable(self, conn, min_offset, max_offset, uuid):
        """Answer a partition-table query for [min_offset, max_offset)
        (max_offset == 0 means up to the last partition)."""
        pt = self.pt
        if max_offset == 0:
            max_offset = pt.getPartitions()
        try:
            row_list = map(pt.getRow, xrange(min_offset, max_offset))
        except IndexError:
            conn.send(Errors.ProtocolError('invalid partition table offset'))
        else:
            conn.answer(
                Packets.AnswerPartitionList(pt.getID(), pt.getReplicas(),
                                            row_list))
Example 2 (score: 0) — File: app.py, Project: pyzh/neoppod
class Application(BaseApplication):
    """The admin node application.

    Connects to the primary master, mirrors the cluster state and
    partition table, and answers queries from neoctl clients.
    """

    def __init__(self, config):
        """Initialize the admin node from the configuration object."""
        super(Application, self).__init__(
            config.getSSL(), config.getDynamicMasterList())
        for address in config.getMasters():
            self.nm.createMaster(address=address)

        self.name = config.getCluster()
        self.server = config.getBind()

        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
        # partitions.
        self.pt = None
        self.uuid = config.getUUID()
        self.request_handler = MasterRequestEventHandler(self)
        self.master_event_handler = MasterEventHandler(self)
        self.cluster_state = None
        self.reset()
        registerLiveDebugger(on_log=self.log)

    def close(self):
        """Drop the listening connection and close the application."""
        self.listening_conn = None
        super(Application, self).close()

    def reset(self):
        """Reset per-connection state, kept between reconnections to the
        primary master."""
        self.bootstrapped = False
        self.master_conn = None
        self.master_node = None

    def log(self):
        """Dump the event manager, node manager and partition table state."""
        self.em.log()
        self.nm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        """Run the main loop, logging pre-mortem data on any failure."""
        try:
            self._run()
        except Exception:
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop."""
        # Idiomatic emptiness test; raise via the call form, which is
        # valid in both Python 2 and 3 (the old statement form is not).
        if not self.name:
            raise RuntimeError('cluster name must be non-empty')

        # Make a listening port.
        handler = AdminEventHandler(self)
        self.listening_conn = ListeningConnection(self, handler, self.server)

        # Reconnect to the primary master on every PrimaryFailure until
        # the cluster is stopping.
        while self.cluster_state != ClusterStates.STOPPING:
            self.connectToPrimary()
            try:
                while True:
                    self.em.poll(1)
            except PrimaryFailure:
                logging.error('primary master is down')
        self.listening_conn.close()
        # Drain remaining events before exiting.
        while not self.em.isIdle():
            self.em.poll(1)

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.

        If a primary master node is not elected or ready, repeat
        the attempt of a connection periodically.

        Note that I do not accept any connection from non-master nodes
        at this stage.
        """
        self.cluster_state = None
        # search, find, connect and identify to the primary master
        bootstrap = BootstrapManager(self, NodeTypes.ADMIN, self.server)
        self.master_node, self.master_conn, num_partitions, num_replicas = \
            bootstrap.getPrimaryConnection()

        if self.pt is None:
            self.pt = PartitionTable(num_partitions, num_replicas)
        elif self.pt.getPartitions() != num_partitions:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of partitions is inconsistent')
        elif self.pt.getReplicas() != num_replicas:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of replicas is inconsistent')

        # passive handler
        self.master_conn.setHandler(self.master_event_handler)
        self.master_conn.ask(Packets.AskClusterState())
        self.master_conn.ask(Packets.AskPartitionTable())

    def sendPartitionTable(self, conn, min_offset, max_offset, uuid):
        """Answer a partition-table query for [min_offset, max_offset)
        (max_offset == 0 means up to the last partition), optionally
        filtered by node *uuid*."""
        # we have a pt
        self.pt.log()
        row_list = []
        if max_offset == 0:
            max_offset = self.pt.getPartitions()
        try:
            for offset in xrange(min_offset, max_offset):
                row = []
                try:
                    for cell in self.pt.getCellList(offset):
                        if uuid is None or cell.getUUID() == uuid:
                            row.append((cell.getUUID(), cell.getState()))
                except TypeError:
                    # Deliberate best-effort: a row may not be filled yet.
                    # NOTE(review): presumably getCellList returns None in
                    # that case — confirm before tightening this.
                    pass
                row_list.append((offset, row))
        except IndexError:
            conn.send(Errors.ProtocolError('invalid partition table offset'))
        else:
            conn.answer(Packets.AnswerPartitionList(self.pt.getID(), row_list))
Example 3 (score: 0)
class Application(BaseApplication):
    """The admin node application.

    Connects to the primary master, mirrors the cluster state and
    partition table, and answers queries from neoctl clients.
    """

    def __init__(self, config):
        """Initialize the admin node from the configuration object."""
        super(Application, self).__init__(
            config.getSSL(), config.getDynamicMasterList())
        for address in config.getMasters():
            self.nm.createMaster(address=address)

        self.name = config.getCluster()
        self.server = config.getBind()

        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
        # partitions.
        self.pt = None
        self.uuid = config.getUUID()
        self.request_handler = MasterRequestEventHandler(self)
        self.master_event_handler = MasterEventHandler(self)
        self.cluster_state = None
        self.reset()
        registerLiveDebugger(on_log=self.log)

    def close(self):
        """Drop the listening connection and close the application."""
        self.listening_conn = None
        super(Application, self).close()

    def reset(self):
        """Reset per-connection state, kept between reconnections to the
        primary master."""
        self.bootstrapped = False
        self.master_conn = None
        self.master_node = None

    def log(self):
        """Dump the event manager, node manager and partition table state."""
        self.em.log()
        self.nm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        """Run the main loop, logging pre-mortem data on any failure."""
        try:
            self._run()
        except Exception:
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop."""
        # Idiomatic emptiness test; raise via the call form, which is
        # valid in both Python 2 and 3 (the old statement form is not).
        if not self.name:
            raise RuntimeError('cluster name must be non-empty')

        # Make a listening port.
        handler = AdminEventHandler(self)
        self.listening_conn = ListeningConnection(self, handler, self.server)

        # Reconnect to the primary master on every PrimaryFailure until
        # the cluster is stopping.
        while self.cluster_state != ClusterStates.STOPPING:
            self.connectToPrimary()
            try:
                while True:
                    self.em.poll(1)
            except PrimaryFailure:
                logging.error('primary master is down')
        self.listening_conn.close()
        # Drain remaining events before exiting.
        while not self.em.isIdle():
            self.em.poll(1)

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.

        If a primary master node is not elected or ready, repeat
        the attempt of a connection periodically.

        Note that I do not accept any connection from non-master nodes
        at this stage.
        """
        self.cluster_state = None
        # search, find, connect and identify to the primary master
        bootstrap = BootstrapManager(self, self.name, NodeTypes.ADMIN,
                self.uuid, self.server)
        data = bootstrap.getPrimaryConnection()
        (node, conn, uuid, num_partitions, num_replicas) = data
        self.master_node = node
        self.master_conn = conn
        self.uuid = uuid

        if self.pt is None:
            self.pt = PartitionTable(num_partitions, num_replicas)
        elif self.pt.getPartitions() != num_partitions:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of partitions is inconsistent')
        elif self.pt.getReplicas() != num_replicas:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of replicas is inconsistent')

        # passive handler
        self.master_conn.setHandler(self.master_event_handler)
        self.master_conn.ask(Packets.AskClusterState())
        self.master_conn.ask(Packets.AskNodeInformation())
        self.master_conn.ask(Packets.AskPartitionTable())

    def sendPartitionTable(self, conn, min_offset, max_offset, uuid):
        """Answer a partition-table query for [min_offset, max_offset)
        (max_offset == 0 means up to the last partition), optionally
        filtered by node *uuid*."""
        # we have a pt
        self.pt.log()
        row_list = []
        if max_offset == 0:
            max_offset = self.pt.getPartitions()
        try:
            for offset in xrange(min_offset, max_offset):
                row = []
                try:
                    for cell in self.pt.getCellList(offset):
                        if uuid is None or cell.getUUID() == uuid:
                            row.append((cell.getUUID(), cell.getState()))
                except TypeError:
                    # Deliberate best-effort: a row may not be filled yet.
                    # NOTE(review): presumably getCellList returns None in
                    # that case — confirm before tightening this.
                    pass
                row_list.append((offset, row))
        except IndexError:
            conn.notify(Errors.ProtocolError('invalid partition table offset'))
        else:
            conn.answer(Packets.AnswerPartitionList(self.pt.getID(), row_list))