class Application(BaseApplication, Monitor):
    """The admin node application.

    Accepts neoctl connections, follows the primary master of its own
    cluster, and (optionally) monitors a set of backup clusters, sending
    email notifications when the monitored state changes.
    """

    # Imported as a class attribute so that self.SMTP() is used below;
    # presumably this lets tests substitute a fake SMTP class — TODO confirm.
    from smtplib import SMTP

    @classmethod
    def _buildOptionParser(cls):
        """Register the command-line options of the admin node."""
        _ = cls.option_parser
        _.description = "NEO Admin node"
        cls.addCommonServerOptions('admin', '127.0.0.1:9999')

        hint = ' (the option can be repeated)'
        _ = _.group('admin')
        _.float('monitor-maxlag', default=float(Backup.max_lag),
            help='warn if a backup cluster is too late at replicating upstream')
        _('monitor-email', multiple=True,
            help='recipient email for notifications' + hint)
        _('monitor-backup', multiple=True,
            help='name of backup cluster to monitor' + hint)
        _('smtp', metavar='HOST[:PORT]',
            help='SMTP for email notifications')
        _.bool('smtp-tls',
            help='use STARTTLS')
        _('smtp-auth', metavar='USER:PASS',
            help='SMTP credentials')
        _.int('i', 'nid',
            help="specify an NID to use for this process (testing purpose)")

    def __init__(self, config):
        # 'config' behaves like a dict produced by the option parser above.
        BaseApplication.__init__(self,
            config.get('ssl'),
            config.get('dynamic_master_list'))
        for address in config['masters']:
            self.nm.createMaster(address=address)
        self.name = config['cluster']
        self.server = config['bind']

        # Map each monitored backup cluster name to a Backup state tracker.
        # Note that 'x' is rebound from the name to the new Backup object.
        self.backup_dict = backup_dict = {}
        max_lag = config.get('monitor_maxlag', Backup.max_lag)
        for x in config.get('monitor_backup', ()):
            backup_dict[x] = x = Backup()
            x.max_lag = max_lag

        # Email notification setup; SMTP parameters are only looked at
        # when at least one recipient is configured.
        self.email_list = config.get('monitor_email', ())
        if self.email_list:
            self.smtp_host = config.get('smtp') or 'localhost'
            self.smtp_tls = config.get('smtp_tls')
            if 'smtp_auth' in config:
                user, pwd = config['smtp_auth'].split(':', 1)
                self.smtp_login = user, pwd
            else:
                self.smtp_login = None
            # Sender address: $EMAIL, else the login name, else none.
            email_from = os.getenv('EMAIL')
            if not email_from:
                try:
                    email_from = getpass.getuser()
                except Exception:
                    email_from = None
            self.email_from = formataddr(("NEO " + self.name, email_from))
        # Last SMTP failure (text appended to notifications) and the time
        # of the next send retry; INF means no retry scheduled.
        self.smtp_exc = None
        self.smtp_retry = INF
        # Names of the clusters whose "last ids" answer is still pending
        # (None stands for this cluster itself); see _notify/maybeNotify.
        self.notifying = set()

        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
        # partitions.
        self.pt = None

        self.uuid = config.get('nid')
        logging.node(self.name, self.uuid)

        self.backup_handler = BackupHandler(self)
        self.master_event_handler = MasterEventHandler(self)
        self.upstream_admin_handler = UpstreamAdminHandler(self)
        self.cluster_state = None
        self.upstream_admin = self.upstream_admin_conn = None
        self.reset()
        registerLiveDebugger(on_log=self.log)

    def close(self):
        self.listening_conn = None
        super(Application, self).close()

    def reset(self):
        """Reset per-connection state, e.g. after losing the primary master."""
        # Re-initialize the Monitor part (this cluster's own monitor state).
        Monitor.__init__(self)
        self.asking_monitor_information = []
        self.master_conn = None
        self.master_node = None

    def log(self):
        """Dump the state of the event manager, node manager and partition
        table, for debugging (also used as pre-mortem data in run())."""
        self.em.log()
        self.nm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        try:
            self._run()
        except Exception:
            # Dump as much state as possible before dying.
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop."""
        if len(self.name) == 0:
            raise RuntimeError, 'cluster name must be non-empty'

        # Make a listening port.
        handler = AdminEventHandler(self)
        self.listening_conn = ListeningConnection(self, handler, self.server)

        # Poll until asked to stop, reconnecting to a (new) primary master
        # every time the current one fails.
        while self.cluster_state != ClusterStates.STOPPING:
            self.connectToPrimary()
            try:
                while True:
                    self.em.poll(1)
            except PrimaryFailure:
                logging.error('primary master is down')
        self.listening_conn.close()
        # Drain remaining events before returning.
        while not self.em.isIdle():
            self.em.poll(1)

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.

        If a primary master node is not elected or ready, repeat
        the attempt of a connection periodically.

        Note that I do not accept any connection from non-master nodes
        at this stage.
        """
        self.cluster_state = None
        # search, find, connect and identify to the primary master
        bootstrap = BootstrapManager(self, NodeTypes.ADMIN, self.server,
            backup=list(self.backup_dict))
        self.master_node, self.master_conn = bootstrap.getPrimaryConnection()

        # passive handler
        self.master_conn.setHandler(self.master_event_handler)
        self.master_conn.ask(Packets.AskClusterState())

    def connectToUpstreamAdmin(self):
        """(Re)connect to the admin node of the upstream cluster."""
        if self.listening_conn: # if running
            self.upstream_admin_conn = None
            # Retry until a connection attempt is not immediately closed.
            while True:
                conn = ClientConnection(self, self.upstream_admin_handler,
                    self.upstream_admin)
                if not conn.isClosed():
                    break
            self.upstream_admin_conn = conn

    def partitionTableUpdated(self):
        """Recompute and publish monitoring data derived from the partition
        table: number of down nodes and a summary of cell states."""
        pt = self.pt
        if pt:
            down_set = set()
            pt_summary = Counter()
            for offset in xrange(pt.np):
                for cell in pt.getCellList(offset):
                    node = cell.getNode()
                    if not node.isRunning():
                        down_set.add(node)
                    pt_summary.update((cell.getState(),))
            self.updateMonitorInformation(None,
                down=len(down_set), pt_summary=dict(pt_summary))

    def askMonitorInformation(self, conn):
        """Queue a neoctl request for monitor information; start a
        notification round immediately unless one is already pending."""
        asking = self.asking_monitor_information or self.notifying
        self.asking_monitor_information.append((conn, conn.getPeerId()))
        if not asking:
            # Only ask fresh "last ids" when this cluster is operational.
            self._notify(self.operational)

    def updateMonitorInformation(self, name, **kw):
        """Record monitored values for cluster 'name' (None = this cluster),
        keeping only values that actually changed, and trigger notification."""
        monitor = self if name is None else self.backup_dict[name]
        kw = {k: v for k, v in kw.iteritems() if v != getattr(monitor, k)}
        if not kw:
            return
        monitor.monitor_changed = True
        monitor.__dict__.update(kw)
        # Relay changes about this cluster to the upstream admin, if any.
        if name is None and self.upstream_admin_conn:
            self.upstream_admin_conn.send(Packets.NotifyMonitorInformation(kw))
        if not self.notifying:
            # Cancel any scheduled retry and notify now.
            self.em.setTimeout(None, None)
            self._notify(self.operational)

    def _notify(self, ask_ids=True):
        """Produce monitoring output: email (when configured and something
        changed) and answers to queued neoctl requests.

        When 'ask_ids' is true, first (re)ask the last ids of this cluster
        and of every operational backup; the method is then re-entered via
        maybeNotify() once all answers have arrived.
        """
        if ask_ids:
            self.askLastIds(self.master_conn)
            self.notifying = notifying = {None}
            for name, monitor in self.backup_dict.iteritems():
                if monitor.operational:
                    monitor.askLastIds(monitor.conn)
                    notifying.add(name)
        # Bail out while answers are still pending, or when the cluster
        # state is unknown although we are connected to a master
        # (chained comparison: cluster_state is None and master_conn is not None).
        if self.notifying or self.cluster_state is None is not self.master_conn:
            return
        # severity buckets: [ok, warning, problem], each a list of names.
        severity = [], [], []
        my_severity = self.severity
        severity[my_severity].append(self.name)
        changed = set()
        if self.monitor_changed:
            self.monitor_changed = False
            changed.add(self.name)
        if self.master_conn is None:
            body = NOT_CONNECTED_MESSAGE
        else:
            upstream, body = self.formatSummary()
            body = [body]
            for name, backup in self.backup_dict.iteritems():
                body += '', name, ' ' + backup.formatSummary(upstream)[1]
                severity[backup.severity or backup.lagging].append(name)
                if backup.monitor_changed:
                    backup.monitor_changed = False
                    changed.add(name)
            body = '\n'.join(body)
        # Send email if something changed, or if a previous send failed
        # and the retry delay has elapsed.
        if changed or self.smtp_retry < time():
            logging.debug('monitor notification')
            email_list = self.email_list
            while email_list: # not a loop
                msg = MIMEText(body + (self.smtp_exc or ''))
                msg['Date'] = formatdate()
                # Compute the subject status word from the severity buckets;
                # 'while 1' is used as a breakable block, not a loop.
                clusters, x = severity[1:]
                while 1:
                    if x:
                        clusters = clusters + x
                        x = 'PROBLEM'
                    elif clusters:
                        x = 'WARNING'
                    else:
                        x = 'OK'
                        break
                    # Only name the clusters whose state just changed.
                    clusters = changed.intersection(clusters)
                    if clusters:
                        x += ' (%s)' % ', '.join(sorted(clusters))
                    break
                msg['Subject'] = 'NEO monitoring: ' + x
                msg['From'] = self.email_from
                msg['To'] = ', '.join(email_list)
                s = self.SMTP()
                try:
                    s.connect(self.smtp_host)
                    if self.smtp_tls:
                        s.starttls()
                    if self.smtp_login:
                        s.login(*self.smtp_login)
                    s.sendmail(None, email_list, msg.as_string())
                except Exception:
                    x = format_exc()
                    logging.error(x)
                    # Remember the failure so it is reported in the next
                    # message, and schedule a retry in 10 minutes.
                    if changed or not self.smtp_exc:
                        self.smtp_exc = (
                            "\n\nA notification could not be sent at %s:\n\n%s"
                            % (msg['Date'], x))
                    retry = self.smtp_retry = time() + 600
                else:
                    self.smtp_exc = None
                    self.smtp_retry = INF
                    # Nothing to watch anymore -> no periodic refresh.
                    if not (self.operational and any(monitor.operational
                            for monitor in self.backup_dict.itervalues())):
                        break
                    retry = time() + 600
                finally:
                    s.close()
                self.em.setTimeout(retry, self._notify)
                break
        # Answer queued neoctl requests, if any.
        neoctl = self.asking_monitor_information
        if neoctl:
            # Replace this cluster's name by None (protocol convention,
            # presumably — TODO confirm) at the head of its severity bucket.
            del severity[my_severity][0]
            if self.smtp_exc:
                my_severity = 2
                body += self.smtp_exc
            severity[1].sort()
            severity[2].sort()
            severity[my_severity].insert(0, None)
            p = Packets.AnswerMonitorInformation(severity[1], severity[2], body)
            for conn, msg_id in neoctl:
                try:
                    conn.send(p, msg_id)
                except ConnectionClosed:
                    pass
            del self.asking_monitor_information[:]

    def maybeNotify(self, name):
        """Called when the "last ids" answer for 'name' arrives; once the
        last pending answer is in, finish the notification round."""
        try:
            self.notifying.remove(name)
        except KeyError:
            return
        self._notify(False)

    def sendPartitionTable(self, conn, min_offset, max_offset, uuid):
        """Answer a partition-table request over [min_offset, max_offset)
        (max_offset == 0 means up to the last partition)."""
        pt = self.pt
        if max_offset == 0:
            max_offset = pt.getPartitions()
        try:
            # Python 2 'map': builds the list of rows eagerly, so an
            # invalid offset raises here.
            row_list = map(pt.getRow, xrange(min_offset, max_offset))
        except IndexError:
            conn.send(Errors.ProtocolError('invalid partition table offset'))
        else:
            conn.answer(Packets.AnswerPartitionList(
                pt.getID(), pt.getReplicas(), row_list))
class Application(BaseApplication):
    """The admin node application.

    Accepts neoctl connections and mirrors the cluster state and the
    partition table from the primary master.
    """

    def __init__(self, config):
        super(Application, self).__init__(
            config.getSSL(), config.getDynamicMasterList())
        for address in config.getMasters():
            self.nm.createMaster(address=address)

        self.name = config.getCluster()
        self.server = config.getBind()

        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
        # partitions.
        self.pt = None

        self.uuid = config.getUUID()
        self.request_handler = MasterRequestEventHandler(self)
        self.master_event_handler = MasterEventHandler(self)
        self.cluster_state = None
        self.reset()
        registerLiveDebugger(on_log=self.log)

    def close(self):
        self.listening_conn = None
        super(Application, self).close()

    def reset(self):
        """Reset per-connection state, e.g. after losing the primary master."""
        self.bootstrapped = False
        self.master_conn = None
        self.master_node = None

    def log(self):
        """Dump the state of the event manager, node manager and partition
        table, for debugging (also used as pre-mortem data in run())."""
        self.em.log()
        self.nm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        try:
            self._run()
        except Exception:
            # Dump as much state as possible before dying.
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop."""
        if len(self.name) == 0:
            raise RuntimeError, 'cluster name must be non-empty'

        # Make a listening port.
        handler = AdminEventHandler(self)
        self.listening_conn = ListeningConnection(self, handler, self.server)

        # Poll until asked to stop, reconnecting to a (new) primary master
        # every time the current one fails.
        while self.cluster_state != ClusterStates.STOPPING:
            self.connectToPrimary()
            try:
                while True:
                    self.em.poll(1)
            except PrimaryFailure:
                logging.error('primary master is down')
        self.listening_conn.close()
        # Drain remaining events before returning.
        while not self.em.isIdle():
            self.em.poll(1)

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.

        If a primary master node is not elected or ready, repeat
        the attempt of a connection periodically.

        Note that I do not accept any connection from non-master nodes
        at this stage.
        """
        self.cluster_state = None
        # search, find, connect and identify to the primary master
        bootstrap = BootstrapManager(self, NodeTypes.ADMIN, self.server)
        self.master_node, self.master_conn, num_partitions, num_replicas = \
            bootstrap.getPrimaryConnection()

        if self.pt is None:
            self.pt = PartitionTable(num_partitions, num_replicas)
        elif self.pt.getPartitions() != num_partitions:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of partitions is inconsistent')
        elif self.pt.getReplicas() != num_replicas:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of replicas is inconsistent')

        # passive handler
        self.master_conn.setHandler(self.master_event_handler)
        self.master_conn.ask(Packets.AskClusterState())
        self.master_conn.ask(Packets.AskPartitionTable())

    def sendPartitionTable(self, conn, min_offset, max_offset, uuid):
        """Answer a partition-table request over [min_offset, max_offset)
        (max_offset == 0 means up to the last partition), keeping only the
        cells of node 'uuid' when one is given."""
        # we have a pt
        self.pt.log()
        row_list = []
        if max_offset == 0:
            max_offset = self.pt.getPartitions()
        try:
            for offset in xrange(min_offset, max_offset):
                row = []
                try:
                    for cell in self.pt.getCellList(offset):
                        if uuid is None or cell.getUUID() == uuid:
                            row.append((cell.getUUID(), cell.getState()))
                except TypeError:
                    # Presumably getCellList may return None for an
                    # unassigned partition — TODO confirm.
                    pass
                row_list.append((offset, row))
        except IndexError:
            conn.send(Errors.ProtocolError('invalid partition table offset'))
        else:
            conn.answer(Packets.AnswerPartitionList(self.pt.getID(), row_list))
class Application(BaseApplication):
    """The admin node application.

    Accepts neoctl connections and mirrors the cluster state and the
    partition table from the primary master.
    """

    def __init__(self, config):
        super(Application, self).__init__(
            config.getSSL(), config.getDynamicMasterList())
        for address in config.getMasters():
            self.nm.createMaster(address=address)

        self.name = config.getCluster()
        self.server = config.getBind()

        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
        # partitions.
        self.pt = None

        self.uuid = config.getUUID()
        self.request_handler = MasterRequestEventHandler(self)
        self.master_event_handler = MasterEventHandler(self)
        self.cluster_state = None
        self.reset()
        registerLiveDebugger(on_log=self.log)

    def close(self):
        self.listening_conn = None
        super(Application, self).close()

    def reset(self):
        """Reset per-connection state, e.g. after losing the primary master."""
        self.bootstrapped = False
        self.master_conn = None
        self.master_node = None

    def log(self):
        """Dump the state of the event manager, node manager and partition
        table, for debugging (also used as pre-mortem data in run())."""
        self.em.log()
        self.nm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        try:
            self._run()
        except Exception:
            # Dump as much state as possible before dying.
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop."""
        if len(self.name) == 0:
            raise RuntimeError, 'cluster name must be non-empty'

        # Make a listening port.
        handler = AdminEventHandler(self)
        self.listening_conn = ListeningConnection(self, handler, self.server)

        # Poll until asked to stop, reconnecting to a (new) primary master
        # every time the current one fails.
        while self.cluster_state != ClusterStates.STOPPING:
            self.connectToPrimary()
            try:
                while True:
                    self.em.poll(1)
            except PrimaryFailure:
                logging.error('primary master is down')
        self.listening_conn.close()
        # Drain remaining events before returning.
        while not self.em.isIdle():
            self.em.poll(1)

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.

        If a primary master node is not elected or ready, repeat
        the attempt of a connection periodically.

        Note that I do not accept any connection from non-master nodes
        at this stage.
        """
        self.cluster_state = None
        # search, find, connect and identify to the primary master
        bootstrap = BootstrapManager(self, self.name, NodeTypes.ADMIN,
            self.uuid, self.server)
        data = bootstrap.getPrimaryConnection()
        (node, conn, uuid, num_partitions, num_replicas) = data
        self.master_node = node
        self.master_conn = conn
        # The master may have assigned us a (new) UUID.
        self.uuid = uuid

        if self.pt is None:
            self.pt = PartitionTable(num_partitions, num_replicas)
        elif self.pt.getPartitions() != num_partitions:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of partitions is inconsistent')
        elif self.pt.getReplicas() != num_replicas:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of replicas is inconsistent')

        # passive handler
        self.master_conn.setHandler(self.master_event_handler)
        self.master_conn.ask(Packets.AskClusterState())
        self.master_conn.ask(Packets.AskNodeInformation())
        self.master_conn.ask(Packets.AskPartitionTable())

    def sendPartitionTable(self, conn, min_offset, max_offset, uuid):
        """Answer a partition-table request over [min_offset, max_offset)
        (max_offset == 0 means up to the last partition), keeping only the
        cells of node 'uuid' when one is given."""
        # we have a pt
        self.pt.log()
        row_list = []
        if max_offset == 0:
            max_offset = self.pt.getPartitions()
        try:
            for offset in xrange(min_offset, max_offset):
                row = []
                try:
                    for cell in self.pt.getCellList(offset):
                        if uuid is None or cell.getUUID() == uuid:
                            row.append((cell.getUUID(), cell.getState()))
                except TypeError:
                    # Presumably getCellList may return None for an
                    # unassigned partition — TODO confirm.
                    pass
                row_list.append((offset, row))
        except IndexError:
            # NOTE(review): this revision uses conn.notify for the error,
            # unlike the sibling revisions which use conn.send.
            conn.notify(Errors.ProtocolError('invalid partition table offset'))
        else:
            conn.answer(Packets.AnswerPartitionList(self.pt.getID(), row_list))