def _run(self): """Make sure that the status is sane and start a loop.""" if len(self.name) == 0: raise RuntimeError, 'cluster name must be non-empty' # Make a listening port handler = identification.IdentificationHandler(self) self.listening_conn = ListeningConnection(self, handler, self.server) self.server = self.listening_conn.getAddress() # Connect to a primary master node, verify data, and # start the operation. This cycle will be executed permanently, # until the user explicitly requests a shutdown. self.operational = False while True: self.cluster_state = None if self.master_node is None: # look for the primary master self.connectToPrimary() self.checker = Checker(self) self.replicator = Replicator(self) self.tm = TransactionManager(self) try: self.initialize() self.doOperation() raise RuntimeError, 'should not reach here' except StoppedOperation, msg: logging.error('operation stopped: %s', msg) except PrimaryFailure, msg: logging.error('primary master is down: %s', msg)
def getLoopbackConnection(self):
    """Yield a client connection aimed at a local master application.

    A ``MasterApplication`` is created and given a listening connection;
    a ``ClientConnection`` targeting that listening address is then
    yielded. The application is closed when the generator is finalized,
    whether or not the setup succeeded.
    """
    master = MasterApplication(address=BIND, getSSL=NEOCluster.SSL,
                               getReplicas=0, getPartitions=1)
    try:
        event_handler = EventHandler(master)
        master.listening_conn = ListeningConnection(
            master, event_handler, master.server)
        # Register the master under the address it is really listening on.
        master_node = master.nm.createMaster(
            address=master.listening_conn.getAddress(),
            uuid=master.uuid)
        yield ClientConnection(master, event_handler, master_node)
    finally:
        master.close()
def _run(self): """Make sure that the status is sane and start a loop.""" if len(self.name) == 0: raise RuntimeError, 'cluster name must be non-empty' # Make a listening port. handler = AdminEventHandler(self) self.listening_conn = ListeningConnection(self, handler, self.server) while self.cluster_state != ClusterStates.STOPPING: self.connectToPrimary() try: while True: self.em.poll(1) except PrimaryFailure: logging.error('primary master is down') self.listening_conn.close() while not self.em.isIdle(): self.em.poll(1)
def _run(self):
    """Make sure that the status is sane and start a loop.

    Opens the listening connection, then alternates between election
    and playing the resulting role until the cluster state becomes
    ``ClusterStates.STOPPING``.

    Raises:
        RuntimeError: if a role method returns instead of raising.
    """
    # Make a listening port.
    self.listening_conn = ListeningConnection(self, None, self.server)

    # Start a normal operation.
    while self.cluster_state != ClusterStates.STOPPING:
        # (Re)elect a new primary master.
        # With no other master known, this node is primary by default.
        self.primary = not self.nm.getMasterList()
        if not self.primary:
            self.electPrimary()
        try:
            if self.primary:
                self.playPrimaryRole()
            else:
                self.playSecondaryRole()
            # Call form of ``raise``: valid on both Python 2 and Python 3.
            raise RuntimeError('should not reach here')
        except (ElectionFailure, PrimaryFailure):
            # Forget all connections.
            for conn in self.em.getClientList():
                conn.close()
def getLoopbackConnection(self):
    """Return a client connection aimed at a local master application.

    The returned connection carries a ``reset`` attribute: calling it
    wipes the connection's instance dictionary and runs ``__init__``
    again with the same application, handler and node.
    """
    master = MasterApplication(getSSL=NEOCluster.SSL,
                               getReplicas=0, getPartitions=1)
    event_handler = EventHandler(master)
    master.listening_conn = ListeningConnection(
        master, event_handler, master.server)
    master_node = master.nm.createMaster(
        address=master.listening_conn.getAddress(),
        uuid=master.uuid)
    # Allocate without initializing: reset() performs the real __init__.
    loopback = ClientConnection.__new__(ClientConnection)

    def reset():
        loopback.__dict__.clear()
        loopback.__init__(master, event_handler, master_node)
        # Clearing __dict__ dropped this attribute too, so put it back.
        loopback.reset = reset

    reset()
    return loopback
class Application(BaseApplication):
    """The master node application."""

    packing = None
    # Latest completely committed TID
    last_transaction = ZERO_TID
    backup_tid = None
    backup_app = None
    uuid = None
    truncate_tid = None

    def __init__(self, config):
        """Build the master application from *config*.

        Reads cluster name, bind address, autostart flag, master list,
        replicas/partitions, UUID and upstream (backup) settings through
        the ``get*`` accessors of *config*.

        Raises:
            RuntimeError: if replicas is negative or partitions is not
                strictly positive.
            ValueError: if the upstream cluster has the same name as
                this cluster.
        """
        super(Application, self).__init__(
            config.getSSL(), config.getDynamicMasterList())
        self.tm = TransactionManager(self.onTransactionCommitted)

        self.name = config.getCluster()
        self.server = config.getBind()
        self.autostart = config.getAutostart()

        self.storage_readiness = set()
        for master_address in config.getMasters():
            self.nm.createMaster(address=master_address)

        logging.debug('IP address is %s, port is %d', *self.server)

        # Partition table
        replicas, partitions = config.getReplicas(), config.getPartitions()
        if replicas < 0:
            # Zero replicas is accepted, hence "non-negative".
            raise RuntimeError('replicas must be a non-negative integer')
        if partitions <= 0:
            raise RuntimeError('partitions must be more than zero')
        self.pt = PartitionTable(partitions, replicas)
        logging.info('Configuration:')
        logging.info('Partitions: %d', partitions)
        logging.info('Replicas : %d', replicas)
        logging.info('Name : %s', self.name)

        self.listening_conn = None
        self.primary = None
        self.primary_master_node = None
        self.cluster_state = None

        # Only override the class-level default when a UUID is configured.
        uuid = config.getUUID()
        if uuid:
            self.uuid = uuid

        # election related data
        self.unconnected_master_node_set = set()
        self.negotiating_master_node_set = set()
        self.master_address_dict = weakref.WeakKeyDictionary()

        self._current_manager = None

        # backup
        upstream_cluster = config.getUpstreamCluster()
        if upstream_cluster:
            if upstream_cluster == self.name:
                raise ValueError("upstream cluster name must be"
                                 " different from cluster name")
            self.backup_app = BackupApplication(self, upstream_cluster,
                                                config.getUpstreamMasters())

        self.administration_handler = administration.AdministrationHandler(
            self)
        self.secondary_master_handler = secondary.SecondaryMasterHandler(self)
        self.client_service_handler = client.ClientServiceHandler(self)
        self.storage_service_handler = storage.StorageServiceHandler(self)

        registerLiveDebugger(on_log=self.log)

    def close(self):
        """Drop the listening connection, close the backup app, then the base."""
        self.listening_conn = None
        if self.backup_app is not None:
            self.backup_app.close()
        super(Application, self).close()

    def log(self):
        """Ask every sub-component that is present to log its state."""
        self.em.log()
        if self.backup_app is not None:
            self.backup_app.log()
        self.nm.log()
        self.tm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        """Run ``_run()``; on failure log pre-mortem data and re-raise."""
        try:
            self._run()
        except Exception:
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop.

        Raises:
            RuntimeError: if a role method returns instead of raising.
        """
        # Make a listening port.
        self.listening_conn = ListeningConnection(self, None, self.server)

        # Start a normal operation.
        while self.cluster_state != ClusterStates.STOPPING:
            # (Re)elect a new primary master.
            # With no other master known, this node is primary by default.
            self.primary = not self.nm.getMasterList()
            if not self.primary:
                self.electPrimary()
            try:
                if self.primary:
                    self.playPrimaryRole()
                else:
                    self.playSecondaryRole()
                raise RuntimeError('should not reach here')
            except (ElectionFailure, PrimaryFailure):
                # Forget all connections.
                for conn in self.em.getClientList():
                    conn.close()

    def electPrimary(self):
        """Elect a primary master node.

        The difficulty is that a master node must accept connections from
        others while attempting to connect to other master nodes at the
        same time.
        Note that storage nodes and client nodes may connect to self as
        well as master nodes.

        On return, ``self.primary`` is a boolean: true when it was still
        ``None`` at the end of the election.
        """
        logging.info('begin the election of a primary master')

        client_handler = election.ClientElectionHandler(self)
        self.unconnected_master_node_set.clear()
        self.negotiating_master_node_set.clear()
        self.master_address_dict.clear()
        self.listening_conn.setHandler(election.ServerElectionHandler(self))
        getByAddress = self.nm.getByAddress

        while True:
            # handle new connected masters
            for node in self.nm.getMasterList():
                node.setUnknown()
                self.unconnected_master_node_set.add(node.getAddress())

            # start the election process
            self.primary = None
            self.primary_master_node = None
            try:
                while (self.unconnected_master_node_set or
                       self.negotiating_master_node_set):
                    for addr in self.unconnected_master_node_set:
                        self.negotiating_master_node_set.add(addr)
                        # XXX: Ugly, but the whole election code will be
                        #      replaced soon
                        ClientConnection(self, client_handler,
                                         getByAddress(addr))
                    self.unconnected_master_node_set.clear()
                    self.em.poll(1)
            except ElectionFailure as m:
                # something goes wrong, clean then restart
                logging.error('election failed: %s', m)

                # Ask all connected nodes to reelect a single primary master.
                for conn in self.em.getClientList():
                    conn.notify(Packets.ReelectPrimary())
                    conn.abort()

                # Wait until the connections are closed.
                self.primary = None
                self.primary_master_node = None
                # XXX: Since poll does not wake up anymore every second,
                #      the following time condition should be reviewed.
                #      See also playSecondaryRole.
                t = time() + 10
                while self.em.getClientList() and time() < t:
                    try:
                        self.em.poll(1)
                    except ElectionFailure:
                        # Already restarting: further failures are ignored.
                        pass

                # Close all connections.
                for conn in self.em.getClientList() + self.em.getServerList():
                    conn.close()
            else:
                # election succeed, stop the process
                self.primary = self.primary is None
                break
class Application(BaseApplication):
    """The master node application."""

    packing = None
    # Latest completely committed TID
    last_transaction = ZERO_TID
    backup_tid = None
    backup_app = None
    uuid = None
    truncate_tid = None

    def __init__(self, config):
        """Build the master application from *config*.

        Reads cluster name, bind address, autostart flag, master list,
        replicas/partitions, UUID and upstream (backup) settings through
        the ``get*`` accessors of *config*.

        Raises:
            RuntimeError: if replicas is negative or partitions is not
                strictly positive.
            ValueError: if the upstream cluster has the same name as
                this cluster.
        """
        super(Application, self).__init__(config.getSSL(),
                                          config.getDynamicMasterList())
        self.tm = TransactionManager(self.onTransactionCommitted)

        self.name = config.getCluster()
        self.server = config.getBind()
        self.autostart = config.getAutostart()

        self.storage_readiness = set()
        for master_address in config.getMasters():
            self.nm.createMaster(address=master_address)

        logging.debug('IP address is %s, port is %d', *self.server)

        # Partition table
        replicas, partitions = config.getReplicas(), config.getPartitions()
        if replicas < 0:
            # Zero replicas is accepted, hence "non-negative".
            raise RuntimeError('replicas must be a non-negative integer')
        if partitions <= 0:
            raise RuntimeError('partitions must be more than zero')
        self.pt = PartitionTable(partitions, replicas)
        logging.info('Configuration:')
        logging.info('Partitions: %d', partitions)
        logging.info('Replicas : %d', replicas)
        logging.info('Name : %s', self.name)

        self.listening_conn = None
        self.primary = None
        self.primary_master_node = None
        self.cluster_state = None

        # Only override the class-level default when a UUID is configured.
        uuid = config.getUUID()
        if uuid:
            self.uuid = uuid

        # election related data
        self.unconnected_master_node_set = set()
        self.negotiating_master_node_set = set()
        self.master_address_dict = weakref.WeakKeyDictionary()

        self._current_manager = None

        # backup
        upstream_cluster = config.getUpstreamCluster()
        if upstream_cluster:
            if upstream_cluster == self.name:
                raise ValueError("upstream cluster name must be"
                                 " different from cluster name")
            self.backup_app = BackupApplication(self, upstream_cluster,
                                                config.getUpstreamMasters())

        self.administration_handler = administration.AdministrationHandler(
            self)
        self.secondary_master_handler = secondary.SecondaryMasterHandler(self)
        self.client_service_handler = client.ClientServiceHandler(self)
        self.storage_service_handler = storage.StorageServiceHandler(self)

        registerLiveDebugger(on_log=self.log)

    def close(self):
        """Drop the listening connection, close the backup app, then the base."""
        self.listening_conn = None
        if self.backup_app is not None:
            self.backup_app.close()
        super(Application, self).close()

    def log(self):
        """Ask every sub-component that is present to log its state."""
        self.em.log()
        if self.backup_app is not None:
            self.backup_app.log()
        self.nm.log()
        self.tm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        """Run ``_run()``; on failure log pre-mortem data and re-raise."""
        try:
            self._run()
        except Exception:
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop.

        Raises:
            RuntimeError: if a role method returns instead of raising.
        """
        # Make a listening port.
        self.listening_conn = ListeningConnection(self, None, self.server)

        # Start a normal operation.
        while self.cluster_state != ClusterStates.STOPPING:
            # (Re)elect a new primary master.
            # With no other master known, this node is primary by default.
            self.primary = not self.nm.getMasterList()
            if not self.primary:
                self.electPrimary()
            try:
                if self.primary:
                    self.playPrimaryRole()
                else:
                    self.playSecondaryRole()
                raise RuntimeError('should not reach here')
            except (ElectionFailure, PrimaryFailure):
                # Forget all connections.
                for conn in self.em.getClientList():
                    conn.close()

    def electPrimary(self):
        """Elect a primary master node.

        The difficulty is that a master node must accept connections from
        others while attempting to connect to other master nodes at the
        same time.
        Note that storage nodes and client nodes may connect to self as
        well as master nodes.

        On return, ``self.primary`` is a boolean: true when it was still
        ``None`` at the end of the election.
        """
        logging.info('begin the election of a primary master')

        client_handler = election.ClientElectionHandler(self)
        self.unconnected_master_node_set.clear()
        self.negotiating_master_node_set.clear()
        self.master_address_dict.clear()
        self.listening_conn.setHandler(election.ServerElectionHandler(self))
        getByAddress = self.nm.getByAddress

        while True:
            # handle new connected masters
            for node in self.nm.getMasterList():
                node.setUnknown()
                self.unconnected_master_node_set.add(node.getAddress())

            # start the election process
            self.primary = None
            self.primary_master_node = None
            try:
                while (self.unconnected_master_node_set or
                       self.negotiating_master_node_set):
                    for addr in self.unconnected_master_node_set:
                        self.negotiating_master_node_set.add(addr)
                        # XXX: Ugly, but the whole election code will be
                        #      replaced soon
                        ClientConnection(
                            self, client_handler, getByAddress(addr))
                    self.unconnected_master_node_set.clear()
                    self.em.poll(1)
            except ElectionFailure as m:
                # something goes wrong, clean then restart
                logging.error('election failed: %s', m)

                # Ask all connected nodes to reelect a single primary master.
                for conn in self.em.getClientList():
                    conn.notify(Packets.ReelectPrimary())
                    conn.abort()

                # Wait until the connections are closed.
                self.primary = None
                self.primary_master_node = None
                # XXX: Since poll does not wake up anymore every second,
                #      the following time condition should be reviewed.
                #      See also playSecondaryRole.
                t = time() + 10
                while self.em.getClientList() and time() < t:
                    try:
                        self.em.poll(1)
                    except ElectionFailure:
                        # Already restarting: further failures are ignored.
                        pass

                # Close all connections.
                for conn in self.em.getClientList() + self.em.getServerList():
                    conn.close()
            else:
                # election succeed, stop the process
                self.primary = self.primary is None
                break
class Application(BaseApplication):
    """The admin node application."""

    def __init__(self, config):
        """Build the admin application from *config*.

        Registers the configured master addresses, stores the cluster
        name, bind address and UUID, and creates the event handlers.
        """
        super(Application, self).__init__(
            config.getSSL(), config.getDynamicMasterList())
        for address in config.getMasters():
            self.nm.createMaster(address=address)

        self.name = config.getCluster()
        self.server = config.getBind()

        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
        # partitions.
        self.pt = None
        self.uuid = config.getUUID()
        self.request_handler = MasterRequestEventHandler(self)
        self.master_event_handler = MasterEventHandler(self)
        self.cluster_state = None
        self.reset()
        registerLiveDebugger(on_log=self.log)

    def close(self):
        """Drop the listening connection and close the base application."""
        self.listening_conn = None
        super(Application, self).close()

    def reset(self):
        """Forget the bootstrap flag and the primary master node/connection."""
        self.bootstrapped = False
        self.master_conn = None
        self.master_node = None

    def log(self):
        """Log the event manager, node manager and (if any) partition table."""
        self.em.log()
        self.nm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        """Run ``_run()``; on failure log pre-mortem data and re-raise."""
        try:
            self._run()
        except Exception:
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop.

        Raises:
            RuntimeError: if the cluster name is empty.
        """
        if len(self.name) == 0:
            # Call form of ``raise``: valid on both Python 2 and Python 3.
            raise RuntimeError('cluster name must be non-empty')

        # Make a listening port.
        handler = AdminEventHandler(self)
        self.listening_conn = ListeningConnection(self, handler, self.server)

        while self.cluster_state != ClusterStates.STOPPING:
            self.connectToPrimary()
            try:
                # Only an exception leaves this inner loop.
                while True:
                    self.em.poll(1)
            except PrimaryFailure:
                logging.error('primary master is down')
        self.listening_conn.close()
        # Keep polling until the event manager reports it is idle.
        while not self.em.isIdle():
            self.em.poll(1)

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.

        If a primary master node is not elected or ready, repeat
        the attempt of a connection periodically.

        Note that I do not accept any connection from non-master nodes
        at this stage.

        Raises:
            RuntimeError: if the partition or replica count announced by
                the master differs from the existing partition table.
        """
        self.cluster_state = None
        # search, find, connect and identify to the primary master
        bootstrap = BootstrapManager(self, NodeTypes.ADMIN, self.server)
        self.master_node, self.master_conn, num_partitions, num_replicas = \
            bootstrap.getPrimaryConnection()

        if self.pt is None:
            self.pt = PartitionTable(num_partitions, num_replicas)
        elif self.pt.getPartitions() != num_partitions:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of partitions is inconsistent')
        elif self.pt.getReplicas() != num_replicas:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of replicas is inconsistent')

        # passive handler
        self.master_conn.setHandler(self.master_event_handler)
        self.master_conn.ask(Packets.AskClusterState())
        self.master_conn.ask(Packets.AskPartitionTable())

    def sendPartitionTable(self, conn, min_offset, max_offset, uuid):
        """Answer *conn* with the rows in ``[min_offset, max_offset)``.

        A *max_offset* of 0 means "up to the number of partitions".
        When *uuid* is not None, only cells with that UUID are listed.
        An ``IndexError`` while building the rows is reported to *conn*
        as a protocol error instead of an answer.
        """
        # we have a pt
        self.pt.log()
        row_list = []
        if max_offset == 0:
            max_offset = self.pt.getPartitions()
        try:
            for offset in xrange(min_offset, max_offset):
                row = []
                try:
                    for cell in self.pt.getCellList(offset):
                        if uuid is None or cell.getUUID() == uuid:
                            row.append((cell.getUUID(), cell.getState()))
                except TypeError:
                    # NOTE(review): presumably getCellList() can return a
                    # non-iterable for an unassigned partition - confirm.
                    pass
                row_list.append((offset, row))
        except IndexError:
            conn.send(Errors.ProtocolError('invalid partition table offset'))
        else:
            conn.answer(Packets.AnswerPartitionList(self.pt.getID(), row_list))
def _run(self):
    """Open the listening connection, then alternate roles forever.

    The primary role is played first, then the secondary role, and the
    cycle repeats; the loop has no exit of its own.
    """
    listener = ListeningConnection(self, None, self.server)
    self.listening_conn = listener
    while 1:
        self.playPrimaryRole()
        self.playSecondaryRole()
class Application(BaseApplication): """The storage node application.""" checker = replicator = tm = None @classmethod def _buildOptionParser(cls): parser = cls.option_parser parser.description = "NEO Storage node" cls.addCommonServerOptions('storage', '127.0.0.1') _ = parser.group('storage') _('a', 'adapter', choices=sorted(DATABASE_MANAGER_DICT), help="database adapter to use") _('d', 'database', required=True, help="database connections string") _.float('w', 'wait', help="seconds to wait for backend to be available," " before erroring-out (-1 = infinite)") _.bool('disable-drop-partitions', help="do not delete data of discarded cells, which is useful for" " big databases because the current implementation is" " inefficient (this option should disappear in the future)") _.bool('new-nid', help="request a new NID from a cluster that is already" " operational, update the database with the new NID and exit," " which makes easier to quickly set up a replica by copying" " the database of another node while it was stopped") _ = parser.group('database creation') _.int('i', 'nid', help="specify an NID to use for this process. Previously" " assigned NID takes precedence (i.e. you should" " always use reset with this switch)") _('e', 'engine', help="database engine (MySQL only)") _.bool('dedup', help="enable deduplication of data" " when setting up a new storage node") # TODO: Forbid using "reset" along with any unneeded argument. # "reset" is too dangerous to let user a chance of accidentally # letting it slip through in a long option list. # It should even be forbidden in configuration files. 
_.bool('reset', help="remove an existing database if any, and exit") parser.set_defaults(**option_defaults) def __init__(self, config): super(Application, self).__init__( config.get('ssl'), config.get('dynamic_master_list')) # set the cluster name self.name = config['cluster'] self.dm = buildDatabaseManager(config['adapter'], (config['database'], config.get('engine'), config['wait']), ) self.disable_drop_partitions = config.get('disable_drop_partitions', False) # load master nodes for master_address in config['masters']: self.nm.createMaster(address=master_address) # set the bind address self.server = config['bind'] logging.debug('IP address is %s, port is %d', *self.server) # The partition table is initialized after getting the number of # partitions. self.pt = None self.listening_conn = None self.master_conn = None self.master_node = None # operation related data self.operational = False self.dm.setup(reset=config.get('reset', False), dedup=config.get('dedup', False)) self.loadConfiguration() self.devpath = self.dm.getTopologyPath() if config.get('new_nid'): self.new_nid = [x[0] for x in self.dm.iterAssignedCells()] if not self.new_nid: sys.exit('database is empty') self.uuid = None else: self.new_nid = () if 'nid' in config: # for testing purpose only self.uuid = config['nid'] logging.node(self.name, self.uuid) registerLiveDebugger(on_log=self.log) def close(self): self.listening_conn = None self.dm.close() super(Application, self).close() def _poll(self): self.em.poll(1) def log(self): self.em.log() self.nm.log() if self.tm: self.tm.log() if self.pt is not None: self.pt.log() def loadConfiguration(self): """Load persistent configuration data from the database. 
If data is not present, generate it.""" dm = self.dm # check cluster name name = dm.getName() if name is None: dm.setName(self.name) elif name != self.name: raise RuntimeError('name %r does not match with the database: %r' % (self.name, name)) # load configuration self.uuid = dm.getUUID() logging.node(self.name, self.uuid) logging.info('Configuration loaded:') logging.info('PTID : %s', dump(dm.getPTID())) logging.info('Name : %s', self.name) def loadPartitionTable(self): """Load a partition table from the database.""" ptid = self.dm.getPTID() if ptid is None: self.pt = PartitionTable(0, 0) return row_list = [] for offset, uuid, state in self.dm.getPartitionTable(): while len(row_list) <= offset: row_list.append([]) # register unknown nodes if self.nm.getByUUID(uuid) is None: self.nm.createStorage(uuid=uuid) row_list[offset].append((uuid, CellStates[state])) self.pt = object.__new__(PartitionTable) self.pt.load(ptid, self.dm.getNumReplicas(), row_list, self.nm) def run(self): try: self._run() except Exception: logging.exception('Pre-mortem data:') self.log() logging.flush() raise def _run(self): """Make sure that the status is sane and start a loop.""" if len(self.name) == 0: raise RuntimeError, 'cluster name must be non-empty' # Make a listening port handler = identification.IdentificationHandler(self) self.listening_conn = ListeningConnection(self, handler, self.server) self.server = self.listening_conn.getAddress() # Connect to a primary master node, verify data, and # start the operation. This cycle will be executed permanently, # until the user explicitly requests a shutdown. 
self.operational = False while True: self.cluster_state = None if self.master_node is None: # look for the primary master self.connectToPrimary() self.checker = Checker(self) self.replicator = Replicator(self) self.tm = TransactionManager(self) try: self.initialize() self.doOperation() raise RuntimeError, 'should not reach here' except StoppedOperation, msg: logging.error('operation stopped: %s', msg) except PrimaryFailure, msg: logging.error('primary master is down: %s', msg) finally:
class Application(BaseApplication, Monitor):
    """The admin node application."""

    # Class attribute, reached through ``self.SMTP()`` in ``_notify``.
    from smtplib import SMTP

    @classmethod
    def _buildOptionParser(cls):
        """Declare the command-line options of the admin node."""
        _ = cls.option_parser
        _.description = "NEO Admin node"
        cls.addCommonServerOptions('admin', '127.0.0.1:9999')

        hint = ' (the option can be repeated)'
        _ = _.group('admin')
        _.float(
            'monitor-maxlag', default=float(Backup.max_lag),
            help='warn if a backup cluster is too late at replicating upstream'
        )
        _('monitor-email', multiple=True,
          help='recipient email for notifications' + hint)
        _('monitor-backup', multiple=True,
          help='name of backup cluster to monitor' + hint)
        _('smtp', metavar='HOST[:PORT]',
          help='SMTP for email notifications')
        _.bool('smtp-tls',
               help='use STARTTLS')
        _('smtp-auth', metavar='USER:PASS',
          help='SMTP credentials')
        _.int('i', 'nid',
              help="specify an NID to use for this process (testing purpose)")

    def __init__(self, config):
        """Build the admin application from the mapping-like *config*.

        Registers the configured masters, creates one ``Backup`` monitor
        per ``monitor_backup`` entry, prepares the SMTP settings when
        recipients are configured, and creates the event handlers.
        """
        BaseApplication.__init__(self, config.get('ssl'),
                                 config.get('dynamic_master_list'))
        for address in config['masters']:
            self.nm.createMaster(address=address)

        self.name = config['cluster']
        self.server = config['bind']

        self.backup_dict = backup_dict = {}
        max_lag = config.get('monitor_maxlag', Backup.max_lag)
        for x in config.get('monitor_backup', ()):
            # ``x`` is the cluster name, then rebound to its monitor.
            backup_dict[x] = x = Backup()
            x.max_lag = max_lag
        self.email_list = config.get('monitor_email', ())
        if self.email_list:
            self.smtp_host = config.get('smtp') or 'localhost'
            self.smtp_tls = config.get('smtp_tls')
            if 'smtp_auth' in config:
                # Split on the first ':' only: the password may contain ':'.
                user, pwd = config['smtp_auth'].split(':', 1)
                self.smtp_login = user, pwd
            else:
                self.smtp_login = None
            email_from = os.getenv('EMAIL')
            if not email_from:
                try:
                    email_from = getpass.getuser()
                except Exception:
                    email_from = None
            self.email_from = formataddr(("NEO " + self.name, email_from))
        self.smtp_exc = None
        self.smtp_retry = INF
        self.notifying = set()

        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
        # partitions.
        self.pt = None
        self.uuid = config.get('nid')
        logging.node(self.name, self.uuid)
        self.backup_handler = BackupHandler(self)
        self.master_event_handler = MasterEventHandler(self)
        self.upstream_admin_handler = UpstreamAdminHandler(self)
        self.cluster_state = None
        self.upstream_admin = self.upstream_admin_conn = None
        self.reset()
        registerLiveDebugger(on_log=self.log)

    def close(self):
        """Drop the listening connection and close the base application."""
        self.listening_conn = None
        super(Application, self).close()

    def reset(self):
        """Re-run ``Monitor.__init__`` and forget the master and pending asks."""
        Monitor.__init__(self)
        self.asking_monitor_information = []
        self.master_conn = None
        self.master_node = None

    def log(self):
        """Log the event manager, node manager and (if any) partition table."""
        self.em.log()
        self.nm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        """Run ``_run()``; on failure log pre-mortem data and re-raise."""
        try:
            self._run()
        except Exception:
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop.

        Raises:
            RuntimeError: if the cluster name is empty.
        """
        if len(self.name) == 0:
            # Call form of ``raise``: valid on both Python 2 and Python 3.
            raise RuntimeError('cluster name must be non-empty')

        # Make a listening port.
        handler = AdminEventHandler(self)
        self.listening_conn = ListeningConnection(self, handler, self.server)

        while self.cluster_state != ClusterStates.STOPPING:
            self.connectToPrimary()
            try:
                # Only an exception leaves this inner loop.
                while True:
                    self.em.poll(1)
            except PrimaryFailure:
                logging.error('primary master is down')
        self.listening_conn.close()
        # Keep polling until the event manager reports it is idle.
        while not self.em.isIdle():
            self.em.poll(1)

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.

        If a primary master node is not elected or ready, repeat
        the attempt of a connection periodically.

        Note that I do not accept any connection from non-master nodes
        at this stage.
        """
        self.cluster_state = None
        # search, find, connect and identify to the primary master
        bootstrap = BootstrapManager(self, NodeTypes.ADMIN, self.server,
                                     backup=list(self.backup_dict))
        self.master_node, self.master_conn = bootstrap.getPrimaryConnection()

        # passive handler
        self.master_conn.setHandler(self.master_event_handler)
        self.master_conn.ask(Packets.AskClusterState())

    def connectToUpstreamAdmin(self):
        """Connect to the upstream admin node, retrying until it stays open.

        Does nothing when there is no listening connection.
        """
        if self.listening_conn:  # if running
            self.upstream_admin_conn = None
            while True:
                conn = ClientConnection(
                    self, self.upstream_admin_handler, self.upstream_admin)
                if not conn.isClosed():
                    break
            self.upstream_admin_conn = conn

    def partitionTableUpdated(self):
        """Recompute the down-node count and cell-state summary from ``pt``."""
        pt = self.pt
        if pt:
            down_set = set()
            pt_summary = Counter()
            for offset in xrange(pt.np):
                for cell in pt.getCellList(offset):
                    node = cell.getNode()
                    if not node.isRunning():
                        # A set: a node holding several cells counts once.
                        down_set.add(node)
                    pt_summary.update((cell.getState(),))
            self.updateMonitorInformation(
                None, down=len(down_set), pt_summary=dict(pt_summary))

    def askMonitorInformation(self, conn):
        """Queue *conn* for a monitor answer; start a notification if idle."""
        # Evaluated before appending: tells whether a round is already pending.
        asking = self.asking_monitor_information or self.notifying
        self.asking_monitor_information.append((conn, conn.getPeerId()))
        if not asking:
            self._notify(self.operational)

    def updateMonitorInformation(self, name, **kw):
        """Store changed monitor attributes and trigger a notification.

        *name* is None for this cluster, or a key of ``backup_dict``.
        Keyword values equal to the current attribute are ignored; when
        nothing changed, the method returns without side effect.
        """
        monitor = self if name is None else self.backup_dict[name]
        kw = {k: v for k, v in kw.iteritems() if v != getattr(monitor, k)}
        if not kw:
            return
        monitor.monitor_changed = True
        monitor.__dict__.update(kw)
        if name is None and self.upstream_admin_conn:
            self.upstream_admin_conn.send(Packets.NotifyMonitorInformation(kw))
        if not self.notifying:
            self.em.setTimeout(None, None)
            self._notify(self.operational)

    def _notify(self, ask_ids=True):
        """Build the monitoring summary, email it and answer pending asks.

        When *ask_ids* is true, last ids are first requested from the
        master and from every operational backup. The rest of the method
        only runs once ``self.notifying`` is empty.
        """
        if ask_ids:
            self.askLastIds(self.master_conn)
            # ``None`` stands for this cluster; backups are added by name.
            self.notifying = notifying = {None}
            for name, monitor in self.backup_dict.iteritems():
                if monitor.operational:
                    monitor.askLastIds(monitor.conn)
                    notifying.add(name)
        # Chained comparison: cluster_state is None while master_conn is not.
        if self.notifying or self.cluster_state is None is not self.master_conn:
            return
        # Cluster names bucketed by severity index (0, 1, 2).
        severity = [], [], []
        my_severity = self.severity
        severity[my_severity].append(self.name)
        changed = set()
        if self.monitor_changed:
            self.monitor_changed = False
            changed.add(self.name)
        if self.master_conn is None:
            body = NOT_CONNECTED_MESSAGE
        else:
            upstream, body = self.formatSummary()
            body = [body]
            for name, backup in self.backup_dict.iteritems():
                body += '', name, ' ' + backup.formatSummary(upstream)[1]
                severity[backup.severity or backup.lagging].append(name)
                if backup.monitor_changed:
                    backup.monitor_changed = False
                    changed.add(name)
            body = '\n'.join(body)
        if changed or self.smtp_retry < time():
            logging.debug('monitor notification')
            email_list = self.email_list
            while email_list:  # not a loop
                msg = MIMEText(body + (self.smtp_exc or ''))
                msg['Date'] = formatdate()
                clusters, x = severity[1:]
                # Not a loop either: picks the subject keyword.
                while 1:
                    if x:
                        clusters = clusters + x
                        x = 'PROBLEM'
                    elif clusters:
                        x = 'WARNING'
                    else:
                        x = 'OK'
                        break
                    clusters = changed.intersection(clusters)
                    if clusters:
                        x += ' (%s)' % ', '.join(sorted(clusters))
                    break
                msg['Subject'] = 'NEO monitoring: ' + x
                msg['From'] = self.email_from
                msg['To'] = ', '.join(email_list)
                s = self.SMTP()
                try:
                    s.connect(self.smtp_host)
                    if self.smtp_tls:
                        s.starttls()
                    if self.smtp_login:
                        s.login(*self.smtp_login)
                    s.sendmail(None, email_list, msg.as_string())
                except Exception:
                    x = format_exc()
                    logging.error(x)
                    if changed or not self.smtp_exc:
                        self.smtp_exc = (
                            "\n\nA notification could not be sent at %s:\n\n%s"
                            % (msg['Date'], x))
                    retry = self.smtp_retry = time() + 600
                else:
                    self.smtp_exc = None
                    self.smtp_retry = INF
                    if not (self.operational and any(
                            monitor.operational
                            for monitor in self.backup_dict.itervalues())):
                        # Leaves the outer ``while``: no timeout is armed.
                        break
                    retry = time() + 600
                finally:
                    s.close()
                self.em.setTimeout(retry, self._notify)
                break
        neoctl = self.asking_monitor_information
        if neoctl:
            # Replace this cluster's own name by None at the head of its
            # bucket.
            del severity[my_severity][0]
            if self.smtp_exc:
                my_severity = 2
                body += self.smtp_exc
            severity[1].sort()
            severity[2].sort()
            severity[my_severity].insert(0, None)
            p = Packets.AnswerMonitorInformation(severity[1], severity[2], body)
            for conn, msg_id in neoctl:
                try:
                    conn.send(p, msg_id)
                except ConnectionClosed:
                    # The asker went away: nothing to answer.
                    pass
            del self.asking_monitor_information[:]

    def maybeNotify(self, name):
        """Mark *name* as answered and resume ``_notify`` without re-asking.

        Does nothing when *name* was not pending in ``self.notifying``.
        """
        try:
            self.notifying.remove(name)
        except KeyError:
            return
        self._notify(False)

    def sendPartitionTable(self, conn, min_offset, max_offset, uuid):
        """Answer *conn* with the rows in ``[min_offset, max_offset)``.

        A *max_offset* of 0 means "up to the number of partitions".
        *uuid* is not used here. An ``IndexError`` while collecting the
        rows is reported to *conn* as a protocol error.
        """
        pt = self.pt
        if max_offset == 0:
            max_offset = pt.getPartitions()
        try:
            row_list = map(pt.getRow, xrange(min_offset, max_offset))
        except IndexError:
            conn.send(Errors.ProtocolError('invalid partition table offset'))
        else:
            conn.answer(
                Packets.AnswerPartitionList(pt.getID(), pt.getReplicas(),
                                            row_list))
class Application(BaseApplication):
    """The admin node application."""

    def __init__(self, config):
        """Build the admin application from *config*.

        Registers the configured master addresses, stores the cluster
        name, bind address and UUID, and creates the event handlers.
        """
        super(Application, self).__init__(
            config.getSSL(), config.getDynamicMasterList())
        for address in config.getMasters():
            self.nm.createMaster(address=address)

        self.name = config.getCluster()
        self.server = config.getBind()

        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
        # partitions.
        self.pt = None
        self.uuid = config.getUUID()
        self.request_handler = MasterRequestEventHandler(self)
        self.master_event_handler = MasterEventHandler(self)
        self.cluster_state = None
        self.reset()
        registerLiveDebugger(on_log=self.log)

    def close(self):
        """Drop the listening connection and close the base application."""
        self.listening_conn = None
        super(Application, self).close()

    def reset(self):
        """Forget the bootstrap flag and the primary master node/connection."""
        self.bootstrapped = False
        self.master_conn = None
        self.master_node = None

    def log(self):
        """Log the event manager, node manager and (if any) partition table."""
        self.em.log()
        self.nm.log()
        if self.pt is not None:
            self.pt.log()

    def run(self):
        """Run ``_run()``; on failure log pre-mortem data and re-raise."""
        try:
            self._run()
        except Exception:
            logging.exception('Pre-mortem data:')
            self.log()
            logging.flush()
            raise

    def _run(self):
        """Make sure that the status is sane and start a loop.

        Raises:
            RuntimeError: if the cluster name is empty.
        """
        if len(self.name) == 0:
            # Call form of ``raise``: valid on both Python 2 and Python 3.
            raise RuntimeError('cluster name must be non-empty')

        # Make a listening port.
        handler = AdminEventHandler(self)
        self.listening_conn = ListeningConnection(self, handler, self.server)

        while self.cluster_state != ClusterStates.STOPPING:
            self.connectToPrimary()
            try:
                # Only an exception leaves this inner loop.
                while True:
                    self.em.poll(1)
            except PrimaryFailure:
                logging.error('primary master is down')
        self.listening_conn.close()
        # Keep polling until the event manager reports it is idle.
        while not self.em.isIdle():
            self.em.poll(1)

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.

        If a primary master node is not elected or ready, repeat
        the attempt of a connection periodically.

        Note that I do not accept any connection from non-master nodes
        at this stage.

        Raises:
            RuntimeError: if the partition or replica count announced by
                the master differs from the existing partition table.
        """
        self.cluster_state = None
        # search, find, connect and identify to the primary master
        bootstrap = BootstrapManager(self, self.name, NodeTypes.ADMIN,
                                     self.uuid, self.server)
        data = bootstrap.getPrimaryConnection()
        (node, conn, uuid, num_partitions, num_replicas) = data
        self.master_node = node
        self.master_conn = conn
        # The UUID returned by the bootstrap replaces the configured one.
        self.uuid = uuid

        if self.pt is None:
            self.pt = PartitionTable(num_partitions, num_replicas)
        elif self.pt.getPartitions() != num_partitions:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of partitions is inconsistent')
        elif self.pt.getReplicas() != num_replicas:
            # XXX: shouldn't we recover instead of raising ?
            raise RuntimeError('the number of replicas is inconsistent')

        # passive handler
        self.master_conn.setHandler(self.master_event_handler)
        self.master_conn.ask(Packets.AskClusterState())
        self.master_conn.ask(Packets.AskNodeInformation())
        self.master_conn.ask(Packets.AskPartitionTable())

    def sendPartitionTable(self, conn, min_offset, max_offset, uuid):
        """Answer *conn* with the rows in ``[min_offset, max_offset)``.

        A *max_offset* of 0 means "up to the number of partitions".
        When *uuid* is not None, only cells with that UUID are listed.
        An ``IndexError`` while building the rows is notified to *conn*
        as a protocol error instead of an answer.
        """
        # we have a pt
        self.pt.log()
        row_list = []
        if max_offset == 0:
            max_offset = self.pt.getPartitions()
        try:
            for offset in xrange(min_offset, max_offset):
                row = []
                try:
                    for cell in self.pt.getCellList(offset):
                        if uuid is None or cell.getUUID() == uuid:
                            row.append((cell.getUUID(), cell.getState()))
                except TypeError:
                    # NOTE(review): presumably getCellList() can return a
                    # non-iterable for an unassigned partition - confirm.
                    pass
                row_list.append((offset, row))
        except IndexError:
            conn.notify(Errors.ProtocolError('invalid partition table offset'))
        else:
            conn.answer(Packets.AnswerPartitionList(self.pt.getID(), row_list))