def askLockedTransactions(self, conn):
    """Reply on `conn` with the unfinished-TID dict.

    The dict is whatever `self.app.dm.getUnfinishedTIDDict()` returns; it is
    wrapped in an AnswerLockedTransactions packet.
    """
    unfinished_tid_dict = self.app.dm.getUnfinishedTIDDict()
    reply = Packets.AnswerLockedTransactions(unfinished_tid_dict)
    conn.answer(reply)
def askRebaseTransaction(self, conn, *args):
    """Rebase through the transaction manager and answer with the result.

    `conn` and all extra positional arguments are forwarded unchanged to
    `self.app.tm.rebase`; its return value is sent back on `conn` inside an
    AnswerRebaseTransaction packet.
    """
    rebased = self.app.tm.rebase(conn, *args)
    reply = Packets.AnswerRebaseTransaction(rebased)
    conn.answer(reply)
def askTIDs(self, conn, *args):
    """Answer on `conn` with the result of `self._askTIDs(*args)`."""
    tid_result = self._askTIDs(*args)
    conn.answer(Packets.AnswerTIDs(tid_result))
def invalidatePartitions(self, tid, prev_tid, partition_set):
    """Update per-partition backup bookkeeping for transaction `tid`.

    `tid` is first recorded as the application's last transaction. Then,
    for every partition offset of the partition table:

    - if the offset is in `partition_set`, the readable cells are examined
      to adjust their `backup_tid`, a primary node is chosen for the
      partition if none is recorded yet, `tid` is appended to the
      partition's entry in `self.tid_list`, and replication is either
      resumed towards secondary cells or scheduled through `trigger_set`;
    - otherwise, cells whose `backup_tid` (or `replicating`) already
      reaches the last tracked TID are moved forward to `tid` without
      replicating.

    Finally, Replicate packets are sent to ready nodes for the untouched
    partitions, `triggerBackup` is called for each node of `trigger_set`,
    and the debug high-water mark of tracked TIDs is refreshed.

    tid -- TID that was just committed upstream
    prev_tid -- TID used as the previous "last" TID when a partition has
                no tracked TID yet
    partition_set -- container of partition offsets (tested with `in`)
    """
    app = self.app
    app.setLastTransaction(tid)
    pt = app.pt
    # Nodes for which a backup must be triggered at the end.
    trigger_set = set()
    # {node -> {offset -> None}} for partitions not touched by `tid`.
    untouched_dict = defaultdict(dict)
    for offset in xrange(pt.getPartitions()):
        # Last TID tracked for this partition, falling back to prev_tid
        # when nothing is tracked yet (empty list -> IndexError).
        try:
            last_max_tid = self.tid_list[offset][-1]
        except IndexError:
            last_max_tid = prev_tid
        if offset in partition_set:
            primary_list = []
            node_list = []
            cell_list = pt.getCellList(offset, readable=True)
            for cell in cell_list:
                node = cell.getNode()
                assert node.isConnected(), node
                if cell.backup_tid == prev_tid:
                    if prev_tid == tid:
                        # Connecting to upstream: any node that is
                        # up-to-date wrt upstream is candidate for being
                        # primary.
                        assert self.ignore_invalidations
                        if app.isStorageReady(node.getUUID()):
                            primary_list.append(node)
                        continue
                    # Let's given 4 TID t0,t1,t2,t3: if a cell is only
                    # modified by t0 & t3 and has all data for t0, 4 values
                    # are possible for its 'backup_tid' until it replicates
                    # up to t3: t0, t1, t2 or t3 - 1
                    # Choosing the smallest one (t0) is easier to implement
                    # but when leaving backup mode, we would always lose
                    # data if the last full transaction does not modify
                    # all partitions. t1 is wrong for the same reason.
                    # So we have chosen the highest one (t3 - 1).
                    # t2 should also work but maybe harder to implement.
                    cell.backup_tid = add64(tid, -1)
                    logging.debug(
                        "partition %u: updating backup_tid of %r to %s",
                        offset, cell, dump(cell.backup_tid))
                else:
                    assert cell.backup_tid < last_max_tid, (
                        cell.backup_tid, last_max_tid, prev_tid, tid)
                if app.isStorageReady(node.getUUID()):
                    node_list.append(node)
            # Make sure we have a primary storage for this partition.
            # NOTE(review): random.choice raises IndexError on an empty
            # sequence; presumably at least one ready node always exists
            # here -- confirm.
            if offset not in self.primary_partition_dict:
                self.primary_partition_dict[offset] = \
                    random.choice(primary_list or node_list)
            if node_list:
                self.tid_list[offset].append(tid)
                if primary_list:
                    # Resume replication to secondary cells.
                    self._triggerSecondary(
                        self.primary_partition_dict[offset], offset, tid,
                        cell_list)
                else:
                    trigger_set.update(node_list)
        else:
            # Partition not touched, so increase 'backup_tid' of all
            # "up-to-date" replicas, without having to replicate.
            for cell in pt.getCellList(offset, readable=True):
                if last_max_tid <= cell.backup_tid:
                    cell.backup_tid = tid
                    untouched_dict[cell.getNode()][offset] = None
                elif last_max_tid <= cell.replicating:
                    # Same for 'replicating' to avoid useless orders.
                    logging.debug(
                        "silently update replicating order"
                        " of %s for partition %u, up to %s",
                        uuid_str(cell.getUUID()), offset, dump(tid))
                    cell.replicating = tid
    # NOTE(review): the loop variable below rebinds the name
    # `untouched_dict`. Iteration still walks the original mapping because
    # the iterator is created before the first rebinding, but the outer
    # dict is no longer reachable by that name afterwards.
    for node, untouched_dict in untouched_dict.iteritems():
        if app.isStorageReady(node.getUUID()):
            node.send(Packets.Replicate(tid, '', untouched_dict))
    for node in trigger_set:
        self.triggerBackup(node)
    # Keep track of the largest total number of TIDs ever held in tid_list.
    count = sum(map(len, self.tid_list))
    if self.debug_tid_count < count:
        logging.debug("Maximum number of tracked tids: %u", count)
        self.debug_tid_count = count
def provideService(self):
    """Run the backup service until the cluster is asked to stop backing up.

    Each iteration of the outer loop switches the cluster to
    STARTING_BACKUP, resets the per-partition bookkeeping, waits until all
    nodes returned by `pt.getNodeSet(readable=True)` are reported ready,
    connects to the upstream primary and then polls forever in BACKINGUP
    state. A PrimaryFailure is logged and the outer loop starts over.

    Returns the TID read from `app.backup_tid` once a
    StateChangedException carrying ClusterStates.STOPPING_BACKUP has been
    caught and the partition table's minimum backup TID has reached it.

    Raises StateChangedException (re-raised) for any other target state.
    """
    logging.info('provide backup')
    poll = self.em.poll
    app = self.app
    pt = app.pt
    while True:
        app.changeClusterState(ClusterStates.STARTING_BACKUP)
        bootstrap = BootstrapManager(self, NodeTypes.CLIENT,
            backup=app.name)
        # {offset -> node}
        self.primary_partition_dict = {}
        # [[tid]]
        self.tid_list = tuple([] for _ in xrange(pt.getPartitions()))
        try:
            # Poll until no node of the readable set is reported as not
            # ready (for/else: the else branch runs when the for loop did
            # not break).
            while True:
                for node in pt.getNodeSet(readable=True):
                    if not app.isStorageReady(node.getUUID()):
                        break
                else:
                    break
                poll(1)
            node, conn = bootstrap.getPrimaryConnection()
            try:
                app.changeClusterState(ClusterStates.BACKINGUP)
                del bootstrap, node
                self.ignore_invalidations = True
                conn.setHandler(BackupHandler(self))
                conn.ask(Packets.AskLastTransaction())
                # debug variable to log how big 'tid_list' can be.
                self.debug_tid_count = 0
                # Only left through an exception.
                while True:
                    poll(1)
            except PrimaryFailure, msg:
                logging.error('upstream master is down: %s', msg)
            finally:
                # Always executed, whatever ended the polling loop.
                app.backup_tid = pt.getBackupTid()
                try:
                    conn.close()
                except PrimaryFailure:
                    pass
                # self.pt may not have been set: ignore in that case.
                try:
                    del self.pt
                except AttributeError:
                    pass
                for node in app.nm.getClientList(True):
                    node.getConnection().close()
        except StateChangedException, e:
            if e.args[0] != ClusterStates.STOPPING_BACKUP:
                raise
            app.changeClusterState(*e.args)
            tid = app.backup_tid
            # Wait for non-primary partitions to catch up,
            # so that all UP_TO_DATE cells are really UP_TO_DATE.
            # XXX: Another possibility could be to outdate such cells, and
            #      they would be quickly updated at the beginning of the
            #      RUNNING phase. This may simplify code.
            # Any unfinished replication from upstream will be truncated.
            while pt.getBackupTid(min) < tid:
                poll(1)
            last_tid = app.getLastTransaction()
            # NOTE(review): `handler` is not used anywhere else in this
            # method -- confirm whether it can be dropped.
            handler = EventHandler(app)
            if tid < last_tid:
                assert tid != ZERO_TID
                logging.warning("Truncating at %s (last_tid was %s)",
                    dump(app.backup_tid), dump(last_tid))
            else:
                # We will do a dummy truncation, just to leave backup mode,
                # so it's fine to start automatically if there's any
                # missing storage.
                # XXX: Consider using another method to leave backup mode,
                #      at least when there's nothing to truncate. Because
                #      in case of StoppedOperation during VERIFYING state,
                #      this flag will be wrongly set to False.
                app._startup_allowed = True
            # If any error happened before reaching this line, we'd go back
            # to backup mode, which is the right mode to recover.
            del app.backup_tid
            # Now back to RECOVERY...
            return tid
def askNewOIDs(self, conn, num_oids):
    """Answer on `conn` with `self.app.tm.getNextOIDList(num_oids)`."""
    oid_list = self.app.tm.getNextOIDList(num_oids)
    conn.answer(Packets.AnswerNewOIDs(oid_list))
def askStoreTransaction(self, conn, ttid, *txn_info):
    """Register the sender for `ttid`, vote, then acknowledge.

    The connection's UUID is registered with the transaction manager for
    `ttid`; the remaining positional arguments are passed to `vote` as a
    single tuple. An empty AnswerStoreTransaction is sent afterwards.
    """
    peer_uuid = conn.getUUID()
    self.app.tm.register(peer_uuid, ttid)
    self.app.tm.vote(ttid, txn_info)
    reply = Packets.AnswerStoreTransaction()
    conn.answer(reply)
def askLastTransaction(self, conn):
    """Answer on `conn` with `self.app.getLastTransaction()`."""
    last_tid = self.app.getLastTransaction()
    conn.answer(Packets.AnswerLastTransaction(last_tid))
def askNodeInformation(self, conn):
    """Push node information to `conn`, then send the (empty) answer.

    The notification goes through `self._notifyNodeInformation` before the
    AnswerNodeInformation packet is sent.
    """
    self._notifyNodeInformation(conn)
    reply = Packets.AnswerNodeInformation()
    conn.answer(reply)
def askRecovery(self, conn):
    """Answer with the partition table ID, backup TID and truncate TID.

    The backup TID is `app.backup_tid` itself when that value is false;
    otherwise it is taken from `app.pt.getBackupTid()`.
    """
    app = self.app
    ptid = app.pt.getID()
    backup_tid = app.backup_tid
    if backup_tid:
        # Backup mode: report the partition table's view instead.
        backup_tid = app.pt.getBackupTid()
    reply = Packets.AnswerRecovery(ptid, backup_tid, app.truncate_tid)
    conn.answer(reply)
def askLastIDs(self, conn):
    """Answer on `conn` with the transaction manager's last OID and TID."""
    manager = self.app.tm
    last_oid = manager.getLastOID()
    last_tid = manager.getLastTID()
    conn.answer(Packets.AnswerLastIDs(last_oid, last_tid))
def askClusterState(self, conn):
    """Answer on `conn` with `self.app.getClusterState()`."""
    conn.answer(Packets.AnswerClusterState(self.app.getClusterState()))
def connectionCompleted(self, conn, new):
    """Send node information and the full partition table to `conn`.

    `new` is accepted but not used by this implementation.
    """
    self._notifyNodeInformation(conn)
    table = self.app.pt
    packet = Packets.SendPartitionTable(table.getID(), table.getRowList())
    conn.notify(packet)
def run(self):
    """
    Recover the status about the cluster. Obtain the last OID, the last
    TID, and the last Partition Table ID from storage nodes, then get
    back the latest partition table or make a new table from scratch,
    if this is the first time.
    A new primary master may also arise during this phase.

    The method polls until a suitable list of nodes is available, marks
    these nodes as running, closes or marks down master nodes that are not
    identified, broadcasts the changes and finally creates or updates the
    partition table.
    """
    logging.info('begin the recovery of the status')
    app = self.app
    pt = app.pt
    app.changeClusterState(ClusterStates.RECOVERING)
    pt.clear()
    # Presumably set again by handlers while polling, to request another
    # round of connections to secondary masters -- TODO confirm.
    self.try_secondary = True
    # collect the last partition table available
    poll = app.em.poll
    while 1:
        if self.try_secondary:
            # Keep trying to connect to all other known masters,
            # to make sure there is a challenge between each pair
            # of masters in the cluster. If we win, all connections
            # opened here will be closed.
            self.try_secondary = False
            node_list = []
            for node in app.nm.getMasterList():
                if not (node is app._node or node.isConnected(True)):
                    # During recovery, master nodes are not put back in
                    # DOWN state by handlers. This is done
                    # entirely in this method (here and after this poll
                    # loop), to minimize the notification packets.
                    if not node.isDown():
                        node.setDown()
                        node_list.append(node)
                    ClientConnection(app, app.election_handler, node)
            if node_list:
                app.broadcastNodesInformation(node_list)
        poll(1)
        if pt.filled():
            # A partition table exists, we are starting an existing
            # cluster.
            node_list = pt.getOperationalNodeSet()
            if app._startup_allowed:
                node_list = [
                    node for node in node_list if node.isPending()
                ]
            elif node_list:
                # we want all nodes to be there if we're going to truncate
                if app.truncate_tid:
                    node_list = pt.getNodeSet()
                if not all(node.isPending() for node in node_list):
                    continue
        elif app._startup_allowed or app.autostart:
            # No partition table and admin allowed startup, we are
            # creating a new cluster out of all pending nodes.
            node_list = app.nm.getStorageList(only_identified=True)
            if not app._startup_allowed and len(node_list) < app.autostart:
                continue
        else:
            continue
        # Proceed only with a non-empty list whose connections have nothing
        # pending.
        if node_list and not any(node.getConnection().isPending()
                                 for node in node_list):
            if pt.filled():
                if app.truncate_tid:
                    # Nodes whose recorded TID in truncate_dict is false or
                    # greater than truncate_tid are sent a Truncate packet.
                    node_list = app.nm.getIdentifiedList(
                        pool_set={
                            uuid
                            for uuid, tid in self.truncate_dict.iteritems()
                            if not tid or app.truncate_tid < tid
                        })
                    if node_list:
                        truncate = Packets.Truncate(app.truncate_tid)
                        for node in node_list:
                            conn = node.getConnection()
                            conn.send(truncate)
                            self.connectionCompleted(conn, False)
                        continue
                node_list = pt.getConnectedNodeList()
            break
    logging.info('startup allowed')
    for node in node_list:
        assert node.isPending(), node
        node.setRunning()
    # Master nodes that are neither ourselves nor identified: close their
    # connection if any, else mark them down and include them in the
    # broadcast below.
    for node in app.nm.getMasterList():
        if not (node is app._node or node.isIdentified()):
            if node.isConnected(True):
                node.getConnection().close()
                assert node.isDown(), node
            elif not node.isDown():
                assert self.try_secondary, node
                node.setDown()
                node_list.append(node)
    app.broadcastNodesInformation(node_list)
    if pt.getID() is None:
        logging.info('creating a new partition table')
        pt.make(node_list)
        self._notifyAdmins(
            Packets.SendPartitionTable(pt.getID(), pt.getRowList()))
    else:
        cell_list = pt.outdate()
        if cell_list:
            self._notifyAdmins(
                Packets.NotifyPartitionChanges(pt.setNextID(), cell_list))
        if app.backup_tid:
            pt.setBackupTidDict(self.backup_tid_dict)
            app.backup_tid = pt.getBackupTid()
    logging.debug('cluster starts this partition table:')
    pt.log()
def askClusterState(self, conn):
    """Answer on `conn` with the value of `self.app.cluster_state`."""
    current_state = self.app.cluster_state
    reply = Packets.AnswerClusterState(current_state)
    conn.answer(reply)
def askPartitionTable(self, conn):
    """Answer on `conn` with the partition table's ID and row list."""
    table = self.app.pt
    ptid = table.getID()
    row_list = table.getRowList()
    conn.answer(Packets.AnswerPartitionTable(ptid, row_list))
def askPrimary(self, conn):
    """Answer on `conn` with the UUID of `self.app.master_node`."""
    primary_uuid = self.app.master_node.getUUID()
    conn.answer(Packets.AnswerPrimary(primary_uuid))
def electPrimary(self):
    """Elect a primary master node.

    The difficulty is that a master node must accept connections from
    others while attempting to connect to other master nodes at the
    same time. Note that storage nodes and client nodes may connect
    to self as well as master nodes.

    The method loops until an election round finishes without raising
    ElectionFailure. On exit, `self.primary` is True if it was still None
    at the end of the round (nothing set it during the election), and
    False otherwise.
    """
    logging.info('begin the election of a primary master')
    client_handler = election.ClientElectionHandler(self)
    self.unconnected_master_node_set.clear()
    self.negotiating_master_node_set.clear()
    self.master_address_dict.clear()
    self.listening_conn.setHandler(election.ServerElectionHandler(self))
    getByAddress = self.nm.getByAddress
    while True:
        # handle new connected masters
        for node in self.nm.getMasterList():
            node.setUnknown()
            self.unconnected_master_node_set.add(node.getAddress())
        # start the election process
        self.primary = None
        self.primary_master_node = None
        try:
            # Poll as long as some master is still to be contacted or is
            # being negotiated with.
            while (self.unconnected_master_node_set or
                   self.negotiating_master_node_set):
                for addr in self.unconnected_master_node_set:
                    self.negotiating_master_node_set.add(addr)
                    ClientConnection(
                        self, client_handler,
                        # XXX: Ugly, but the whole election code will be
                        # replaced soon
                        getByAddress(addr))
                self.unconnected_master_node_set.clear()
                self.em.poll(1)
        except ElectionFailure, m:
            # something goes wrong, clean then restart
            logging.error('election failed: %s', m)
            # Ask all connected nodes to reelect a single primary master.
            for conn in self.em.getClientList():
                conn.notify(Packets.ReelectPrimary())
                conn.abort()
            # Wait until the connections are closed.
            self.primary = None
            self.primary_master_node = None
            # XXX: Since poll does not wake up anymore every second,
            #      the following time condition should be reviewed.
            #      See also playSecondaryRole.
            t = time() + 10
            while self.em.getClientList() and time() < t:
                try:
                    self.em.poll(1)
                except ElectionFailure:
                    pass
            # Close all connections.
            for conn in self.em.getClientList() + self.em.getServerList():
                conn.close()
        else:
            # election succeed, stop the process
            self.primary = self.primary is None
            break
# AlreadyPendingError, to avoid making lcient wait for an unneeded # response. try: self.app.queueEvent(self._askStoreObject, conn, (oid, serial, compression, checksum, data, data_serial, ttid, unlock, request_time), key=(oid, ttid), raise_on_duplicate=unlock) except AlreadyPendingError: conn.answer(Errors.AlreadyPending(dump(oid))) except NotRegisteredError: # transaction was aborted, cancel this event logging.info('Forget store of %s:%s by %s delayed by %s', dump(oid), dump(serial), dump(ttid), dump(self.app.tm.getLockingTID(oid))) # send an answer as the client side is waiting for it conn.answer(Packets.AnswerStoreObject(0, oid, serial)) else: if SLOW_STORE is not None: duration = time.time() - request_time if duration > SLOW_STORE: logging.info('StoreObject delay: %.02fs', duration) conn.answer(Packets.AnswerStoreObject(0, oid, serial)) def askStoreObject(self, conn, oid, serial, compression, checksum, data, data_serial, ttid, unlock): if 1 < compression: raise ProtocolError('invalid compression value') # register the transaction self.app.tm.register(conn.getUUID(), ttid) if data or checksum != ZERO_HASH: # TODO: return an appropriate error packet
def playPrimaryRole(self):
    """Act as the primary master until the cluster is stopped.

    Announces the primary role on every non-listening connection (and
    aborts it), installs the identification handler on listening
    connections, marks running storage nodes as temporarily down, settles
    this node's UUID, and then loops over recovery, verification and
    service (normal or backup). After each round, storage and client
    nodes are told to stop operation. The loop ends by calling
    `self.shutdown()` when a StateChangedException is caught.

    Raises RuntimeError when backup is requested but `self.backup_app` is
    None.
    """
    logging.info('play the primary role with %r', self.listening_conn)
    self.master_address_dict.clear()
    em = self.em
    packet = Packets.AnnouncePrimary()
    for conn in em.getConnectionList():
        if conn.isListening():
            conn.setHandler(identification.IdentificationHandler(self))
        else:
            conn.notify(packet)
            # Primary master should rather establish connections to all
            # secondaries, rather than the other way around. This requires
            # a bit more work when a new master joins a cluster but makes
            # it easier to resolve UUID conflicts with minimal cluster
            # impact, and ensure primary master unicity (primary masters
            # become noisy, in that they actively try to maintain
            # connections to all other master nodes, so duplicate
            # primaries will eventually get in touch with each other and
            # resolve the situation with a duel).
            # TODO: only abort client connections, don't close server
            # connections as we want to have them in the end. Secondary
            # masters will reconnect nevertheless, but it's dirty.
            # Currently, it's not trivial to preserve connected nodes,
            # because of poor node status tracking during election.
            conn.abort()
    # If I know any storage node, make sure that they are not in the
    # running state, because they are not connected at this stage.
    for node in self.nm.getStorageList():
        if node.isRunning():
            node.setTemporarilyDown()
    if self.uuid is None:
        self.uuid = self.getNewUUID(None, self.server, NodeTypes.MASTER)
        logging.info('My UUID: ' + uuid_str(self.uuid))
    else:
        # Another known node already carries our UUID: drop it from that
        # node.
        in_conflict = self.nm.getByUUID(self.uuid)
        if in_conflict is not None:
            logging.warning('UUID conflict at election exit with %r',
                in_conflict)
            in_conflict.setUUID(None)
    # Do not restart automatically if ElectionFailure is raised, in order
    # to avoid a split of the database. For example, with 2 machines with
    # a master and a storage on each one and replicas=1, the secondary
    # master becomes primary in case of network failure between the 2
    # machines but must not start automatically: otherwise, each storage
    # node would diverge.
    self._startup_allowed = False
    try:
        while True:
            self.runManager(RecoveryManager)
            try:
                self.runManager(VerificationManager)
                if not self.backup_tid:
                    self.provideService()
                    # self.provideService only returns without raising
                    # when switching to backup mode.
                if self.backup_app is None:
                    raise RuntimeError("No upstream cluster to backup"
                                       " defined in configuration")
                truncate = Packets.Truncate(
                    self.backup_app.provideService())
            except StoppedOperation, e:
                logging.critical('No longer operational')
                # The exception may carry the arguments of a Truncate
                # packet.
                truncate = Packets.Truncate(*e.args) if e.args else None
                # Automatic restart except if we truncate or retry to.
                self._startup_allowed = not (self.truncate_tid or truncate)
            # Nodes whose state changes below, to be broadcast afterwards.
            node_list = []
            for node in self.nm.getIdentifiedList():
                if node.isStorage() or node.isClient():
                    conn = node.getConnection()
                    conn.notify(Packets.StopOperation())
                    if node.isClient():
                        conn.abort()
                        continue
                    # Storage nodes only from here.
                    if truncate:
                        conn.notify(truncate)
                    if node.isRunning():
                        node.setPending()
                        node_list.append(node)
            self.broadcastNodesInformation(node_list)
    except StateChangedException, e:
        assert e.args[0] == ClusterStates.STOPPING
        self.shutdown()
def askVoteTransaction(self, conn, ttid):
    """Vote for `ttid` in the transaction manager, then acknowledge."""
    self.app.tm.vote(ttid)
    reply = Packets.AnswerVoteTransaction()
    conn.answer(reply)
def lastTransaction(self):
    """Ask the primary for the last transaction and return `self.last_tid`.

    `self.last_tid` is presumably refreshed while the answer is handled --
    verify against the answer handler.
    """
    request = Packets.AskLastTransaction()
    self._askPrimary(request)
    return self.last_tid
def notifyUpstreamAdmin(self, addr):
    """Send NotifyUpstreamAdmin(addr) to the identified admin with lowest UUID.

    Nothing is sent when there is no identified admin node.
    """
    admin_list = self.app.nm.getAdminList(only_identified=True)
    if not admin_list:
        return
    # A single recipient is enough: pick it deterministically by UUID.
    target = min(admin_list, key=lambda admin: admin.getUUID())
    target.send(Packets.NotifyUpstreamAdmin(addr))
def undo(self, undone_tid, txn, tryToResolveConflict):
    """Undo transaction `undone_tid` within transaction `txn`.

    The OIDs of the undone transaction are grouped per partition, one
    storage node per partition is asked for the undo serials, and then
    each object is stored again, either as a plain undo or with data
    produced by `tryToResolveConflict` when the undone serial is not the
    current one.

    undone_tid -- TID of the transaction to undo
    txn -- key used to look up the transaction context in
           `self._txn_container`
    tryToResolveConflict -- callable invoked as
        (oid, current_serial, undone_tid, undo_data, data); it is expected
        to raise ConflictError when resolution fails

    Returns the tuple (None, list of OIDs of the undone transaction).

    Raises UndoError when storage reports missing data or when conflict
    resolution fails.
    """
    txn_context = self._txn_container.get(txn)
    # NOTE(review): txn_ext is not used further in this method.
    txn_info, txn_ext = self._getTransactionInformation(undone_tid)
    txn_oid_list = txn_info['oids']
    # Regroup objects per partition, to ask a minimum set of storage.
    partition_oid_dict = {}
    for oid in txn_oid_list:
        partition = self.pt.getPartition(oid)
        try:
            oid_list = partition_oid_dict[partition]
        except KeyError:
            oid_list = partition_oid_dict[partition] = []
        oid_list.append(oid)
    # Ask storage the undo serial (serial at which object's previous data
    # is)
    getCellList = self.pt.getCellList
    getCellSortKey = self.cp.getCellSortKey
    getConnForCell = self.cp.getConnForCell
    queue = self._thread_container.queue
    ttid = txn_context['ttid']
    # Passed along with each request below; read back per OID after
    # waitResponses (presumably filled by the answer handler -- confirm).
    undo_object_tid_dict = {}
    snapshot_tid = p64(u64(self.last_tid) + 1)
    for partition, oid_list in partition_oid_dict.iteritems():
        cell_list = getCellList(partition, readable=True)
        # We do want to shuffle before getting one with the smallest
        # key, so that all cells with the same (smallest) key has
        # identical chance to be chosen.
        shuffle(cell_list)
        storage_conn = getConnForCell(min(cell_list, key=getCellSortKey))
        storage_conn.ask(Packets.AskObjectUndoSerial(
            ttid, snapshot_tid, undone_tid, oid_list),
            queue=queue, undo_object_tid_dict=undo_object_tid_dict)
    # Wait for all AnswerObjectUndoSerial. We might get OidNotFoundError,
    # meaning that objects in transaction's oid_list do not exist any
    # longer. This is the symptom of a pack, so forbid undoing transaction
    # when it happens.
    try:
        self.waitResponses(queue)
    except NEOStorageNotFoundError:
        self.dispatcher.forget_queue(queue)
        raise UndoError('non-undoable transaction')
    # Send undo data to all storage nodes.
    for oid in txn_oid_list:
        current_serial, undo_serial, is_current = undo_object_tid_dict[oid]
        if is_current:
            data = None
        else:
            # Serial being undone is not the latest version for this
            # object. This is an undo conflict, try to resolve it.
            try:
                # Load the latest version we are supposed to see
                data = self.load(oid, current_serial)[0]
                # Load the version we were undoing to
                undo_data = self.load(oid, undo_serial)[0]
            except NEOStorageNotFoundError:
                raise UndoError('Object not found while resolving undo '
                                'conflict')
            # Resolve conflict
            try:
                data = tryToResolveConflict(oid, current_serial,
                    undone_tid, undo_data, data)
            except ConflictError:
                raise UndoError('Some data were modified by a later ' \
                                'transaction', oid)
            # Resolved data is stored explicitly, so no undo serial is
            # passed to _store.
            undo_serial = None
        self._store(txn_context, oid, current_serial, data, undo_serial)
    return None, txn_oid_list
except NonReadableCell: logging.info('Ignore store of %s:%s by %s: unassigned partition', dump(oid), dump(serial), dump(ttid)) locked = ZERO_TID except NotRegisteredError: # transaction was aborted, cancel this event logging.info('Forget store of %s:%s by %s delayed by %s', dump(oid), dump(serial), dump(ttid), dump(self.app.tm.getLockingTID(oid))) locked = ZERO_TID else: if request_time and SLOW_STORE is not None: duration = time.time() - request_time if duration > SLOW_STORE: logging.info('StoreObject delay: %.02fs', duration) conn.answer(Packets.AnswerStoreObject(locked)) def askStoreObject(self, conn, oid, serial, compression, checksum, data, data_serial, ttid): if 1 < compression: raise ProtocolError('invalid compression value') # register the transaction self.app.tm.register(conn, ttid) if data or checksum != ZERO_HASH: # TODO: return an appropriate error packet assert makeChecksum(data) == checksum assert data_serial is None else: checksum = data = None try: self._askStoreObject(conn, oid, serial, compression, checksum,
def connectionCompleted(self, conn, new):
    """Ask the peer for its recovery information (last IDs).

    `new` is accepted but not used by this implementation.
    """
    request = Packets.AskRecovery()
    conn.ask(request)
def askTIDsFrom(self, conn, min_tid, max_tid, length, partition):
    """Answer on `conn` with the replication TID list.

    The list is obtained from `self.app.dm.getReplicationTIDList`, called
    with (min_tid, max_tid, length, partition) in that order.
    """
    tid_list = self.app.dm.getReplicationTIDList(
        min_tid, max_tid, length, partition)
    conn.answer(Packets.AnswerTIDsFrom(tid_list))
def run(self):
    """
    Recover the status about the cluster. Obtain the last OID, the last
    TID, and the last Partition Table ID from storage nodes, then get
    back the latest partition table or make a new table from scratch,
    if this is the first time.

    The method polls until a suitable list of nodes is available, marks
    these nodes as running, broadcasts the change and finally creates or
    updates the partition table.
    """
    logging.info('begin the recovery of the status')
    app = self.app
    pt = app.pt
    app.changeClusterState(ClusterStates.RECOVERING)
    pt.clear()
    # collect the last partition table available
    poll = app.em.poll
    while 1:
        poll(1)
        if pt.filled():
            # A partition table exists, we are starting an existing
            # cluster.
            node_list = pt.getOperationalNodeSet()
            if app._startup_allowed:
                node_list = [
                    node for node in node_list if node.isPending()
                ]
            elif node_list:
                # we want all nodes to be there if we're going to truncate
                if app.truncate_tid:
                    node_list = pt.getNodeSet()
                if not all(node.isPending() for node in node_list):
                    continue
        elif app._startup_allowed or app.autostart:
            # No partition table and admin allowed startup, we are
            # creating a new cluster out of all pending nodes.
            node_list = app.nm.getStorageList(only_identified=True)
            if not app._startup_allowed and len(node_list) < app.autostart:
                continue
        else:
            continue
        # Proceed only with a non-empty list whose connections have nothing
        # pending.
        if node_list and not any(node.getConnection().isPending()
                                 for node in node_list):
            if pt.filled():
                if app.truncate_tid:
                    # Nodes whose recorded TID in truncate_dict is false or
                    # greater than truncate_tid are sent a Truncate packet.
                    node_list = app.nm.getIdentifiedList(
                        pool_set={
                            uuid
                            for uuid, tid in self.truncate_dict.iteritems()
                            if not tid or app.truncate_tid < tid
                        })
                    if node_list:
                        truncate = Packets.Truncate(app.truncate_tid)
                        for node in node_list:
                            conn = node.getConnection()
                            conn.notify(truncate)
                            self.connectionCompleted(conn, False)
                        continue
                node_list = pt.getConnectedNodeList()
            break
    logging.info('startup allowed')
    for node in node_list:
        assert node.isPending(), node
        node.setRunning()
    app.broadcastNodesInformation(node_list)
    if pt.getID() is None:
        logging.info('creating a new partition table')
        pt.make(node_list)
        self._notifyAdmins(
            Packets.SendPartitionTable(pt.getID(), pt.getRowList()))
    else:
        cell_list = pt.outdate()
        if cell_list:
            self._notifyAdmins(
                Packets.NotifyPartitionChanges(pt.setNextID(), cell_list))
        if app.backup_tid:
            pt.setBackupTidDict(self.backup_tid_dict)
            app.backup_tid = pt.getBackupTid()
    logging.debug('cluster starts this partition table:')
    pt.log()
def askFinalTID(self, conn, ttid):
    """Answer on `conn` with `self.app.tm.getFinalTID(ttid)`."""
    final_tid = self.app.tm.getFinalTID(ttid)
    reply = Packets.AnswerFinalTID(final_tid)
    conn.answer(reply)
def askLastIDs(self, conn):
    """Truncate through `self.app.dm`, then answer with last OID and TID.

    `dm.truncate()` is called before the IDs are read. `dm.getLastIDs()`
    must yield exactly four items: the first is used as the last TID and
    the fourth as the last OID; the two in between are ignored.
    """
    database = self.app.dm
    database.truncate()
    last_tid, _skip_a, _skip_b, last_oid = database.getLastIDs()
    reply = Packets.AnswerLastIDs(last_oid, last_tid)
    conn.answer(reply)