Example #1
 def connectionLost(self, conn, new_state):
     uuid = conn.getUUID()
     self.backup_tid_dict.pop(uuid, None)
     self.truncate_dict.pop(uuid, None)
     node = self.app.nm.getByUUID(uuid)
     try:
         i = self.ask_pt.index(uuid)
     except ValueError:
         pass
     else:
         del self.ask_pt[i]
         if not i:
             if self.ask_pt:
                 self.app.nm.getByUUID(self.ask_pt[0]) \
                     .ask(Packets.AskPartitionTable())
             else:
                 logging.warning(
                     "Waiting for %r to come back."
                     " No other node has version %s of the partition table.",
                     node, self.target_ptid)
     if node.getState() == new_state:
         return
     node.setState(new_state)
     # broadcast to all so that admin nodes get informed
     self.app.broadcastNodesInformation([node])
Example #2
 def addPendingNodes(self, conn, uuid_list):
     uuids = ', '.join(map(uuid_str, uuid_list))
     logging.debug('Add nodes %s', uuids)
     app = self.app
     state = app.getClusterState()
     # XXX: Would it be safe to allow more states ?
     if state not in (ClusterStates.RUNNING, ClusterStates.STARTING_BACKUP,
                      ClusterStates.BACKINGUP):
         raise ProtocolError('Can not add nodes in %s state' % state)
     # take all pending nodes
     node_list = list(
         app.pt.addNodeList(
             node for node in app.nm.getStorageList()
             if node.isPending() and node.getUUID() in uuid_list))
     if node_list:
         p = Packets.StartOperation(bool(app.backup_tid))
         for node in node_list:
             node.setRunning()
             node.notify(p)
         app.broadcastNodesInformation(node_list)
         conn.answer(
             Errors.Ack('Nodes added: %s' %
                        ', '.join(uuid_str(x.getUUID())
                                  for x in node_list)))
     else:
         logging.warning('No node added')
         conn.answer(Errors.Ack('No node added'))
Example #3
 def setup(self, reset=False, dedup=False):
     self.db.setup(reset, dedup)
     zodb_state = self.getConfiguration("zodb")
     if zodb_state:
         logging.warning("Ignoring configuration file for oid mapping."
                         " Reloading it from NEO storage.")
         zodb = cPickle.loads(zodb_state)
         for k, v in self.zodb:
             zodb[k].connect(v["storage"])
     else:
         zodb = {k: ZODB(**v) for k, v in self.zodb}
         x, = (x for x in zodb.itervalues() if not x.oid)
         x.setup(zodb)
         self.setConfiguration("zodb", cPickle.dumps(zodb))
     self.zodb_index, self.zodb = zip(*sorted(
         (x.shift_oid, x) for x in zodb.itervalues()))
     self.zodb_ltid = max(x.ltid for x in self.zodb)
     zodb = self.zodb[-1]
     self.zodb_loid = zodb.shift_oid + zodb.next_oid - 1
     self.zodb_tid = self._getMaxPartition() is not None and \
         self.db.getLastTID(self.zodb_ltid) or 0
     if callable(self._import):  # XXX: why ?
         if self.zodb_tid == self.zodb_ltid:
             self._finished()
         else:
             self._import = self._import()
Example #4
    def outdate(self, lost_node=None):
        """Outdate all non-working nodes

        Do not outdate cells of 'lost_node' for partitions it was the last node
        to serve. This allows a cluster restart.
        """
        change_list = []
        fully_readable = all(cell.isReadable() for row in self.partition_list
                             for cell in row)
        for offset, row in enumerate(self.partition_list):
            lost = lost_node
            cell_list = []
            for cell in row:
                if cell.isReadable():
                    if cell.getNode().isRunning():
                        lost = None
                    else:
                        cell_list.append(cell)
            for cell in cell_list:
                if cell.getNode() is not lost:
                    cell.setState(CellStates.OUT_OF_DATE)
                    change_list.append(
                        (offset, cell.getUUID(), CellStates.OUT_OF_DATE))
        if fully_readable and change_list:
            logging.warning(self._first_outdated_message)
        return change_list
Example #5
 def pack(self, t, referencesf, gc=False):
     if gc:
         logging.warning(
             'Garbage Collection is not available in NEO,'
             ' please use an external tool. Packing without GC.')
     try:
         self.app.pack(t)
     except Exception:
         logging.exception('pack_time=%r', t)
         raise
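
For reference, here is a minimal sketch of how a storage-level pack() such as the one above is typically driven from application code. It only assumes ZODB's standard referencesf helper and a storage object exposing the usual IStorage pack() API; the function name pack_now is made up for illustration.

    import time
    from ZODB.serialize import referencesf  # ZODB's standard reference extractor

    def pack_now(storage):
        # Pack everything committed before "now". With a NEO storage,
        # passing gc=True would only log the warning shown above and
        # then pack without garbage collection.
        storage.pack(time.time(), referencesf)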
Example #6
 def checkRange(self, conn, *args):
     if self.conn_dict.get(conn, self) != conn.getPeerId():
         # Ignore answers to old requests,
         # because we did nothing to cancel them.
         logging.info("ignored AnswerCheck*Range%r", args)
         return
     self.conn_dict[conn] = args
     answer_set = set(self.conn_dict.itervalues())
     if len(answer_set) > 1:
         for answer in answer_set:
             if type(answer) is not tuple:
                 return
         # TODO: Automatically tell corrupted cells to fix their data
         #       if we know a good source.
         #       For the moment, tell master to put them in CORRUPTED state
         #       and keep up checking if useful.
         uuid = self.app.uuid
         args = None if self.source is None else self.conn_dict[
             None if self.source.getUUID() ==
             uuid else self.source.getConnection()]
         uuid_list = []
         for conn, answer in self.conn_dict.items():
             if answer != args:
                 del self.conn_dict[conn]
                 if conn is None:
                     uuid_list.append(uuid)
                 else:
                     uuid_list.append(conn.getUUID())
                     self.app.closeClient(conn)
         p = Packets.NotifyPartitionCorrupted(self.partition, uuid_list)
         self.app.master_conn.send(p)
         if len(self.conn_dict) <= 1:
             logging.warning("check of partition %u aborted",
                             self.partition)
             self.queue.clear()
             self._nextPartition()
             return
     try:
         count, _, max_tid = args
     except ValueError:  # AnswerCheckSerialRange
         count, _, self.next_tid, _, max_oid = args
         if count < CHECK_COUNT:
             logging.debug("partition %u checked from %s to %s",
                           self.partition, dump(self.min_tid),
                           dump(self.max_tid))
             self._nextPartition()
             return
         self.next_oid = add64(max_oid, 1)
     else:  # AnswerCheckTIDRange
         if count < CHECK_COUNT:
             self.next_tid = self.min_tid
             self.next_oid = ZERO_OID
         else:
             self.next_tid = add64(max_tid, 1)
     self._nextRange()
Example #7
 def checkRange(self, conn, *args):
     if self.conn_dict.get(conn, self) != conn.getPeerId():
         # Ignore answers to old requests,
         # because we did nothing to cancel them.
         logging.info("ignored AnswerCheck*Range%r", args)
         return
     self.conn_dict[conn] = args
     answer_set = set(self.conn_dict.itervalues())
     if len(answer_set) > 1:
         for answer in answer_set:
             if type(answer) is not tuple:
                 return
         # TODO: Automatically tell corrupted cells to fix their data
         #       if we know a good source.
         #       For the moment, tell master to put them in CORRUPTED state
         #       and keep up checking if useful.
         uuid = self.app.uuid
         args = None if self.source is None else self.conn_dict[
             None if self.source.getUUID() == uuid
                  else self.source.getConnection()]
         uuid_list = []
         for conn, answer in self.conn_dict.items():
             if answer != args:
                 del self.conn_dict[conn]
                 if conn is None:
                     uuid_list.append(uuid)
                 else:
                     uuid_list.append(conn.getUUID())
                     self.app.closeClient(conn)
         p = Packets.NotifyPartitionCorrupted(self.partition, uuid_list)
         self.app.master_conn.notify(p)
         if len(self.conn_dict) <= 1:
             logging.warning("check of partition %u aborted", self.partition)
             self.queue.clear()
             self._nextPartition()
             return
     try:
         count, _, max_tid = args
     except ValueError: # AnswerCheckSerialRange
         count, _, self.next_tid, _, max_oid = args
         if count < CHECK_COUNT:
             logging.debug("partition %u checked from %s to %s",
                 self.partition, dump(self.min_tid), dump(self.max_tid))
             self._nextPartition()
             return
         self.next_oid = add64(max_oid, 1)
     else: # AnswerCheckTIDRange
         if count < CHECK_COUNT:
             self.next_tid = self.min_tid
             self.next_oid = ZERO_OID
         else:
             self.next_tid = add64(max_tid, 1)
     self._nextRange()
Example #8
 def _finished(self):
     logging.warning(
         "All data are imported. You should change"
         " your configuration to use the native backend and restart.")
     self._import = None
     for x in """getObject getReplicationTIDList getReplicationObjectList
                 _fetchObject _getDataTID getLastObjectTID
              """.split():
         setattr(self, x, getattr(self.db, x))
     for zodb in self.zodb:
         zodb.close()
     self.zodb = None
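
The loop above rebinds selected methods on the instance so that, once the import is over, reads go straight to the native backend. A standalone sketch of that delegation pattern, with hypothetical class and method names (this is not NEO code):

    class NativeBackend(object):
        def getObject(self, oid):
            return 'native result for %r' % (oid,)

    class Importer(object):
        def __init__(self, db):
            self.db = db

        def getObject(self, oid):
            return 'imported result for %r' % (oid,)

        def finish(self):
            # Rebind per-instance attributes so later calls bypass
            # the importer layer entirely.
            for name in ('getObject',):
                setattr(self, name, getattr(self.db, name))

    importer = Importer(NativeBackend())
    importer.finish()
    print(importer.getObject(1))  # -> native result for 1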
Example #9
 def connectionLost(self, conn):
     try:
         del self.conn_dict[conn]
     except KeyError:
         return
     if self.source is not None and self.source.getConnection() is conn:
         del self.source
     elif len(self.conn_dict) > 1:
         logging.warning("node lost but keep up checking partition %u",
                         self.partition)
         return
     logging.warning("check of partition %u aborted", self.partition)
     self._nextPartition()
Example #10
    def _acceptIdentification(self, node, uuid, num_partitions, num_replicas,
                              your_uuid, primary, known_master_list):
        app = self.app

        # Register new master nodes.
        found = False
        conn_address = node.getAddress()
        for node_address, node_uuid in known_master_list:
            if node_address == conn_address:
                assert uuid == node_uuid, (dump(uuid), dump(node_uuid))
                found = True
            n = app.nm.getByAddress(node_address)
            if n is None:
                n = app.nm.createMaster(address=node_address)
            if node_uuid is not None and n.getUUID() != node_uuid:
                n.setUUID(node_uuid)
        assert found, (node, dump(uuid), known_master_list)

        conn = node.getConnection()
        if primary is not None:
            primary_node = app.nm.getByAddress(primary)
            if primary_node is None:
                # I don't know such a node. Probably this information
                # is old. So ignore it.
                logging.warning('Unknown primary master: %s. Ignoring.',
                                primary)
                return
            else:
                if app.trying_master_node is not primary_node:
                    app.trying_master_node = None
                    conn.close()
                app.primary_master_node = primary_node
        else:
            if app.primary_master_node is not None:
                # The primary master node is not a primary master node
                # any longer.
                app.primary_master_node = None

            app.trying_master_node = None
            conn.close()
            return

        # the master must give a UUID
        if your_uuid is None:
            raise ProtocolError('No UUID supplied')
        app.uuid = your_uuid
        logging.info('Got an UUID: %s', dump(app.uuid))

        # Always create partition table
        app.pt = PartitionTable(num_partitions, num_replicas)
Example #11
    def _acceptIdentification(self, node, uuid, num_partitions,
            num_replicas, your_uuid, primary, known_master_list):
        app = self.app

        # Register new master nodes.
        found = False
        conn_address = node.getAddress()
        for node_address, node_uuid in known_master_list:
            if node_address == conn_address:
                assert uuid == node_uuid, (dump(uuid), dump(node_uuid))
                found = True
            n = app.nm.getByAddress(node_address)
            if n is None:
                n = app.nm.createMaster(address=node_address)
            if node_uuid is not None and n.getUUID() != node_uuid:
                n.setUUID(node_uuid)
        assert found, (node, dump(uuid), known_master_list)

        conn = node.getConnection()
        if primary is not None:
            primary_node = app.nm.getByAddress(primary)
            if primary_node is None:
                # I don't know such a node. Probably this information
                # is old. So ignore it.
                logging.warning('Unknown primary master: %s. Ignoring.',
                                primary)
                return
            else:
                if app.trying_master_node is not primary_node:
                    app.trying_master_node = None
                    conn.close()
                app.primary_master_node = primary_node
        else:
            if app.primary_master_node is not None:
                # The primary master node is not a primary master node
                # any longer.
                app.primary_master_node = None

            app.trying_master_node = None
            conn.close()
            return

        # the master must give a UUID
        if your_uuid is None:
            raise ProtocolError('No UUID supplied')
        app.uuid = your_uuid
        logging.info('Got an UUID: %s', dump(app.uuid))

        # Always create partition table
        app.pt = PartitionTable(num_partitions, num_replicas)
Example #12
def retry_if_locked(f, *args):
    try:
        return f(*args)
    except sqlite3.OperationalError as e:
        x = e.args[0]
        if x != 'database is locked':
            raise
        msg = traceback.format_exception_only(type(e), e)
        msg += traceback.format_stack()
        logging.warning(''.join(msg))
        while 1:
            try:
                return f(*args)
            except sqlite3.OperationalError as e:
                if e.args[0] != x:
                    raise
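
A minimal usage sketch for the helper above, assuming a plain sqlite3 connection and that the imports retry_if_locked itself relies on (traceback and the project's logging) are in place; the table and statements are made up for illustration:

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE kv (k TEXT PRIMARY KEY, v TEXT)')
    # Retries the call for as long as another writer holds the database lock.
    retry_if_locked(conn.execute, "INSERT INTO kv VALUES ('a', '1')")
    retry_if_locked(conn.commit)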
Example #13
 def checkReplicas(self, conn, partition_dict, min_tid, max_tid):
     app = self.app
     pt = app.pt
     backingup = bool(app.backup_tid)
     if not max_tid:
         max_tid = pt.getCheckTid(partition_dict) if backingup else \
             app.getLastTransaction()
     if min_tid > max_tid:
         logging.warning("nothing to check: min_tid=%s > max_tid=%s",
                         dump(min_tid), dump(max_tid))
     else:
         getByUUID = app.nm.getByUUID
         node_set = set()
         for offset, source in partition_dict.iteritems():
             # XXX: For the moment, code checking replicas is unable to fix
             #      corrupted partitions (when a good cell is known)
             #      so only check readable ones.
             #      (see also Checker._nextPartition of storage)
             cell_list = pt.getCellList(offset, True)
             #cell_list = [cell for cell in pt.getCellList(offset)
             #                  if not cell.isOutOfDate()]
             if len(cell_list) + (backingup and not source) <= 1:
                 continue
             for cell in cell_list:
                 node = cell.getNode()
                 if node in node_set:
                     break
             else:
                 node_set.add(node)
             if source:
                 source = '', getByUUID(source).getAddress()
             else:
                 readable = [
                     cell for cell in cell_list if cell.isReadable()
                 ]
                 if 1 == len(readable) < len(cell_list):
                     source = '', readable[0].getAddress()
                 elif backingup:
                     source = app.backup_app.name, random.choice(
                         app.backup_app.pt.getCellList(
                             offset, readable=True)).getAddress()
                 else:
                     source = '', None
             node.getConnection().notify(
                 Packets.CheckPartition(offset, source, min_tid, max_tid))
     conn.answer(Errors.Ack(''))
Example #14
 def checkReplicas(self, conn, partition_dict, min_tid, max_tid):
     app = self.app
     pt = app.pt
     backingup = bool(app.backup_tid)
     if not max_tid:
         max_tid = pt.getCheckTid(partition_dict) if backingup else app.getLastTransaction()
     if min_tid > max_tid:
         logging.warning("nothing to check: min_tid=%s > max_tid=%s", dump(min_tid), dump(max_tid))
     else:
         getByUUID = app.nm.getByUUID
         node_set = set()
         for offset, source in partition_dict.iteritems():
             # XXX: For the moment, code checking replicas is unable to fix
             #      corrupted partitions (when a good cell is known)
             #      so only check readable ones.
             #      (see also Checker._nextPartition of storage)
             cell_list = pt.getCellList(offset, True)
             # cell_list = [cell for cell in pt.getCellList(offset)
             #                  if not cell.isOutOfDate()]
             if len(cell_list) + (backingup and not source) <= 1:
                 continue
             for cell in cell_list:
                 node = cell.getNode()
                 if node in node_set:
                     break
             else:
                 node_set.add(node)
             if source:
                 source = "", getByUUID(source).getAddress()
             else:
                 readable = [cell for cell in cell_list if cell.isReadable()]
                 if 1 == len(readable) < len(cell_list):
                     source = "", readable[0].getAddress()
                 elif backingup:
                     source = (
                         app.backup_app.name,
                         random.choice(app.backup_app.pt.getCellList(offset, readable=True)).getAddress(),
                     )
                 else:
                     source = "", None
             node.getConnection().notify(Packets.CheckPartition(offset, source, min_tid, max_tid))
     conn.answer(Errors.Ack(""))
Example #15
 def abort(self, message=''):
     offset = self.current_partition
     if offset is None:
         return
     del self.current_partition
     logging.warning('replication aborted for partition %u%s', offset,
                     message and ' (%s)' % message)
     if offset in self.partition_dict:
         # XXX: Try another partition if possible, to increase the probability
         #      of connecting to another node. It would be better to explicitly
         #      search for another node instead.
         tid = self.replicate_dict.pop(offset, None) or self.replicate_tid
         if self.replicate_dict:
             self._nextPartition()
             self.replicate_dict[offset] = tid
         else:
             self.replicate_dict[offset] = tid
             self._nextPartition()
     else:  # partition removed
         self._nextPartition()
Example #16
 def abort(self, message=''):
     offset = self.current_partition
     if offset is None:
         return
     del self.current_partition
     logging.warning('replication aborted for partition %u%s',
                     offset, message and ' (%s)' % message)
     if offset in self.partition_dict:
         # XXX: Try another partition if possible, to increase the probability
         #      of connecting to another node. It would be better to explicitly
         #      search for another node instead.
         tid = self.replicate_dict.pop(offset, None) or self.replicate_tid
         if self.replicate_dict:
             self._nextPartition()
             self.replicate_dict[offset] = tid
         else:
             self.replicate_dict[offset] = tid
             self._nextPartition()
     else: # partition removed
         self._nextPartition()
Example #17
 def _setup(self):
     self.db._setup()
     zodb_state = self.getConfiguration("zodb")
     if zodb_state:
         logging.warning("Ignoring configuration file for oid mapping."
                         " Reloading it from NEO storage.")
         zodb = cPickle.loads(zodb_state)
         for k, v in self.zodb:
             zodb[k].connect(v["storage"])
     else:
         zodb = {k: ZODB(**v) for k, v in self.zodb}
         x, = (x for x in zodb.itervalues() if not x.oid)
         x.setup(zodb)
         self.setConfiguration("zodb", cPickle.dumps(zodb))
     self.zodb_index, self.zodb = zip(*sorted(
         (x.shift_oid, x) for x in zodb.itervalues()))
     self.zodb_ltid = max(x.ltid for x in self.zodb)
     zodb = self.zodb[-1]
     self.zodb_loid = zodb.shift_oid + zodb.next_oid - 1
     self.zodb_tid = self.db.getLastTID(self.zodb_ltid) or 0
     self._import = self._import()
Example #18
 def addPendingNodes(self, conn, uuid_list):
     uuids = ', '.join(map(uuid_str, uuid_list))
     logging.debug('Add nodes %s', uuids)
     app = self.app
     # take all pending nodes
     node_list = list(
         app.pt.addNodeList(
             node for node in app.nm.getStorageList()
             if node.isPending() and node.getUUID() in uuid_list))
     if node_list:
         for node in node_list:
             node.setRunning()
             app.startStorage(node)
         app.broadcastNodesInformation(node_list)
         conn.answer(
             Errors.Ack('Nodes added: %s' %
                        ', '.join(uuid_str(x.getUUID())
                                  for x in node_list)))
     else:
         logging.warning('No node added')
         conn.answer(Errors.Ack('No node added'))
Example #19
 def setup(self, reset=0):
     self.db.setup(reset)
     zodb_state = self.getConfiguration("zodb")
     if zodb_state:
         logging.warning("Ignoring configuration file for oid mapping."
                         " Reloading it from NEO storage.")
         zodb = cPickle.loads(zodb_state)
         for k, v in self.zodb:
             zodb[k].connect(v["storage"])
     else:
         zodb = {k: ZODB(**v) for k, v in self.zodb}
         x, = (x for x in zodb.itervalues() if not x.oid)
         x.setup(zodb)
         self.setConfiguration("zodb", cPickle.dumps(zodb))
     self.zodb_index, self.zodb = zip(*sorted(
         (x.shift_oid, x) for x in zodb.itervalues()))
     self.zodb_ltid = max(x.ltid for x in self.zodb)
     zodb = self.zodb[-1]
     self.zodb_loid = zodb.shift_oid + zodb.next_oid - 1
     self.zodb_tid = self.db.getLastTID(self.zodb_ltid) or 0
     self._import = self._import()
Example #20
    def _acceptIdentification(self, node, peer_uuid, num_partitions,
                              num_replicas, your_uuid, primary,
                              known_master_list):
        app = self.app

        # Register new master nodes.
        for address, uuid in known_master_list:
            if app.server == address:
                # This is self.
                assert node.getAddress() != primary or uuid == your_uuid, (
                    uuid_str(uuid), uuid_str(your_uuid))
                continue
            n = app.nm.getByAddress(address)
            if n is None:
                n = app.nm.createMaster(address=address)

        if primary is not None:
            # The primary master is defined.
            if app.primary_master_node is not None \
                    and app.primary_master_node.getAddress() != primary:
                # There are multiple primary master nodes. This is
                # dangerous.
                raise ElectionFailure, 'multiple primary master nodes'
            primary_node = app.nm.getByAddress(primary)
            if primary_node is None:
                # I don't know such a node. Probably this information
                # is old. So ignore it.
                logging.warning('received an unknown primary node')
            else:
                # Whatever the situation is, I trust this master.
                app.primary = False
                app.primary_master_node = primary_node
                # Stop waiting for connections other than the primary master's
                # to complete, so that we exit the election phase ASAP.
                app.negotiating_master_node_set.clear()
                return

        self.elect(None, node.getAddress())
Example #21
    def _acceptIdentification(self, node, peer_uuid, num_partitions,
            num_replicas, your_uuid, primary, known_master_list):
        app = self.app

        # Register new master nodes.
        for address, uuid in known_master_list:
            if app.server == address:
                # This is self.
                assert node.getAddress() != primary or uuid == your_uuid, (
                    uuid_str(uuid), uuid_str(your_uuid))
                continue
            n = app.nm.getByAddress(address)
            if n is None:
                n = app.nm.createMaster(address=address)

        if primary is not None:
            # The primary master is defined.
            if app.primary_master_node is not None \
                    and app.primary_master_node.getAddress() != primary:
                # There are multiple primary master nodes. This is
                # dangerous.
                raise ElectionFailure, 'multiple primary master nodes'
            primary_node = app.nm.getByAddress(primary)
            if primary_node is None:
                # I don't know such a node. Probably this information
                # is old. So ignore it.
                logging.warning('received an unknown primary node')
            else:
                # Whatever the situation is, I trust this master.
                app.primary = False
                app.primary_master_node = primary_node
                # Stop waiting for connections other than the primary master's
                # to complete, so that we exit the election phase ASAP.
                app.negotiating_master_node_set.clear()
                return

        self.elect(None, node.getAddress())
Example #22
 def addPendingNodes(self, conn, uuid_list):
     uuids = ", ".join(map(uuid_str, uuid_list))
     logging.debug("Add nodes %s", uuids)
     app = self.app
     state = app.getClusterState()
     # XXX: Would it be safe to allow more states ?
     if state not in (ClusterStates.RUNNING, ClusterStates.STARTING_BACKUP, ClusterStates.BACKINGUP):
         raise ProtocolError("Can not add nodes in %s state" % state)
     # take all pending nodes
     node_list = list(
         app.pt.addNodeList(
             node for node in app.nm.getStorageList() if node.isPending() and node.getUUID() in uuid_list
         )
     )
     if node_list:
         p = Packets.StartOperation(bool(app.backup_tid))
         for node in node_list:
             node.setRunning()
             node.notify(p)
         app.broadcastNodesInformation(node_list)
         conn.answer(Errors.Ack("Nodes added: %s" % ", ".join(uuid_str(x.getUUID()) for x in node_list)))
     else:
         logging.warning("No node added")
         conn.answer(Errors.Ack("No node added"))
Example #23
 def connectionLost(self, conn, new_state):
     uuid = conn.getUUID()
     self.backup_tid_dict.pop(uuid, None)
     self.truncate_dict.pop(uuid, None)
     node = self.app.nm.getByUUID(uuid)
     try:
         i = self.ask_pt.index(uuid)
     except ValueError:
         pass
     else:
         del self.ask_pt[i]
         if not i:
             if self.ask_pt:
                 self.app.nm.getByUUID(self.ask_pt[0]) \
                     .ask(Packets.AskPartitionTable())
             else:
                 logging.warning("Waiting for %r to come back."
                     " No other node has version %s of the partition table.",
                     node, self.target_ptid)
     if node.getState() == new_state:
         return
     node.setState(new_state)
     # broadcast to all so that admin nodes get informed
     self.app.broadcastNodesInformation([node])
Example #24
 def provideService(self):
     logging.info('provide backup')
     poll = self.em.poll
     app = self.app
     pt = app.pt
     while True:
         app.changeClusterState(ClusterStates.STARTING_BACKUP)
         bootstrap = BootstrapManager(self,
                                      NodeTypes.CLIENT,
                                      backup=app.name)
         # {offset -> node}
         self.primary_partition_dict = {}
         # [[tid]]
         self.tid_list = tuple([] for _ in xrange(pt.getPartitions()))
         try:
             while True:
                 for node in pt.getNodeSet(readable=True):
                     if not app.isStorageReady(node.getUUID()):
                         break
                 else:
                     break
                 poll(1)
             node, conn = bootstrap.getPrimaryConnection()
             try:
                 app.changeClusterState(ClusterStates.BACKINGUP)
                 del bootstrap, node
                 self.ignore_invalidations = True
                 conn.setHandler(BackupHandler(self))
                 conn.ask(Packets.AskLastTransaction())
                 # debug variable to log how big 'tid_list' can be.
                 self.debug_tid_count = 0
                 while True:
                     poll(1)
             except PrimaryFailure, msg:
                 logging.error('upstream master is down: %s', msg)
             finally:
                 app.backup_tid = pt.getBackupTid()
                 try:
                     conn.close()
                 except PrimaryFailure:
                     pass
                 try:
                     del self.pt
                 except AttributeError:
                     pass
                 for node in app.nm.getClientList(True):
                     node.getConnection().close()
         except StateChangedException, e:
             if e.args[0] != ClusterStates.STOPPING_BACKUP:
                 raise
             app.changeClusterState(*e.args)
             tid = app.backup_tid
             # Wait for non-primary partitions to catch up,
             # so that all UP_TO_DATE cells are really UP_TO_DATE.
             # XXX: Another possibility could be to outdate such cells, and
             #      they would be quickly updated at the beginning of the
             #      RUNNING phase. This may simplify code.
             # Any unfinished replication from upstream will be truncated.
             while pt.getBackupTid(min) < tid:
                 poll(1)
             last_tid = app.getLastTransaction()
             handler = EventHandler(app)
             if tid < last_tid:
                 assert tid != ZERO_TID
                 logging.warning("Truncating at %s (last_tid was %s)",
                                 dump(app.backup_tid), dump(last_tid))
             else:
                 # We will do a dummy truncation, just to leave backup mode,
                 # so it's fine to start automatically if there's any
                 # missing storage.
                 # XXX: Consider using another method to leave backup mode,
                 #      at least when there's nothing to truncate. Because
                 #      in case of StoppedOperation during VERIFYING state,
                 #      this flag will be wrongly set to False.
                 app._startup_allowed = True
             # If any error happened before reaching this line, we'd go back
             # to backup mode, which is the right mode to recover.
             del app.backup_tid
             # Now back to RECOVERY...
             return tid
Example #25
 def pack(self, t, referencesf, gc=False):
     if gc:
         logging.warning(
             'Garbage Collection is not available in NEO,'
             ' please use an external tool. Packing without GC.')
     self.app.pack(t)
Example #26
    def tweak(self, drop_list=()):
        """Optimize partition table

        This reassigns cells in 4 ways:
        - Discard cells of nodes listed in 'drop_list'. For partitions with too
          few readable cells, some cells are instead marked as FEEDING. This is
          a preliminary step to drop these nodes, otherwise the partition table
          could become non-operational.
          In fact, the code touching these cells is disabled (see NOTE below).
        - Other nodes must have the same number of non-feeding cells, off by 1.
        - When a transaction creates new objects (oids are roughly allocated
          sequentially), we expect better performance by maximizing the number
          of involved nodes (i.e. parallelizing writes).
        - For maximum resiliency, cells of each partition are assigned as far
          as possible from each other, by checking the topology path of nodes.

        Examples of optimal partition tables with np=10, nr=1 and 5 nodes:

          UU...  ..UU.
          ..UU.  U...U
          U...U  .UU..
          .UU..  ...UU
          ...UU  UU...
          UU...  ..UU.
          ..UU.  U...U
          U...U  .UU..
          .UU..  ...UU
          ...UU  UU...

        The above 2 PTs differ only by a permutation of nodes, and this method
        exploits this to minimize the resulting amount of replication.
        For performance reasons, this algorithm uses a heuristic.

        When (np * nr) is not a multiple of the number of nodes, some nodes
        have 1 extra cell compared to the others. In such a case, other
        optimal PTs could be obtained by rotating the partitions. There are
        actually np times more of them, but it is not worth considering them
        since they don't differ enough (if np is big enough) and we don't do
        an exhaustive search anyway.
        Example with np=3, nr=1 and 2 nodes:

          U.  .U  U.
          .U  U.  U.
          U.  U.  .U

        For the topology, let's consider an example with paths of the form
        (room, machine, disk):
        - if there are more rooms than the number of replicas, 2 cells of the
          same partition must not be assigned in the same room;
        - otherwise, topology paths are checked at a deeper depth,
          e.g. not on the same machine and distributed evenly
               (off by 1) among rooms.
        But the topology is expected to be optimal, otherwise it is ignored.
        In some cases, we could fall back to a non-optimal topology but
        that would cause extra replication if the user wants to fix it.
        """
        # Collect some data in a usable form for the rest of the method.
        node_list = {
            node: {}
            for node in self.count_dict if node not in drop_list
        }
        if not node_list:
            raise neo.lib.pt.PartitionTableException("Can't remove all nodes.")
        drop_list = defaultdict(list)
        for offset, row in enumerate(self.partition_list):
            for cell in row:
                cell_dict = node_list.get(cell.getNode())
                if cell_dict is None:
                    drop_list[offset].append(cell)
                else:
                    cell_dict[offset] = cell
        # The sort by node id is cosmetic, to prefer result like the first one
        # in __doc__.
        node_list = sorted(node_list.iteritems(), key=lambda x: x[0].getUUID())

        # Generate an optimal PT.
        node_count = len(node_list)
        repeats = min(self.nr + 1, node_count)
        x = [[] for _ in xrange(node_count)]
        i = 0
        for offset in xrange(self.np):
            for _ in xrange(repeats):
                x[i % node_count].append(offset)
                i += 1
        option_dict = Counter(map(tuple, x))

        # Initialize variables/functions to optimize the topology.
        devpath_max = []
        devpaths = [()] * node_count
        if repeats > 1:
            _devpaths = [x[0].devpath for x in node_list]
            max_depth = min(map(len, _devpaths))
            depth = 0
            while 1:
                if depth < max_depth:
                    depth += 1
                    x = Counter(x[:depth] for x in _devpaths)
                    n = len(x)
                    x = set(x.itervalues())
                    # TODO: Prove it works. If the code turns out to be:
                    #       - too pessimistic, the topology is ignored when
                    #         resiliency could be maximized;
                    #       - or worse too optimistic, in which case this
                    #         method raises, possibly after a very long time.
                    if len(x) == 1 or max(x) * repeats <= node_count:
                        i, x = divmod(repeats, n)
                        devpath_max.append((i + 1, x) if x else (i, n))
                        if n < repeats:
                            continue
                        devpaths = [x[:depth] for x in _devpaths]
                        break
                logging.warning(
                    "Can't maximize resiliency: fix the topology"
                    " of your storage nodes and make sure they're all running."
                    " %s storage device failure(s) may be enough to lose all"
                    " the database." % (repeats - 1))
                break
        topology = [{} for _ in xrange(self.np)]

        def update_topology():
            for offset in option:
                n = topology[offset]
                for i, (j, k) in zip(devpath, devpath_max):
                    try:
                        i, x = n[i]
                    except KeyError:
                        n[i] = i, x = [0, {}]
                    if i == j or i + 1 == j and k == sum(
                            1 for i in n.itervalues() if i[0] == j):
                        # Too many cells would be assigned at this topology
                        # node.
                        return False
                    n = x
            # The topology may be optimal with this option. Apply it.
            for offset in option:
                n = topology[offset]
                for i in devpath:
                    n = n[i]
                    n[0] += 1
                    n = n[1]
            return True

        def revert_topology():
            for offset in option:
                n = topology[offset]
                for i in devpath:
                    n = n[i]
                    n[0] -= 1
                    n = n[1]

        # Strategies to find the "best" permutation of nodes.
        def node_options():
            # The second part of the key goes with the above cosmetic sort.
            option_list = sorted(option_dict, key=lambda x: (-len(x), x))
            # 1. Search for solution that does not cause extra replication.
            #    This is important because tweak() must do nothing if it's
            #    called a second time when the list of nodes hasn't changed.
            result = []
            for i, (_, cell_dict) in enumerate(node_list):
                option = {
                    offset
                    for offset, cell in cell_dict.iteritems()
                    if not cell.isFeeding()
                }
                x = filter(option.issubset, option_list)
                if not x:
                    break
                result.append((i, x))
            else:
                yield result
            # 2. We have to move cells. Evaluating all options would have
            #    a complexity of O(node_count!), which is clearly too slow,
            #    so we use a heuristic.
            #    For each node, we compare the resulting amount of replication
            #    in the best (min_cost) and worst (max_cost) case, and we first
            #    iterate over nodes with the biggest difference. This minimizes
            #    the impact of bad allocation patterns for the last nodes.
            result = []
            np_complement = frozenset(xrange(self.np)).difference
            for i, (_, cell_dict) in enumerate(node_list):
                cost_list = []
                for x, option in enumerate(option_list):
                    discard = [0, 0]
                    for offset in np_complement(option):
                        cell = cell_dict.get(offset)
                        if cell:
                            discard[cell.isReadable()] += 1
                    cost_list.append(((discard[1], discard[0]), x))
                cost_list.sort()
                min_cost = cost_list[0][0]
                max_cost = cost_list[-1][0]
                result.append(
                    (min_cost[0] - max_cost[0], min_cost[1] - max_cost[1], i,
                     [option_list[x[1]] for x in cost_list]))
            result.sort()
            yield result

        # The main loop, which is where we evaluate options.
        new = []  # the solution
        stack = []  # data recursion

        def options():
            x = node_options[len(new)]
            return devpaths[x[-2]], iter(x[-1])

        for node_options in node_options():  # for each strategy
            devpath, iter_option = options()
            while 1:
                try:
                    option = next(iter_option)
                except StopIteration:
                    if new:
                        devpath, iter_option = stack.pop()
                        option = new.pop()
                        revert_topology()
                        option_dict[option] += 1
                        continue
                    break
                if option_dict[option] and update_topology():
                    new.append(option)
                    if len(new) == node_count:
                        break
                    stack.append((devpath, iter_option))
                    devpath, iter_option = options()
                    option_dict[option] -= 1
            if new:
                break
        else:
            raise AssertionError

        # Apply the solution.

        if self._id is None:
            self._id = 1
            self.num_filled_rows = self.np
            new_state = CellStates.UP_TO_DATE
        else:
            new_state = CellStates.OUT_OF_DATE

        changed_list = []
        outdated_list = [repeats] * self.np
        discard_list = defaultdict(list)
        for i, offset_list in enumerate(new):
            node, cell_dict = node_list[node_options[i][-2]]
            for offset in offset_list:
                cell = cell_dict.pop(offset, None)
                if cell is None:
                    self.count_dict[node] += 1
                    self.partition_list[offset].append(Cell(node, new_state))
                    changed_list.append((offset, node.getUUID(), new_state))
                elif cell.isReadable():
                    if cell.isFeeding():
                        cell.setState(CellStates.UP_TO_DATE)
                        changed_list.append(
                            (offset, node.getUUID(), CellStates.UP_TO_DATE))
                    outdated_list[offset] -= 1
            for offset, cell in cell_dict.iteritems():
                discard_list[offset].append(cell)
        # NOTE: The following line disables the next 2 lines, which actually
        #       causes cells in drop_list to be discarded, now or later;
        #       drop_list could be renamed into ignore_list.
        #       1. Deleting data partition by partition is a lot of work, so
        #          why ask nodes in drop_list to do that when the goal is
        #          simply to trash the whole underlying database?
        #       2. By excluding nodes from a tweak, it becomes possible to have
        #          parts of the partition table that are tweaked differently.
        #          This may require temporarily changing the number of
        #          replicas for the part being tweaked. In the future, this
        #          number may be specified in the 'tweak' command, to avoid
        #          race conditions with setUpToDate().
        #       Overall, a common use case is when importing a ZODB to NEO,
        #       to keep the initial importing node up until the database is
        #       split and replicated to the final nodes.
        drop_list = {}
        for offset, drop_list in drop_list.iteritems():
            discard_list[offset] += drop_list
        # We have sorted cells to discard in order to first deallocate nodes
        # in drop_list, and have feeding cells in other nodes.
        # The following loop also makes sure not to discard cells too quickly,
        # by keeping a minimum of 'repeats' readable cells.
        for offset, outdated in enumerate(outdated_list):
            row = self.partition_list[offset]
            for cell in discard_list[offset]:
                if outdated and cell.isReadable():
                    outdated -= 1
                    if cell.isFeeding():
                        continue
                    state = CellStates.FEEDING
                    cell.setState(state)
                else:
                    self.count_dict[cell.getNode()] -= 1
                    state = CellStates.DISCARDED
                    row.remove(cell)
                changed_list.append((offset, cell.getUUID(), state))

        assert self.operational(), changed_list
        return changed_list
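
As a standalone illustration (not NEO code), the "optimal PT" layout shown in the docstring can be reproduced in a few lines: each partition receives min(nr + 1, node_count) cells, assigned round-robin across nodes, exactly like the generation step above.

    np_, nr, node_count = 10, 1, 5   # values from the docstring example
    repeats = min(nr + 1, node_count)
    i = 0
    for offset in range(np_):
        row = ['.'] * node_count
        for _ in range(repeats):
            row[i % node_count] = 'U'
            i += 1
        print(''.join(row))
    # Prints the first table of the docstring:
    # UU... / ..UU. / U...U / .UU.. / ...UU / UU... / ...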
Example #27
 def _handleConflicts(self, txn_context, tryToResolveConflict):
     result = []
     append = result.append
     # Check for conflicts
     data_dict = txn_context['data_dict']
     object_base_serial_dict = txn_context['object_base_serial_dict']
     object_serial_dict = txn_context['object_serial_dict']
     conflict_serial_dict = txn_context['conflict_serial_dict'].copy()
     txn_context['conflict_serial_dict'].clear()
     resolved_conflict_serial_dict = txn_context[
         'resolved_conflict_serial_dict']
     for oid, conflict_serial_set in conflict_serial_dict.iteritems():
         conflict_serial = max(conflict_serial_set)
         serial = object_serial_dict[oid]
         if ZERO_TID in conflict_serial_set:
           if 1:
             # XXX: disable deadlock avoidance code until it is fixed
             logging.info('Deadlock avoidance on %r:%r',
                 dump(oid), dump(serial))
             # 'data' parameter of ConflictError is only used to report the
             # class of the object. It doesn't matter if 'data' is None
             # because the transaction is too big.
             try:
                 data = data_dict[oid]
             except KeyError:
                 data = txn_context['cache_dict'][oid]
           else:
             # Storage refused us from taking object lock, to avoid a
             # possible deadlock. TID is actually used for some kind of
             # "locking priority": when a higher value has the lock,
             # this means we stored objects "too late", and we would
             # otherwise cause a deadlock.
             # To recover, we must ask storages to release locks we
             # hold (to let possibly-competing transactions acquire
             # them), and requeue our already-sent store requests.
             # XXX: currently, brute-force is implemented: we send
             # object data again.
             # WARNING: not maintained code
             logging.info('Deadlock avoidance triggered on %r:%r',
                 dump(oid), dump(serial))
             for store_oid, store_data in data_dict.iteritems():
                 store_serial = object_serial_dict[store_oid]
                 if store_data is CHECKED_SERIAL:
                     self._checkCurrentSerialInTransaction(txn_context,
                         store_oid, store_serial)
                 else:
                     if store_data is None:
                         # Some undo
                         logging.warning('Deadlock avoidance cannot reliably'
                             ' work with undo, this must be implemented.')
                         conflict_serial = ZERO_TID
                         break
                     self._store(txn_context, store_oid, store_serial,
                         store_data, unlock=True)
             else:
                 continue
         else:
             data = data_dict.pop(oid)
             if data is CHECKED_SERIAL:
                 raise ReadConflictError(oid=oid, serials=(conflict_serial,
                     serial))
             # TODO: data can be None if a conflict happens during undo
             if data:
                 txn_context['data_size'] -= len(data)
             resolved_serial_set = resolved_conflict_serial_dict.setdefault(
                 oid, set())
             if resolved_serial_set and conflict_serial <= max(
                     resolved_serial_set):
                 # A later serial has already been resolved, skip.
                 resolved_serial_set.update(conflict_serial_set)
                 continue
             try:
                 new_data = tryToResolveConflict(oid, conflict_serial,
                     serial, data)
             except ConflictError:
                 logging.info('Conflict resolution failed for '
                     '%r:%r with %r', dump(oid), dump(serial),
                     dump(conflict_serial))
             else:
                 logging.info('Conflict resolution succeeded for '
                     '%r:%r with %r', dump(oid), dump(serial),
                     dump(conflict_serial))
                 # Mark this conflict as resolved
                 resolved_serial_set.update(conflict_serial_set)
                 # Base serial changes too, as we resolved a conflict
                 object_base_serial_dict[oid] = conflict_serial
                 # Try to store again
                 self._store(txn_context, oid, conflict_serial, new_data)
                 append(oid)
                 continue
         # With recent ZODB, get_pickle_metadata (from ZODB.utils) does
         # not support empty values, so do not pass 'data' in this case.
         raise ConflictError(oid=oid, serials=(conflict_serial,
             serial), data=data or None)
     return result
Example #28
    def _import(self):
        p64 = util.p64
        u64 = util.u64
        tid = p64(self.zodb_tid + 1)
        zodb_list = []
        for zodb in self.zodb:
            try:
                zodb_list.append(ZODBIterator(zodb, tid, p64(self.zodb_ltid)))
            except StopIteration:
                pass
        tid = None

        def finish():
            if tid:
                self.storeTransaction(
                    tid, object_list,
                    ((x[0] for x in object_list), str(txn.user),
                     str(txn.description), cPickle.dumps(
                         txn.extension), False, tid), False)
                self.releaseData(data_id_list)
                logging.debug(
                    "TXN %s imported (user=%r, desc=%r, len(oid)=%s)",
                    util.dump(tid), txn.user, txn.description,
                    len(object_list))
                del object_list[:], data_id_list[:]
                if self._last_commit + 1 < time.time():
                    self.commit()
                self.zodb_tid = u64(tid)

        if self.compress:
            from zlib import compress
        else:
            compress = None
            compression = 0
        object_list = []
        data_id_list = []
        while zodb_list:
            zodb_list.sort()
            z = zodb_list[0]
            # Merge transactions with same tid. Only
            # user/desc/ext from first ZODB are kept.
            if tid != z.tid:
                finish()
                txn = z.transaction
                tid = txn.tid
                yield 1
            zodb = z.zodb
            for r in z.transaction:
                oid = p64(u64(r.oid) + zodb.shift_oid)
                data_tid = r.data_txn
                if data_tid or r.data is None:
                    data_id = None
                else:
                    data = zodb.repickle(r.data)
                    if compress:
                        compressed_data = compress(data)
                        compression = len(compressed_data) < len(data)
                        if compression:
                            data = compressed_data
                    checksum = util.makeChecksum(data)
                    data_id = self.holdData(checksum, data, compression)
                    data_id_list.append(data_id)
                object_list.append((oid, data_id, data_tid))
                # Give the main loop the opportunity to process requests
                # from other nodes. In particular, clients may commit. If the
                # storage node exits after such commit, and before we actually
                # update 'obj' with 'object_list', some rows in 'data' may be
                # unreferenced. This is not a problem because the leak is
                # solved when resuming the migration.
                yield 1
            try:
                z.next()
            except StopIteration:
                del zodb_list[0]
        self._last_commit = 0
        finish()
        logging.warning(
            "All data are imported. You should change"
            " your configuration to use the native backend and restart.")
        self._import = None
        for x in """getObject getReplicationTIDList
                 """.split():
            setattr(self, x, getattr(self.db, x))
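
Note that _import above is written as a generator: each "yield 1" hands control back to the caller so other requests can be processed between steps (the other examples assign self._import = self._import() for that reason). A generic sketch of driving such a generator incrementally; the loop shown is hypothetical, not NEO's actual event loop:

    def import_steps():
        for txn in range(3):        # stand-in for iterating ZODB transactions
            # ... import one transaction here ...
            yield 1

    importer = import_steps()
    for _ in importer:
        pass  # a real main loop would poll the network between steps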
Example #29
    def playPrimaryRole(self):
        logging.info('play the primary role with %r', self.listening_conn)
        self.master_address_dict.clear()
        em = self.em
        packet = Packets.AnnouncePrimary()
        for conn in em.getConnectionList():
            if conn.isListening():
                conn.setHandler(identification.IdentificationHandler(self))
            else:
                conn.notify(packet)
                # The primary master should establish connections to all
                # secondaries, rather than the other way around. This requires
                # a bit more work when a new master joins a cluster but makes
                # it easier to resolve UUID conflicts with minimal cluster
                # impact, and ensure primary master unicity (primary masters
                # become noisy, in that they actively try to maintain
                # connections to all other master nodes, so duplicate
                # primaries will eventually get in touch with each other and
                # resolve the situation with a duel).
                # TODO: only abort client connections, don't close server
                # connections as we want to have them in the end. Secondary
                # masters will reconnect nevertheless, but it's dirty.
                # Currently, it's not trivial to preserve connected nodes,
                # because of poor node status tracking during election.
                conn.abort()

        # If I know any storage nodes, make sure they are not in the
        # running state, because they are not connected at this stage.
        for node in self.nm.getStorageList():
            if node.isRunning():
                node.setTemporarilyDown()

        if self.uuid is None:
            self.uuid = self.getNewUUID(None, self.server, NodeTypes.MASTER)
            logging.info('My UUID: ' + uuid_str(self.uuid))
        else:
            in_conflict = self.nm.getByUUID(self.uuid)
            if in_conflict is not None:
                logging.warning('UUID conflict at election exit with %r',
                    in_conflict)
                in_conflict.setUUID(None)

        # Do not restart automatically if ElectionFailure is raised, in order
        # to avoid a split of the database. For example, with 2 machines with
        # a master and a storage on each one and replicas=1, the secondary
        # master becomes primary in case of network failure between the 2
        # machines but must not start automatically: otherwise, each storage
        # node would diverge.
        self._startup_allowed = False
        try:
            while True:
                self.runManager(RecoveryManager)
                try:
                    self.runManager(VerificationManager)
                    if not self.backup_tid:
                        self.provideService()
                        # self.provideService only returns without raising
                        # when switching to backup mode.
                    if self.backup_app is None:
                        raise RuntimeError("No upstream cluster to backup"
                                           " defined in configuration")
                    truncate = Packets.Truncate(
                        self.backup_app.provideService())
                except StoppedOperation, e:
                    logging.critical('No longer operational')
                    truncate = Packets.Truncate(*e.args) if e.args else None
                    # Restart automatically, except if we truncate now or
                    # must retry a previous truncation.
                    self._startup_allowed = not (self.truncate_tid or truncate)
                node_list = []
                for node in self.nm.getIdentifiedList():
                    if node.isStorage() or node.isClient():
                        conn = node.getConnection()
                        conn.notify(Packets.StopOperation())
                        if node.isClient():
                            conn.abort()
                            continue
                        if truncate:
                            conn.notify(truncate)
                        if node.isRunning():
                            node.setPending()
                            node_list.append(node)
                self.broadcastNodesInformation(node_list)
        except StateChangedException, e:
            assert e.args[0] == ClusterStates.STOPPING
            self.shutdown()
Exemple #32
0
 def _handleConflicts(self, txn_context, tryToResolveConflict):
     result = []
     append = result.append
     # Check for conflicts
     data_dict = txn_context['data_dict']
     object_base_serial_dict = txn_context['object_base_serial_dict']
     object_serial_dict = txn_context['object_serial_dict']
     conflict_serial_dict = txn_context['conflict_serial_dict'].copy()
     txn_context['conflict_serial_dict'].clear()
     resolved_conflict_serial_dict = txn_context[
         'resolved_conflict_serial_dict']
     for oid, conflict_serial_set in conflict_serial_dict.iteritems():
         conflict_serial = max(conflict_serial_set)
         serial = object_serial_dict[oid]
         if ZERO_TID in conflict_serial_set:
             if 1:
                 # XXX: disable deadlock avoidance code until it is fixed
                 logging.info('Deadlock avoidance on %r:%r', dump(oid),
                              dump(serial))
                 # 'data' parameter of ConflictError is only used to report the
                 # class of the object. It doesn't matter if 'data' is None
                 # because the transaction is too big.
                 try:
                     data = data_dict[oid]
                 except KeyError:
                     data = txn_context['cache_dict'][oid]
             else:
                 # The storage refused to let us take the object lock, to
                 # avoid a possible deadlock. The TID actually acts as a kind
                 # of "locking priority": when a higher value holds the lock,
                 # it means we stored our objects "too late" and would
                 # otherwise cause a deadlock.
                 # To recover, we must ask storages to release the locks we
                 # hold (to let possibly-competing transactions acquire
                 # them) and requeue our already-sent store requests.
                 # XXX: currently this is done by brute force: we simply
                 # send the object data again.
                 # WARNING: not maintained code
                 logging.info('Deadlock avoidance triggered on %r:%r',
                              dump(oid), dump(serial))
                 for store_oid, store_data in data_dict.iteritems():
                     store_serial = object_serial_dict[store_oid]
                     if store_data is CHECKED_SERIAL:
                         self._checkCurrentSerialInTransaction(
                             txn_context, store_oid, store_serial)
                     else:
                         if store_data is None:
                             # Some undo
                             logging.warning(
                                 'Deadlock avoidance cannot reliably'
                                 ' work with undo, this must be implemented.'
                             )
                             conflict_serial = ZERO_TID
                             break
                         self._store(txn_context,
                                     store_oid,
                                     store_serial,
                                     store_data,
                                     unlock=True)
                 else:
                     continue
         else:
             data = data_dict.pop(oid)
             if data is CHECKED_SERIAL:
                 raise ReadConflictError(oid=oid,
                                         serials=(conflict_serial, serial))
             # TODO: data can be None if a conflict happens during undo
             if data:
                 txn_context['data_size'] -= len(data)
             resolved_serial_set = resolved_conflict_serial_dict.setdefault(
                 oid, set())
             if resolved_serial_set and conflict_serial <= max(
                     resolved_serial_set):
                 # A later serial has already been resolved, skip.
                 resolved_serial_set.update(conflict_serial_set)
                 continue
             try:
                 new_data = tryToResolveConflict(oid, conflict_serial,
                                                 serial, data)
             except ConflictError:
                 logging.info(
                     'Conflict resolution failed for '
                     '%r:%r with %r', dump(oid), dump(serial),
                     dump(conflict_serial))
             else:
                 logging.info(
                     'Conflict resolution succeeded for '
                     '%r:%r with %r', dump(oid), dump(serial),
                     dump(conflict_serial))
                 # Mark this conflict as resolved
                 resolved_serial_set.update(conflict_serial_set)
                 # Base serial changes too, as we resolved a conflict
                 object_base_serial_dict[oid] = conflict_serial
                 # Try to store again
                 self._store(txn_context, oid, conflict_serial, new_data)
                 append(oid)
                 continue
         # With recent ZODB, get_pickle_metadata (from ZODB.utils) does
         # not support empty values, so do not pass 'data' in this case.
         raise ConflictError(oid=oid,
                             serials=(conflict_serial, serial),
                             data=data or None)
     return result
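
Most of the bookkeeping above reduces to one rule: once a conflict on an oid has been resolved against some serial, a conflict reported with an older (or equal) serial for that oid is already covered and can be skipped. A standalone sketch of that filter, using hypothetical names rather than the client's real structures:

def needs_resolution(resolved_serials, oid, conflict_serial_set):
    """Return True when the reported conflict still has to be resolved.

    resolved_serials maps oid -> set of serials already resolved for
    that oid. A conflict whose highest serial is not newer than the
    newest resolved one is recorded and skipped, as in the code above.
    """
    conflict_serial = max(conflict_serial_set)
    seen = resolved_serials.setdefault(oid, set())
    if seen and conflict_serial <= max(seen):
        seen.update(conflict_serial_set)
        return False
    return True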
Exemple #33
0
 def pack(self, t, referencesf, gc=False):
     if gc:
         logging.warning('Garbage Collection is not available in NEO,'
             ' please use an external tool. Packing without GC.')
     self.app.pack(t)
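
Since garbage collection is delegated to an external tool, packing a NEO database only trims object history up to the given time. A hedged usage sketch, assuming db is a ZODB.DB instance opened on a NEO client storage:

import time

def trim_history(db, keep_days=7):
    # Pack away revisions older than keep_days. Unreachable objects are
    # not collected here; run an external GC tool for that, as the
    # warning above explains.
    db.pack(time.time() - keep_days * 86400)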
Exemple #35
0
 def _import(self):
     p64 = util.p64
     u64 = util.u64
     tid = p64(self.zodb_tid + 1)
     zodb_list = []
     for zodb in self.zodb:
         try:
             zodb_list.append(ZODBIterator(zodb, tid, p64(self.zodb_ltid)))
         except StopIteration:
             pass
     tid = None
     def finish():
         if tid:
             self.storeTransaction(tid, object_list, (
                 (x[0] for x in object_list),
                 str(txn.user), str(txn.description),
                 cPickle.dumps(txn.extension), False, tid), False)
             self.releaseData(data_id_list)
             logging.debug("TXN %s imported (user=%r, desc=%r, len(oid)=%s)",
                 util.dump(tid), txn.user, txn.description, len(object_list))
             del object_list[:], data_id_list[:]
             if self._last_commit + 1 < time.time():
                 self.commit()
             self.zodb_tid = u64(tid)
     if self.compress:
         from zlib import compress
     else:
         compress = None
         compression = 0
     object_list = []
     data_id_list = []
     while zodb_list:
         zodb_list.sort()
         z = zodb_list[0]
          # Merge transactions that share the same tid. Only
          # user/desc/ext from the first ZODB are kept.
         if tid != z.tid:
             finish()
             txn = z.transaction
             tid = txn.tid
             yield 1
         zodb = z.zodb
         for r in z.transaction:
             oid = p64(u64(r.oid) + zodb.shift_oid)
             data_tid = r.data_txn
             if data_tid or r.data is None:
                 data_id = None
             else:
                 data = zodb.repickle(r.data)
                 if compress:
                     compressed_data = compress(data)
                     compression = len(compressed_data) < len(data)
                     if compression:
                         data = compressed_data
                  checksum = util.makeChecksum(data)
                  data_id = self.holdData(checksum, data, compression)
                 data_id_list.append(data_id)
             object_list.append((oid, data_id, data_tid))
              # Give the main loop the opportunity to process requests
              # from other nodes. In particular, clients may commit. If
              # the storage node exits after such a commit, and before we
              # actually update 'obj' with 'object_list', some rows in
              # 'data' may be left unreferenced. This is not a problem
              # because the leak is resolved when the migration resumes.
             yield 1
         try:
             z.next()
         except StopIteration:
             del zodb_list[0]
     self._last_commit = 0
     finish()
     logging.warning("All data are imported. You should change"
         " your configuration to use the native backend and restart.")
     self._import = None
     for x in """getObject getReplicationTIDList
              """.split():
         setattr(self, x, getattr(self.db, x))
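
The while zodb_list: loop is a k-way merge: the iterators are kept sorted by their current tid, the smallest one is consumed, and transactions sharing a tid across source databases are folded into a single NEO transaction (only the first database's metadata is kept). A reduced sketch of the same merge over plain iterators of (tid, payload) pairs; the record shape and names are illustrative, not NEO's API:

import heapq
from itertools import groupby
from operator import itemgetter

def _tagged(stream, index):
    # Tag records with their stream index so that ties on tid are
    # broken by the index and payloads are never compared directly.
    for tid, payload in stream:
        yield tid, index, payload

def merge_by_tid(streams):
    """Merge iterators of (tid, payload) pairs, each already sorted by
    tid, into (tid, [payload, ...]) groups."""
    merged = heapq.merge(*(_tagged(s, i) for i, s in enumerate(streams)))
    for tid, group in groupby(merged, key=itemgetter(0)):
        yield tid, [payload for _, _, payload in group]

_import itself simply re-sorts a short list on every step instead of using a heap, which is just as adequate for a handful of source databases.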
Exemple #36
0
 def provideService(self):
     logging.info('provide backup')
     poll = self.em.poll
     app = self.app
     pt = app.pt
     while True:
         app.changeClusterState(ClusterStates.STARTING_BACKUP)
         bootstrap = BootstrapManager(self, self.name, NodeTypes.CLIENT)
         # {offset -> node}
         self.primary_partition_dict = {}
         # [[tid]]
         self.tid_list = tuple([] for _ in xrange(pt.getPartitions()))
         try:
             while True:
                 for node in pt.getNodeSet(readable=True):
                     if not app.isStorageReady(node.getUUID()):
                         break
                 else:
                     break
                 poll(1)
             node, conn, uuid, num_partitions, num_replicas = \
                 bootstrap.getPrimaryConnection()
             try:
                 app.changeClusterState(ClusterStates.BACKINGUP)
                 del bootstrap, node
                 if num_partitions != pt.getPartitions():
                     raise RuntimeError("inconsistent number of partitions")
                 self.pt = PartitionTable(num_partitions, num_replicas)
                 conn.setHandler(BackupHandler(self))
                 conn.ask(Packets.AskNodeInformation())
                 conn.ask(Packets.AskPartitionTable())
                 conn.ask(Packets.AskLastTransaction())
                 # debug variable to log how big 'tid_list' can be.
                 self.debug_tid_count = 0
                 while True:
                     poll(1)
             except PrimaryFailure, msg:
                 logging.error('upstream master is down: %s', msg)
             finally:
                 app.backup_tid = pt.getBackupTid()
                 try:
                     conn.close()
                 except PrimaryFailure:
                     pass
                 try:
                     del self.pt
                 except AttributeError:
                     pass
         except StateChangedException, e:
             if e.args[0] != ClusterStates.STOPPING_BACKUP:
                 raise
             app.changeClusterState(*e.args)
             tid = app.backup_tid
             # Wait for non-primary partitions to catch up,
             # so that all UP_TO_DATE cells are really UP_TO_DATE.
             # XXX: Another possibility could be to outdate such cells, and
             #      they would be quickly updated at the beginning of the
             #      RUNNING phase. This may simplify code.
             # Any unfinished replication from upstream will be truncated.
             while pt.getBackupTid(min) < tid:
                 poll(1)
             last_tid = app.getLastTransaction()
             handler = EventHandler(app)
             if tid < last_tid:
                 assert tid != ZERO_TID
                 logging.warning("Truncating at %s (last_tid was %s)",
                     dump(app.backup_tid), dump(last_tid))
             else:
                 # We will do a dummy truncation, just to leave backup mode,
                 # so it's fine to start automatically if there's any
                 # missing storage.
                 # XXX: Consider using another method to leave backup mode,
                 #      at least when there's nothing to truncate. Because
                 #      in case of StoppedOperation during VERIFYING state,
                 #      this flag will be wrongly set to False.
                 app._startup_allowed = True
             # If any error happened before reaching this line, we'd go back
             # to backup mode, which is the right mode to recover.
             del app.backup_tid
             # Now back to RECOVERY...
             return tid
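
The nested loop at the top of the backup service (for node in pt.getNodeSet(readable=True) followed by else: break) is the for/else polling idiom: the outer while exits only once a full pass finds no storage that is not ready. A minimal standalone version of that idiom, with a hypothetical is_ready predicate:

def wait_until_all_ready(nodes, is_ready, poll, delay=1):
    """Block until is_ready(node) holds for every node, running the
    event loop between passes -- the same idiom used by provideService."""
    while True:
        for node in nodes:
            if not is_ready(node):
                break        # at least one node lagging: poll again
        else:
            return           # no break hit: everyone is ready
        poll(delay)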
Exemple #37
0
 def connectionLost(self, conn, new_state):
     logging.warning('A connection was lost during identification')