Example #1
0
def _block_write_demote(group_id, update_only):
    """Demote the group's current master and make it read-only.

    :param group_id: Identifier of the group whose master is demoted.
    :param update_only: Forwarded to ``_set_group_master_replication``; when
                        it is false, a ``WAIT_SLAVES_DEMOTE`` event is also
                        triggered after a primary master has been demoted.
    :raises GroupError: If the group does not exist or has no master.
    """
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    if not group.master:
        raise _errors.GroupError("Group (%s) does not have a master." %
                                 (group_id, ))

    master = _server.MySQLServer.fetch(group.master)
    # Invariant: a registered master is expected to be primary or faulty.
    expected_states = (
        _server.MySQLServer.PRIMARY,
        _server.MySQLServer.FAULTY,
    )
    assert master.status in expected_states

    # Only a primary master is contacted; a faulty one is left untouched.
    if master.status == _server.MySQLServer.PRIMARY:
        master.connect()
        master.mode = _server.MySQLServer.READ_ONLY
        master.status = _server.MySQLServer.SECONDARY
        _utils.set_read_only(master, True)

        if not update_only:
            demoted_uuid = str(master.uuid)
            _events.trigger_within_procedure(
                WAIT_SLAVES_DEMOTE, group_id, demoted_uuid
            )

    # In every case the group ends up without a registered master.
    _set_group_master_replication(group, None, update_only)
Example #2
0
def _retrieve_group(group_id):
    """Fetch the Group registered under ``group_id``.

    :param group_id: Identifier of the group to look up.
    :return: The fetched Group object.
    :raises GroupError: If no such group is found.
    """
    found = _server.Group.fetch(group_id)
    if found:
        return found
    raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))
def start_group_slaves(master_group_id):
    """Start replication towards every slave group of a master group.

    Used by operations (e.g. enabling a shard) that need a group to start
    all the slave groups registered with it. Failures on an individual
    slave group are logged as warnings and do not stop the remaining slave
    groups from being processed.

    :param master_group_id: The master group ID. The ID belongs to the master
                            whose slaves need to be started.
    :raises GroupError: If the master group cannot be found.
    """
    master_group = Group.fetch(master_group_id)
    if master_group is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (master_group_id, ))

    # master_group.slave_group_ids lists the IDs of the groups that are
    # slaves to this master; set up replication with each one in turn.
    for slave_group_id in master_group.slave_group_ids:
        slave_group = Group.fetch(slave_group_id)
        if slave_group is None:
            # A registered slave group that can no longer be fetched must
            # not abort the whole loop: report it like any other
            # per-group failure and carry on with the next one.
            _LOGGER.warning(
                "Error while configuring group replication between "
                "(%s) and (%s): (%s).", master_group_id, slave_group_id,
                GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (slave_group_id, ))
            continue

        try:
            setup_group_replication(master_group_id, slave_group.group_id)
        except (_errors.GroupError, _errors.DatabaseError) as error:
            _LOGGER.warning(
                "Error while configuring group replication between "
                "(%s) and (%s): (%s).", master_group_id, slave_group.group_id,
                error)
Example #4
0
def _retrieve_server(server_id, group_id=None):
    """Look up a MySQLServer by its identifier (UUID or HOST:PORT).

    :param server_id: Identifier passed to ``MySQLServer.fetch``.
    :param group_id: Optional group the server is required to belong to.
    :return: The fetched MySQLServer object.
    :raises ServerError: If the server is not found.
    :raises GroupError: If the server has no group, or ``group_id`` is given
                        and differs from the server's group.
    """
    found = _server.MySQLServer.fetch(server_id)
    if not found:
        message = "Server (%s) does not exist." % (server_id, )
        raise _errors.ServerError(message)

    owner_group_id = found.group_id
    if not owner_group_id:
        message = "Server (%s) does not belong to a group." % (server_id, )
        raise _errors.GroupError(message)

    # Membership is only verified when the caller names a group.
    if group_id is not None and group_id != owner_group_id:
        message = "Group (%s) does not contain server (%s)." % \
            (group_id, server_id)
        raise _errors.GroupError(message)

    return found
Example #5
0
def _check_candidate_switch(group_id, slave_id):
    """Check if the candidate has all the features to become the new
    master and, if so, trigger the ``BLOCK_WRITE_SWITCH`` event.

    :param group_id: Identifier of the group where the switch happens.
    :param slave_id: Identifier of the candidate server.
    :raises GroupError: If the group does not exist, has no master, does not
                        contain the candidate, or the candidate replicates
                        from a server other than the group's master.
    :raises ServerError: If the candidate is unknown, is already the master,
                         has master or slave issues, or has a status that is
                         neither SECONDARY nor SPARE.
    """
    allowed_status = (_server.MySQLServer.SECONDARY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)
    # Without this check an unknown group id would surface as an
    # AttributeError on ``group.master`` below.
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    if not group.master:
        raise _errors.GroupError("Group (%s) does not contain a valid "
                                 "master. Please, run a promote or failover." %
                                 (group_id, ))

    slave = _retrieve_server(slave_id, group_id)
    slave.connect()

    if group.master == slave.uuid:
        raise _errors.ServerError("Candidate slave (%s) is already master." %
                                  (slave_id, ))

    master_issues, why_master_issues = _replication.check_master_issues(slave)
    if master_issues:
        raise _errors.ServerError("Server (%s) is not a valid candidate slave "
                                  "due to the following reason(s): (%s)." %
                                  (slave.uuid, why_master_issues))

    slave_issues, why_slave_issues = _replication.check_slave_issues(slave)
    if slave_issues:
        raise _errors.ServerError("Server (%s) is not a valid candidate slave "
                                  "due to the following reason: (%s)." %
                                  (slave.uuid, why_slave_issues))

    # The candidate must currently replicate from the group's master.
    master_uuid = _replication.slave_has_master(slave)
    if master_uuid is None or group.master != _uuid.UUID(master_uuid):
        raise _errors.GroupError(
            "The group's master (%s) is different from the candidate's "
            "master (%s)." % (group.master, master_uuid))

    if slave.status not in allowed_status:
        raise _errors.ServerError("Server (%s) is faulty." % (slave_id, ))

    _events.trigger_within_procedure(BLOCK_WRITE_SWITCH, group_id, master_uuid,
                                     str(slave.uuid))
Example #6
0
def _check_group_dependencies(group):
    """Refuse to destroy a group that is still referenced.

    The checks run in order: a shard associated with the group, a shard
    mapping using it as global group, and finally servers in the group.

    :param group: Group object about to be destroyed.
    :raises GroupError: On the first dependency that is found.
    """
    gid = group.group_id

    associated_shard = _sharding.Shards.lookup_shard_id(gid)
    if associated_shard:
        message = (
            "Cannot destroy a group (%s) which is associated to a shard (%s)."
            % (gid, associated_shard)
        )
        raise _errors.GroupError(message)

    mapping_id = _sharding.ShardMapping.lookup_shard_mapping_id(gid)
    if mapping_id:
        message = (
            "Cannot destroy a group (%s) which is used as a global group in a "
            "shard definition (%s)." % (gid, mapping_id)
        )
        raise _errors.GroupError(message)

    if group.servers():
        message = (
            "Cannot destroy a group (%s) which has associated servers." %
            (gid, )
        )
        raise _errors.GroupError(message)
Example #7
0
def _destroy_group(group_id):
    """Remove a group after verifying nothing depends on it.

    The group is unregistered from the failure detector before it is
    removed.

    :param group_id: Identifier of the group to destroy.
    :raises GroupError: If the removal fails with one of the
                        ``ER_ROW_IS_REFERENCED*`` error numbers. Errors from
                        the helper calls propagate unchanged.
    :raises DatabaseError: For any other database failure during removal.
    """
    group = _retrieve_group(group_id)
    _check_group_dependencies(group)
    _detector.FailureDetector.unregister_group(group_id)

    try:
        _server.Group.remove(group)
    except _errors.DatabaseError as error:
        # Anything other than a foreign-key violation is re-raised as is.
        if error.errno not in (ER_ROW_IS_REFERENCED, ER_ROW_IS_REFERENCED_2):
            raise
        raise _errors.GroupError(
            "Cannot destroy group (%s): %s." % (group_id, error, )
        )
    else:
        _LOGGER.debug("Destroyed group (%s).", group)
Example #8
0
def _define_ha_operation(group_id, slave_id, update_only):
    """Choose between a switchover and a failover and start it.

    A switch is used when the group has a master whose status is not
    FAULTY; otherwise a failover is used. When ``update_only`` is set, the
    candidate is changed directly and no event is triggered.

    :param group_id: Identifier of the group.
    :param slave_id: Identifier of the candidate, or a false value to let a
                     candidate be searched for.
    :param update_only: Whether the update-only path is taken.
    :raises GroupError: If the group does not exist.
    :raises ServerError: If ``update_only`` is set without a candidate.
    """
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    if update_only and not slave_id:
        raise _errors.ServerError(
            "The new master must be specified through --slave-uuid if "
            "--update-only is set.")

    # A switch is possible only while a non-faulty master is registered.
    can_switch = False
    if group.master:
        current_master = _server.MySQLServer.fetch(group.master)
        if current_master.status != _server.MySQLServer.FAULTY:
            if update_only:
                _do_block_write_master(group_id, str(group.master),
                                       update_only)
            can_switch = True

    if update_only:
        # Check whether the server is registered or not.
        _retrieve_server(slave_id, group_id)
        _change_to_candidate(group_id, slave_id, update_only)
        return

    if slave_id:
        event = CHECK_CANDIDATE_SWITCH if can_switch else CHECK_CANDIDATE_FAIL
        _events.trigger_within_procedure(event, group_id, slave_id)
    else:
        event = FIND_CANDIDATE_SWITCH if can_switch else FIND_CANDIDATE_FAIL
        _events.trigger_within_procedure(event, group_id)
Example #9
0
    def execute(self, group_id):
        """Report the health of every server in a group.

        For each server one row is produced with its uuid, whether it was
        reached, its status, the replication issue flags and the executed
        GTID set. Mismatches between the group's master and the master a
        server replicates from are reported in a second result set.

        :param group_id: Group's id.
        :return: A CommandResult holding the ``info`` and ``issues`` result
                 sets.
        :raises GroupError: If the group does not exist.
        """
        group = _server.Group.fetch(group_id)
        if not group:
            raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

        issue_flags = (
            'is_not_running', 'is_not_configured', 'io_not_running',
            'sql_not_running', 'io_error', 'sql_error',
        )
        info = ResultSet(
            names=['uuid', 'is_alive', 'status'] + list(issue_flags) +
            ['gtid_executed'],
            types=[str, bool, str, bool, bool, bool, bool, str, str, str]
        )
        issues = ResultSet(names=['issue'], types=[str])

        for server in group.servers():
            alive = False
            is_master = (group.master == server.uuid)
            status = server.status
            # Defaults reported when the server is not contactable.
            gtid_executed = "UNKNOWN"
            why_slave_issues = dict.fromkeys(issue_flags, False)

            try:
                # TODO: CHECK WHETHER WE SHOULD USE IS_ALIVE OR NOT.
                if server.is_alive:
                    server.connect()
                    alive = True
                    if not is_master:
                        slave_issues, why_slave_issues = \
                            _replication.check_slave_issues(server)
                        str_master_uuid = _replication.slave_has_master(server)
                        wrong_master = (
                            group.master is None or
                            str(group.master) != str_master_uuid
                        )
                        if wrong_master and not slave_issues:
                            issues.append_row([
                                "Group has master (%s) but server is connected "
                                "to master (%s)." %
                                (group.master, str_master_uuid)
                            ])
                    gtid_executed = server.get_gtid_status()[0].GTID_EXECUTED
                else:
                    status = _server.MySQLServer.FAULTY
            except _errors.DatabaseError:
                status = _server.MySQLServer.FAULTY

            row = [server.uuid, alive, status]
            row += [why_slave_issues[flag] for flag in issue_flags]
            row.append(' '.join(gtid_executed.splitlines()))
            info.append_row(row)

        return CommandResult(None, results=[info, issues])
Example #10
0
def _do_find_candidate(group_id, event):
    """Find the best candidate in a group that may be used to replace the
    current master if there is any.

    Servers are visited in the order returned by ``group.servers()``. The
    first eligible server is chosen; a later eligible server replaces it
    only when it is reported as zero transactions behind the current choice.
    This function does not consider purged transactions and delays in the
    slave while picking up a slave.

    :param group_id: Group's id from where a candidate will be chosen.
    :param event: Event on whose behalf the search runs. When it equals
                  ``FIND_CANDIDATE_SWITCH`` each candidate is additionally
                  checked for slave issues.
    :return: Return the uuid of the best candidate to become a master in the
             group.
    :raises GroupError: If the group does not exist or no valid candidate
                        is found.
    """
    forbidden_status = (_server.MySQLServer.FAULTY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)
    # Without this check an unknown group id would surface as an
    # AttributeError on ``group.master`` below.
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    master_uuid = None
    if group.master:
        master_uuid = str(group.master)

    chosen_uuid = None
    chosen_gtid_status = None
    for candidate in group.servers():
        # Skip the current master and servers that are faulty or spare.
        if master_uuid == str(candidate.uuid) or \
            candidate.status in forbidden_status:
            continue

        try:
            candidate.connect()
            gtid_status = candidate.get_gtid_status()
            master_issues, why_master_issues = \
                _replication.check_master_issues(candidate)

            slave_issues = False
            why_slave_issues = {}
            if event == FIND_CANDIDATE_SWITCH:
                slave_issues, why_slave_issues = \
                    _replication.check_slave_issues(candidate)

            has_valid_master = (master_uuid is None or \
                _replication.slave_has_master(candidate) == master_uuid)

            # Compare against the current choice, if any. A GTID set that
            # cannot be compared leaves the count at zero.
            n_trans = 0
            if chosen_gtid_status:
                try:
                    n_trans = _replication.get_slave_num_gtid_behind(
                        candidate, chosen_gtid_status)
                except _errors.InvalidGtidError:
                    pass

            can_become_master = (
                n_trans == 0 and not master_issues and
                has_valid_master and not slave_issues
            )
            if can_become_master:
                chosen_gtid_status = gtid_status
                chosen_uuid = str(candidate.uuid)
            else:
                _LOGGER.warning(
                    "Candidate (%s) cannot become a master due to the "
                    "following reasons: issues to become a "
                    "master (%s), prerequistes as a slave (%s), valid "
                    "master (%s).", candidate.uuid, why_master_issues,
                    why_slave_issues, has_valid_master)
        except _errors.DatabaseError as error:
            # An unreachable candidate is skipped, not fatal.
            _LOGGER.warning("Error accessing candidate (%s): %s.",
                            candidate.uuid, error)

    if not chosen_uuid:
        raise _errors.GroupError(
            "There is no valid candidate that can be automatically "
            "chosen in group (%s). Please, choose one manually." %
            (group_id, ))
    return chosen_uuid
Example #11
0
def _check_group_exists(group_id):
    """Ensure that no group is registered under ``group_id`` yet.

    :param group_id: Identifier to check.
    :raises GroupError: If a group with this identifier already exists.
    """
    existing = _server.Group.fetch(group_id)
    if not existing:
        return
    raise _errors.GroupError("Group (%s) already exists." % (group_id, ))
def setup_group_replication(group_master_id, group_slave_id):
    """Sets up replication between the masters of the two groups and
    updates the references to the groups in each other.

    :param group_master_id: The group whose master will act as the master
                            in the replication setup.
    :param group_slave_id: The group whose master will act as the slave in
                           the replication setup.
    :raises GroupError: If either group or either group's master cannot be
                        found, if either master is not running, or if a
                        connection to either master cannot be established.
    """
    group_master = Group.fetch(group_master_id)
    group_slave = Group.fetch(group_slave_id)

    if group_master is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (group_master_id, ))

    if group_slave is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (group_slave_id, ))

    # No master is registered, so there is no master identifier to put in
    # the message; it is formatted with an empty string.
    if group_master.master is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % "")

    if group_slave.master is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % "")

    # Master is the master of the Global Group. We replicate from here to
    # the masters of all the shard Groups.
    master = MySQLServer.fetch(group_master.master)
    if master is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR %
            (group_master.master, ))

    # Get the master of the shard Group.
    slave = MySQLServer.fetch(group_slave.master)
    if slave is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR %
            (group_slave.master, ))

    if not server_running(master):
        # The server is already down. We cannot connect to it to setup
        # replication.
        raise _errors.GroupError(
            GROUP_MASTER_NOT_RUNNING % (group_master.group_id, ))

    try:
        master.connect()
    except _errors.DatabaseError as error:
        # Server is not accessible, unable to connect to the server.
        # Report the server that actually failed: the master group's master.
        raise _errors.GroupError(GROUP_REPLICATION_SERVER_ERROR %
                                 (group_master.master, error))

    if not server_running(slave):
        # The server is already down. We cannot connect to it to setup
        # replication.
        raise _errors.GroupError(
            GROUP_MASTER_NOT_RUNNING % (group_slave.group_id, ))

    try:
        slave.connect()
    except _errors.DatabaseError as error:
        # Report the server that actually failed: the slave group's master.
        raise _errors.GroupError(GROUP_REPLICATION_SERVER_ERROR %
                                 (group_slave.master, error))

    _replication.stop_slave(slave, wait=True)

    # Clear references to old masters in the slave.
    _replication.reset_slave(slave, clean=True)

    _replication.switch_master(slave, master, master.user, master.passwd)

    _replication.start_slave(slave, wait=True)

    try:
        group_master.add_slave_group_id(group_slave_id)
        group_slave.add_master_group_id(group_master_id)
    except _errors.DatabaseError:
        # If there is an error while adding a reference to
        # the slave group or a master group, it means that
        # the slave group was already added and the error
        # is happening because the group was already registered.
        # Ignore this error.
        pass
Example #13
0
    def execute(self, group_id, timeout=None):
        """Report the health of every server in a group.

        For each server one row is produced with its uuid, whether it is
        alive, its status and the replication issue flags. Mismatches
        between the group's master and the master a server replicates from
        are reported in a second result set.

        :param group_id: Group's id.
        :param timeout: Timeout value passed to ``server.is_alive``. If it
                        is None, cannot be converted to a float, or is zero,
                        ``DEFAULT_UNREACHABLE_TIMEOUT`` is used instead.
        :return: A CommandResult holding the ``info`` and ``issues`` result
                 sets.
        :raises GroupError: If the group does not exist.
        """
        group = _server.Group.fetch(group_id)
        if not group:
            raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

        issue_flags = (
            'is_not_running', 'is_not_configured', 'io_not_running',
            'sql_not_running', 'io_error', 'sql_error',
        )
        info = ResultSet(
            names=['uuid', 'is_alive', 'status'] + list(issue_flags),
            types=[str, bool, str, bool, bool, bool, bool, str, str]
        )
        issues = ResultSet(names=['issue'], types=[str])

        # Keep the caller's value untouched when it is not numeric.
        try:
            timeout = float(timeout)
        except (TypeError, ValueError):
            pass

        for server in group.servers():
            alive = False
            is_master = (group.master == server.uuid)
            status = server.status
            # Defaults reported when the server is not contactable.
            why_slave_issues = dict.fromkeys(issue_flags, False)

            try:
                alive = server.is_alive(timeout or DEFAULT_UNREACHABLE_TIMEOUT)
                if alive and not is_master:
                    server.connect()
                    slave_issues, why_slave_issues = \
                        _replication.check_slave_issues(server)
                    str_master_uuid = _replication.slave_has_master(server)
                    wrong_master = (
                        group.master is None or
                        str(group.master) != str_master_uuid
                    )
                    if wrong_master and not slave_issues:
                        issues.append_row([
                            "Group has master (%s) but server is connected "
                            "to master (%s)." %
                            (group.master, str_master_uuid)
                        ])
            except _errors.DatabaseError:
                alive = False

            row = [server.uuid, alive, status]
            row += [why_slave_issues[flag] for flag in issue_flags]
            info.append_row(row)

        return CommandResult(None, results=[info, issues])