def _check_candidate_switch(group_id, slave_id):
    """Check if the candidate has all the features to become the new
    master.
    """
    allowed_status = (_server.MySQLServer.SECONDARY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)

    if not group.master:
        raise _errors.GroupError(
            "Group (%s) does not contain a valid "
            "master. Please, run a promote or failover." % (group_id, )
        )

    slave = _retrieve_server(slave_id, group_id)
    slave.connect()

    if group.master == slave.uuid:
        raise _errors.ServerError(
            "Candidate slave (%s) is already master." % (slave_id, )
            )

    master_issues = _replication.check_master_issues(slave)
    if master_issues:
        raise _errors.ServerError(
            "Server (%s) is not a valid candidate slave "
            "due to the following reason(s): (%s)." %
            (slave.uuid, master_issues)
            )

    slave_issues = _replication.check_slave_issues(slave)
    if slave_issues:
        raise _errors.ServerError(
            "Server (%s) is not a valid candidate slave "
            "due to the following reason: (%s)." %
            (slave.uuid, slave_issues)
            )

    master_uuid = _replication.slave_has_master(slave)
    if master_uuid is None or group.master != _uuid.UUID(master_uuid):
        raise _errors.GroupError(
            "The group's master (%s) is different from the candidate's "
            "master (%s)." % (group.master, master_uuid)
            )

    if slave.status not in allowed_status:
        raise _errors.ServerError("Server (%s) is faulty." % (slave_id, ))

    _events.trigger_within_procedure(
        BLOCK_WRITE_SWITCH, group_id, master_uuid, str(slave.uuid)
        )
Example #2
0
def _check_candidate_switch(group_id, slave_id):
    """Check if the candidate has all the features to become the new
    master.
    """
    allowed_status = (_server.MySQLServer.SECONDARY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)

    if not group.master:
        raise _errors.GroupError("Group (%s) does not contain a valid "
                                 "master. Please, run a promote or failover." %
                                 (group_id, ))

    slave = _retrieve_server(slave_id, group_id)
    slave.connect()

    if group.master == slave.uuid:
        raise _errors.ServerError("Candidate slave (%s) is already master." %
                                  (slave_id, ))

    master_issues, why_master_issues = _replication.check_master_issues(slave)
    if master_issues:
        raise _errors.ServerError("Server (%s) is not a valid candidate slave "
                                  "due to the following reason(s): (%s)." %
                                  (slave.uuid, why_master_issues))

    slave_issues, why_slave_issues = _replication.check_slave_issues(slave)
    if slave_issues:
        raise _errors.ServerError("Server (%s) is not a valid candidate slave "
                                  "due to the following reason: (%s)." %
                                  (slave.uuid, why_slave_issues))

    master_uuid = _replication.slave_has_master(slave)
    if master_uuid is None or group.master != _uuid.UUID(master_uuid):
        raise _errors.GroupError(
            "The group's master (%s) is different from the candidate's "
            "master (%s)." % (group.master, master_uuid))

    if slave.status not in allowed_status:
        raise _errors.ServerError("Server (%s) is faulty." % (slave_id, ))

    _events.trigger_within_procedure(BLOCK_WRITE_SWITCH, group_id, master_uuid,
                                     str(slave.uuid))
Example #3
0
def _health(group_id):
    """Check which servers in a group are up and down.
    """
    availability = {}

    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    for server in group.servers():
        alive = False
        is_master = (group.master == server.uuid)
        thread_issues = {}
        status = server.status
        try:
            server.connect()
            alive = True
            if not is_master:
                slave_issues = _replication.check_slave_issues(server)
                str_master_uuid = _replication.slave_has_master(server)
                if (group.master is None or str(group.master) != \
                    str_master_uuid) and not slave_issues:
                    thread_issues = \
                        "Group has master (%s) but server is connected " \
                        "to master (%s)." % \
                        (group.master, str_master_uuid)
                elif slave_issues:
                    thread_issues = slave_issues
        except _errors.DatabaseError:
            status = _server.MySQLServer.FAULTY
        availability[str(server.uuid)] = {
            "is_alive" : alive,
            "status" : status,
            "threads" : thread_issues
            }

    return availability
Example #4
0
    def execute(self, group_id):
        """Check if any server within a group has failed.

        :param group_id: Group's id.
        """

        group = _server.Group.fetch(group_id)
        if not group:
            raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

        info = ResultSet(
            names=[
                'uuid', 'is_alive', 'status',
                'is_not_running', 'is_not_configured', 'io_not_running',
                'sql_not_running', 'io_error', 'sql_error', 'gtid_executed'
            ],
            types=[str, bool, str] + [bool] * 4 + [str, str, str]
        )
        issues = ResultSet(names=['issue'], types=[str])

        for server in group.servers():
            alive = False
            is_master = (group.master == server.uuid)
            status = server.status
            why_slave_issues = {}
            # These are used when server is not contactable.
            why_slave_issues = {
                'is_not_running': False,
                'is_not_configured': False,
                'io_not_running': False,
                'sql_not_running': False,
                'io_error': False,
                'sql_error': False,
            }
            try:
                # TODO: CHECK WHETHER WE SHOULD USE IS_ALIVE OR NOT.
                if server.is_alive:
                  server.connect()
                  alive = True
                  if not is_master:
                      slave_issues, why_slave_issues = \
                          _replication.check_slave_issues(server)
                      str_master_uuid = _replication.slave_has_master(server)
                      if (group.master is None or str(group.master) != \
                          str_master_uuid) and not slave_issues:
                          issues.append_row([
                              "Group has master (%s) but server is connected " \
                              "to master (%s)." % \
                              (group.master, str_master_uuid)
                          ])
                  gtid_executed= server.get_gtid_status()[0].GTID_EXECUTED
                else:
                  status = _server.MySQLServer.FAULTY
                  gtid_executed= "UNKNOWN"
                  
            except _errors.DatabaseError:
                status = _server.MySQLServer.FAULTY
                gtid_executed= "UNKNOWN"

            info.append_row([
                server.uuid,
                alive,
                status,
                why_slave_issues['is_not_running'],
                why_slave_issues['is_not_configured'],
                why_slave_issues['io_not_running'],
                why_slave_issues['sql_not_running'],
                why_slave_issues['io_error'],
                why_slave_issues['sql_error'],
                ' '.join(gtid_executed.splitlines()),
            ])

        return CommandResult(None, results=[info, issues])
Example #5
0
def _do_find_candidate(group_id, event):
    """Find the best candidate in a group that may be used to replace the
    current master if there is any.

    It chooses the slave that has processed more transactions and may become a
    master, e.g. has the binary log enabled. This function does not consider
    purged transactions and delays in the slave while picking up a slave.

    :param group_id: Group's id from where a candidate will be chosen.
    :return: Return the uuid of the best candidate to become a master in the
             group.
    """
    forbidden_status = (_server.MySQLServer.FAULTY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)

    master_uuid = None
    if group.master:
        master_uuid = str(group.master)

    chosen_uuid = None
    chosen_gtid_status = None
    for candidate in group.servers():
        if master_uuid != str(candidate.uuid) and \
            candidate.status not in forbidden_status:
            try:
                candidate.connect()
                gtid_status = candidate.get_gtid_status()
                master_issues, why_master_issues = \
                    _replication.check_master_issues(candidate)
                slave_issues = False
                why_slave_issues = {}
                if event == FIND_CANDIDATE_SWITCH:
                    slave_issues, why_slave_issues = \
                        _replication.check_slave_issues(candidate)
                has_valid_master = (master_uuid is None or \
                    _replication.slave_has_master(candidate) == master_uuid)
                can_become_master = False
                if chosen_gtid_status:
                    n_trans = 0
                    try:
                        n_trans = _replication.get_slave_num_gtid_behind(
                            candidate, chosen_gtid_status)
                    except _errors.InvalidGtidError:
                        pass
                    if n_trans == 0 and not master_issues and \
                        has_valid_master and not slave_issues:
                        chosen_gtid_status = gtid_status
                        chosen_uuid = str(candidate.uuid)
                        can_become_master = True
                elif not master_issues and has_valid_master and \
                    not slave_issues:
                    chosen_gtid_status = gtid_status
                    chosen_uuid = str(candidate.uuid)
                    can_become_master = True
                if not can_become_master:
                    _LOGGER.warning(
                        "Candidate (%s) cannot become a master due to the "
                        "following reasons: issues to become a "
                        "master (%s), prerequistes as a slave (%s), valid "
                        "master (%s).", candidate.uuid, why_master_issues,
                        why_slave_issues, has_valid_master)
            except _errors.DatabaseError as error:
                _LOGGER.warning("Error accessing candidate (%s): %s.",
                                candidate.uuid, error)

    if not chosen_uuid:
        raise _errors.GroupError(
            "There is no valid candidate that can be automatically "
            "chosen in group (%s). Please, choose one manually." %
            (group_id, ))
    return chosen_uuid
    def _run(self):
        """Function that verifies servers' availabilities.
        """
        from mysql.fabric.server import (
            Group,
            MySQLServer,
            ConnectionManager,
        )

        ignored_status = [MySQLServer.FAULTY]
        quarantine = {}
        interval = FailureDetector._DETECTION_INTERVAL
        detections = FailureDetector._DETECTIONS
        detection_timeout = FailureDetector._DETECTION_TIMEOUT
        connection_manager = ConnectionManager()
        slave_deep_checks = FailureDetector._SLAVE_DEEP_CHECKS

        _persistence.init_thread()

        while self.__check:
            try:
                unreachable = set()
                group = Group.fetch(self.__group_id)
                if group is not None:
                    for server in group.servers():
                        if server.status in ignored_status:

                            ### Server is FAULTY
                            connection_manager.kill_connections(server)
                            continue
                        else:
                            ### Server is Not FAULTY
                            if MySQLServer.is_alive(server, detection_timeout):

                                ### Server is alive
                                ### check depends on `slave_deep_checks` parameter
                                if slave_deep_checks:

                                    ### When server is alive and status != FAULTY
                                    is_master= (group.master == server.uuid)
                                    if not is_master:
                                        ### Checking master is dead or alive.
                                        master_server = MySQLServer.fetch(group.master)
    
                                        if MySQLServer.is_alive(master_server, detection_timeout):
    
                                            ### Checking is replication valid or not if master is alive.
                                            server.connect()
                                            slave_issues, why_slave_issues = \
                                                _replication.check_slave_issues(server)
                                            if slave_issues:
        
                                                if (why_slave_issues['io_error'] and \
                                                    why_slave_issues['io_errno'] == 2003):
        
                                                    ### Nothing to do during reconnecting, just logging
                                                    _LOGGER.info(why_slave_issues)
        
                                                else:
                                                        
                                                    ### If slave threads are not running, set status to SPARE
                                                    server.status = MySQLServer.SPARE
        
                                            ### Done slave_issues.
                                            server.disconnect()
    
                                        ### Endif MySQLServer.is_alive(master_server, detection_timeout)
                                    ### Endif not is_master
                                ### Endif slave_deep_checks
                                continue
                            ### Else MySQLServer.is_alive(server, detection_timeout)
                            else:

                                unreachable.add(server.uuid)

                                _LOGGER.warning(
                                    "Server (%s) in group (%s) is unreachable.",
                                    server.uuid, self.__group_id
                                )
        
                                unstable = False
                                failed_attempts = 0
                                if server.uuid not in quarantine:
                                    quarantine[server.uuid] = failed_attempts = 1
                                else:
                                    failed_attempts = quarantine[server.uuid] + 1
                                    quarantine[server.uuid] = failed_attempts
                                if failed_attempts >= detections:
                                    unstable = True
        
                                can_set_faulty = group.can_set_server_faulty(
                                    server, get_time()
                                )
                                if unstable and can_set_faulty:
                                    # We have to make this transactional and make the
                                    # failover (i.e. report failure) robust to failures.
                                    # Otherwise, a master might be set to faulty and
                                    # a new one never promoted.
                                    server.status = MySQLServer.FAULTY
                                    connection_manager.kill_connections(server)
                                    
                                    procedures = trigger("REPORT_FAILURE", None,
                                        str(server.uuid),
                                        threading.current_thread().name,
                                        MySQLServer.FAULTY, False
                                    )
                                    executor = _executor.Executor()
                                    for procedure in procedures:
                                        executor.wait_for_procedure(procedure)

                            ### Endif MySQLServer.is_alive(server, detection_timeout)
                        ### Endif server.status in ignored_status
                    ### End for server in group.servers()
                ### Endif group is not None
                for uuid in quarantine.keys():
                    if uuid not in unreachable:
                        del quarantine[uuid]

            except (_errors.ExecutorError, _errors.DatabaseError):
                pass
            except Exception as error:
                _LOGGER.exception(error)

            time.sleep(interval)

        _persistence.deinit_thread()
def _do_find_candidate(group_id, event):
    """Find the best candidate in a group that may be used to replace the
    current master if there is any.

    It chooses the slave that has processed more transactions and may become a
    master, e.g. has the binary log enabled. This function does not consider
    purged transactions and delays in the slave while picking up a slave.

    :param group_id: Group's id from where a candidate will be chosen.
    :return: Return the uuid of the best candidate to become a master in the
             group.
    """
    forbidden_status = (_server.MySQLServer.FAULTY, _server.MySQLServer.SPARE)
    group = _server.Group.fetch(group_id)

    master_uuid = None
    if group.master:
        master_uuid = str(group.master)

    chosen_uuid = None
    chosen_gtid_status = None
    for candidate in group.servers():
        if master_uuid != str(candidate.uuid) and \
            candidate.status not in forbidden_status:
            try:
                candidate.connect()
                gtid_status = candidate.get_gtid_status()
                master_issues = \
                    _replication.check_master_issues(candidate)
                if event == FIND_CANDIDATE_SWITCH:
                    slave_issues = \
                        _replication.check_slave_issues(candidate)
                else:
                    slave_issues = {}
                has_valid_master = (master_uuid is None or \
                    _replication.slave_has_master(candidate) == master_uuid)
                can_become_master = False
                if chosen_gtid_status:
                    n_trans = 0
                    try:
                        n_trans = _replication.get_slave_num_gtid_behind(
                            candidate, chosen_gtid_status
                            )
                    except _errors.InvalidGtidError:
                        pass
                    if n_trans == 0 and not master_issues and \
                        has_valid_master and not slave_issues:
                        chosen_gtid_status = gtid_status
                        chosen_uuid = str(candidate.uuid)
                        can_become_master = True
                elif not master_issues and has_valid_master and \
                    not slave_issues:
                    chosen_gtid_status = gtid_status
                    chosen_uuid = str(candidate.uuid)
                    can_become_master = True
                if not can_become_master:
                    _LOGGER.warning(
                        "Candidate (%s) cannot become a master due to the "
                        "following reasons: issues to become a "
                        "master (%s), prerequistes as a slave (%s), valid "
                        "master (%s).", candidate.uuid, master_issues,
                        slave_issues, has_valid_master
                        )
            except _errors.DatabaseError as error:
                _LOGGER.warning(
                    "Error accessing candidate (%s).", candidate.uuid,
                    exc_info=error
                )

    if not chosen_uuid:
        raise _errors.GroupError(
            "There is no valid candidate that can be automatically "
            "chosen in group (%s). Please, choose one manually." %
            (group_id, )
            )
    return chosen_uuid
Example #8
0
    def execute(self, group_id, timeout=None):
        """Check if any server within a group has failed.

        :param group_id: Group's id.
        :param group_id: Timeout value after which a server is considered
                         unreachable. If None is provided, it assumes the
                         default value in the configuration file.
        """

        group = _server.Group.fetch(group_id)
        if not group:
            raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

        info = ResultSet(
            names=[
                'uuid', 'is_alive', 'status',
                'is_not_running', 'is_not_configured', 'io_not_running',
                'sql_not_running', 'io_error', 'sql_error'
            ],
            types=[str, bool, str] + [bool] * 4 + [str, str]
        )
        issues = ResultSet(names=['issue'], types=[str])

        try:
            timeout = float(timeout)
        except (TypeError, ValueError):
            pass

        for server in group.servers():
            alive = False
            is_master = (group.master == server.uuid)
            status = server.status

            why_slave_issues = {}
            # These are used when server is not contactable.
            why_slave_issues = {
                'is_not_running': False,
                'is_not_configured': False,
                'io_not_running': False,
                'sql_not_running': False,
                'io_error': False,
                'sql_error': False,
            }

            try:
                alive = server.is_alive(timeout or DEFAULT_UNREACHABLE_TIMEOUT)
                if alive and not is_master:
                    server.connect()
                    slave_issues, why_slave_issues = \
                        _replication.check_slave_issues(server)
                    str_master_uuid = _replication.slave_has_master(server)
                    if (group.master is None or str(group.master) != \
                        str_master_uuid) and not slave_issues:
                        issues.append_row([
                            "Group has master (%s) but server is connected " \
                            "to master (%s)." % \
                            (group.master, str_master_uuid)
                        ])
            except _errors.DatabaseError:
                alive = False

            info.append_row([
                server.uuid,
                alive,
                status,
                why_slave_issues['is_not_running'],
                why_slave_issues['is_not_configured'],
                why_slave_issues['io_not_running'],
                why_slave_issues['sql_not_running'],
                why_slave_issues['io_error'],
                why_slave_issues['sql_error'],
            ])

        return CommandResult(None, results=[info, issues])