Example #1
0
def sanity_check_replica(instance):
    """ Confirm that a replica is replicating and is reasonably caught up

    Args:
    instance - A hostaddr object

    Returns:
    A hostaddr object for the master of the instance argument

    Raises:
    Exception - if master status can not be read from the instance, if the
                SQL or IO thread is not running, or if the heartbeat lag or
                the IO lag is above the limit defined in mysql_lib
    """
    # Test that master status can be read from the instance. If it can not,
    # the instance is not setup for replication and the check fails.
    connection = mysql_lib.connect_mysql(instance)
    try:
        mysql_lib.get_master_status(connection)
    except mysql_lib.ReplicationError:
        not_setup = '{instance} is not setup to write replicaiton logs!'
        raise Exception(not_setup.format(instance=instance))

    lag = mysql_lib.calc_slave_lag(instance)
    slave_status = lag['ss']

    # The SQL thread is checked before the IO thread
    thread_checks = (('SQL', 'Slave_SQL_Running'), ('IO', 'Slave_IO_Running'))
    for thread, status_key in thread_checks:
        if slave_status[status_key] != 'Yes':
            not_running = '{thread} thread is not running on {instance}'
            raise Exception(not_running.format(thread=thread,
                                               instance=instance))

    heartbeat_lag = lag['sbm']
    if heartbeat_lag > mysql_lib.MAX_HEARTBEAT_LAG:
        too_slow = 'Heartbeat lag {sbm} > {max_lag} seconds'
        raise Exception(too_slow.format(sbm=heartbeat_lag,
                                        max_lag=mysql_lib.MAX_HEARTBEAT_LAG))

    io_lag = lag['io_bytes']
    if io_lag > mysql_lib.MAX_IO_LAG:
        too_far = 'IO lag {io_bytes} > {max_io} bytes'
        raise Exception(too_far.format(io_bytes=io_lag,
                                       max_io=mysql_lib.MAX_IO_LAG))

    # The master address is rebuilt from the slave status as host:port
    master_host = slave_status['Master_Host']
    master_port = str(slave_status['Master_Port'])
    return host_utils.HostAddr(':'.join((master_host, master_port)))
Example #2
0
def sanity_check_replica(instance):
    """ Check that a slave is replicating and has not fallen too far behind

    Args:
    instance - A hostaddr object

    Returns:
    A hostaddr object of the master that the instance argument points at

    Raises:
    Exception - when master status is unavailable on the instance, when
                the SQL or IO thread is stopped, or when the heartbeat lag
                or IO lag is larger than the mysql_lib maximum
    """
    def fail(template, **fields):
        # Every failed check is reported as a plain Exception
        raise Exception(template.format(**fields))

    # A slave that is not setup for replication fails the check outright
    db_conn = mysql_lib.connect_mysql(instance)
    try:
        mysql_lib.get_master_status(db_conn)
    except mysql_lib.ReplicationError:
        fail('{instance} is not setup to write replicaiton logs!',
             instance=instance)

    stats = mysql_lib.calc_slave_lag(instance)

    if not stats['ss']['Slave_SQL_Running'] == 'Yes':
        fail('SQL thread is not running on {instance}', instance=instance)

    if not stats['ss']['Slave_IO_Running'] == 'Yes':
        fail('IO thread is not running on {instance}', instance=instance)

    if stats['sbm'] > mysql_lib.MAX_HEARTBEAT_LAG:
        fail('Heartbeat lag {sbm} > {max_lag} seconds',
             sbm=stats['sbm'],
             max_lag=mysql_lib.MAX_HEARTBEAT_LAG)

    if stats['io_bytes'] > mysql_lib.MAX_IO_LAG:
        fail('IO lag {io_bytes} > {max_io} bytes',
             io_bytes=stats['io_bytes'],
             max_io=mysql_lib.MAX_IO_LAG)

    # Build the host:port address of the master from the slave status
    address_parts = (stats['ss']['Master_Host'],
                     str(stats['ss']['Master_Port']))
    return host_utils.HostAddr(':'.join(address_parts))
def rotate_binlogs_if_needed(port, dry_run):
    """ Flush the current binary log if it is older than MAX_AGE seconds

    Args:
    port - The port of the MySQL instance on host_utils.HOSTNAME
    dry_run - If true, only log the age of the binlog and do not flush it
    """
    instance = host_utils.HostAddr(':'.join((host_utils.HOSTNAME, str(port))))
    log_bin_dir = host_utils.get_cnf_setting('log_bin', port)
    binlog = os.path.join(os.path.dirname(log_bin_dir),
                          mysql_lib.get_master_status(instance)['File'])
    # We don't update access time, so this is creation time.
    # The timestamp is converted in UTC so that it is comparable with
    # utcnow() whatever the timezone of the host is.
    creation = datetime.datetime.utcfromtimestamp(os.stat(binlog).st_atime)
    # timedelta.seconds only holds the part of the age below one day, so a
    # binlog older than 24 hours would look young. total_seconds() counts
    # the whole interval.
    age = int((datetime.datetime.utcnow() - creation).total_seconds())
    if age > MAX_AGE:
        log.info('Age of current binlog is {age} which is greater than '
                 'MAX_AGE ({MAX_AGE})'.format(age=age, MAX_AGE=MAX_AGE))
        if not dry_run:
            log.info('Flushing bin log')
            mysql_lib.flush_master_log(instance)
    else:
        log.info('Age of current binlog is {age} which is less than '
                 'MAX_AGE ({MAX_AGE})'.format(age=age, MAX_AGE=MAX_AGE))
Example #4
0
def rotate_binlogs_if_needed(port, dry_run):
    """ Flush the current binary log if it is older than MAX_AGE seconds

    Args:
    port - The port of the MySQL instance on host_utils.HOSTNAME
    dry_run - If true, only log the age of the binlog and do not flush it
    """
    instance = host_utils.HostAddr(':'.join((host_utils.HOSTNAME,
                                             str(port))))
    log_bin_dir = host_utils.get_cnf_setting('log_bin', port)
    binlog = os.path.join(os.path.dirname(log_bin_dir),
                          mysql_lib.get_master_status(instance)['File'])
    # We don't update access time, so this is creation time.
    # The timestamp is converted in UTC so that it is comparable with
    # utcnow() whatever the timezone of the host is.
    creation = datetime.datetime.utcfromtimestamp(os.stat(binlog).st_atime)
    # timedelta.seconds only holds the part of the age below one day, so a
    # binlog older than 24 hours would look young. total_seconds() counts
    # the whole interval.
    age = int((datetime.datetime.utcnow() - creation).total_seconds())
    if age > MAX_AGE:
        # The message previously had a doubled space before MAX_AGE
        log.info('Age of current binlog is {age} which is greater than '
                 'MAX_AGE ({MAX_AGE})'.format(age=age,
                                              MAX_AGE=MAX_AGE))
        if not dry_run:
            log.info('Flushing bin log')
            mysql_lib.flush_master_log(instance)
    else:
        log.info('Age of current binlog is {age} which is less than '
                 'MAX_AGE ({MAX_AGE})'.format(age=age,
                                              MAX_AGE=MAX_AGE))
Example #5
0
def mysql_failover(master, dry_run, skip_lock,
                   ignore_dr_slave, trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    The slave of the master's replica set is promoted. Sanity checks run
    first; any failure inside the checks or the promotion steps is rolled
    back (read_only and replication settings on the old master, promotion
    lock) and then reraised.

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existance of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Send a mysqladmin kill command to the old master
                      (not done in dry_run mode)

    Returns:
    None - the new master is not returned to the caller

    Raises:
    Exception - if the slave and the dr_slave are the same instance, or if
                a zk connection can not be made while the master is dead.
                Exceptions from the sanity checks are reraised after the
                rollback.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master,
                                                        rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there any problems we roll back from the except
    try:
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set,
            repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info('DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # let's make sure that what we think is the master, actually is
        confirm_replica_topology(master, replicas)

        # We use master_conn as a mysql connection to the master server, if
        # it is False, the master is dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)
        slave_conn = mysql_lib.connect_mysql(slave)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        try:
            mysql_lib.get_master_status(slave_conn)
        except mysql_lib.ReplicationError:
            log.error('New master {slave} is not setup to write replicaiton '
                      'logs!'.format(slave=slave))
            raise
        log.info('Slave/new master is setup to write replication logs')

        # A dry run must not change state, so the old master is only shut
        # down when this is a real failover
        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas, MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas, MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get catch in the giant try
            os._exit(0)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master_conn, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master_conn)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas, 0,
                                    timeout=MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave_conn)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming relpica servers in sync')
                confirm_max_replica_lag(replicas, MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                        replicas_synced=True,
                                        dead_master=dead_master)
    except:
        log.info('Starting rollback')
        try:
            if master_conn:
                log.info('Releasing read_only on old master')
                mysql_lib.set_global_variable(master_conn, 'read_only', False)

                log.info('Clearing replication settings on old master')
                mysql_lib.reset_slave(master_conn)
        finally:
            # The lock is given back even if undoing the changes on the old
            # master fails, otherwise it would be leaked
            if lock_identifier:
                log.info('Releasing promotion lock')
                release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # From here on there is no rollback. A dr_slave failure is only logged.
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    # The zk write is retried until the attempt counter exceeds
    # MAX_ZK_WRITE_ATTEMPTS, then the last error is reraised
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt+1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave_conn, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave_conn)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)
def mysql_failover(master, dry_run, skip_lock, ignore_dr_slave,
                   trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    The slave of the master's replica set is promoted. Sanity checks run
    first; any failure inside the checks or the promotion steps is rolled
    back (read_only and replication settings on the old master, promotion
    lock) and then reraised.

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existance of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Send a mysqladmin kill command to the old master
                      (not done in dry_run mode)

    Returns:
    None - the new master is not returned to the caller

    Raises:
    Exception - if the slave and the dr_slave are the same instance, or if
                a zk connection can not be made while the master is dead.
                Exceptions from the sanity checks are reraised after the
                rollback.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master,
                                                        rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there any problems we roll back from the except
    try:
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set, repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info(
            'DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # We use master_conn as a mysql connection to the master server, if
        # it is False, the master is dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        mysql_lib.get_master_status(slave)

        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NORMAL,
                                    dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                    dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get catch in the giant try
            os._exit(environment_specific.DRY_RUN_EXIT_CODE)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NONE,
                                    dead_master, True,
                                    mysql_lib.NORMAL_HEARTBEAT_LAG)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming replica servers are synced')
                confirm_max_replica_lag(replicas,
                                        mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                        dead_master, True)
    except:
        log.info('Starting rollback')
        try:
            if master_conn:
                log.info('Releasing read_only on old master')
                mysql_lib.set_global_variable(master, 'read_only', False)

                log.info('Clearing replication settings on old master')
                mysql_lib.reset_slave(master)
        finally:
            # The lock is given back even if undoing the changes on the old
            # master fails, otherwise it would be leaked
            if lock_identifier:
                log.info('Releasing promotion lock')
                release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # From here on there is no rollback. A dr_slave failure is only logged.
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    # The zk write is retried until the attempt counter exceeds
    # MAX_ZK_WRITE_ATTEMPTS, then the last error is reraised
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt + 1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    # we don't really care if this fails, but we'll print a message anyway.
    try:
        environment_specific.generic_json_post(
            environment_specific.CHANGE_FEED_URL, {
                'type': 'MySQL Failover',
                'environment': replica_set,
                'description': "Failover from {m} to {s}".format(m=master,
                                                                 s=slave),
                'author': host_utils.get_user(),
                'automation': False,
                'source': "mysql_failover.py on {}".format(host_utils.HOSTNAME)
            })
    except Exception as e:
        log.warning("Failover completed, but change feed "
                    "not updated: {}".format(e))

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)
Example #7
0
def mysql_failover(master, dry_run, skip_lock, ignore_dr_slave,
                   trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    The slave of the master's replica set is promoted. Sanity checks run
    first; any failure inside the checks or the promotion steps is rolled
    back (read_only and replication settings on the old master, promotion
    lock) and then reraised.

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existance of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Send a mysqladmin kill command to the old master
                      (not done in dry_run mode)

    Returns:
    None - the new master is not returned to the caller

    Raises:
    Exception - if the slave and the dr_slave are the same instance, or if
                a zk connection can not be made while the master is dead.
                Exceptions from the sanity checks are reraised after the
                rollback.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master,
                                                        rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there any problems we roll back from the except
    try:
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set, repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info(
            'DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # let's make sure that what we think is the master, actually is
        confirm_replica_topology(master, replicas)

        # We use master_conn as a mysql connection to the master server, if
        # it is False, the master is dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)
        slave_conn = mysql_lib.connect_mysql(slave)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        try:
            mysql_lib.get_master_status(slave_conn)
        except mysql_lib.ReplicationError:
            log.error('New master {slave} is not setup to write replicaiton '
                      'logs!'.format(slave=slave))
            raise
        log.info('Slave/new master is setup to write replication logs')

        # A dry run must not change state, so the old master is only shut
        # down when this is a real failover
        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get catch in the giant try
            os._exit(0)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master_conn, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master_conn)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas,
                                    0,
                                    timeout=MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave_conn)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming relpica servers in sync')
                confirm_max_replica_lag(replicas,
                                        MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                        replicas_synced=True,
                                        dead_master=dead_master)
    except:
        log.info('Starting rollback')
        try:
            if master_conn:
                log.info('Releasing read_only on old master')
                mysql_lib.set_global_variable(master_conn, 'read_only', False)

                log.info('Clearing replication settings on old master')
                mysql_lib.reset_slave(master_conn)
        finally:
            # The lock is given back even if undoing the changes on the old
            # master fails, otherwise it would be leaked
            if lock_identifier:
                log.info('Releasing promotion lock')
                release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # From here on there is no rollback. A dr_slave failure is only logged.
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    # The zk write is retried until the attempt counter exceeds
    # MAX_ZK_WRITE_ATTEMPTS, then the last error is reraised
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt + 1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave_conn, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave_conn)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)
Example #8
0
def mysql_failover(master, dry_run, skip_lock,
                   trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    The first slave returned by host_utils.get_slaves_of_master becomes the
    new master and the remaining slaves are pointed at it. Failures inside
    the sanity checks or the promotion steps are rolled back and reraised;
    failures to re-point a remaining slave are only logged and collected.

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Send a mysqladmin kill command to the old master
                      (not done in dry_run mode)

    Returns:
    None - the new master is not returned to the caller, the outcome is
           passed to report()

    Raises:
    Exception - if the master has no slaves. Exceptions from the sanity
                checks are reraised after the rollback.
    """

    log.info('Master to demote is {master}'.format(master=master))

    slaves = host_utils.get_slaves_of_master(master)
    if not slaves:
        raise Exception('No slaves found.')

    # The first slave is promoted, every other slave stays a replica
    new_master = slaves[0]
    replicas = slaves[1:]

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on {new_master}'.format(new_master=new_master))
        cluster = host_utils.get_cluster_name(new_master)
        lock_identifier = get_promotion_lock(cluster)
    else:
        lock_identifier = None

    # giant try. If there any problems we roll back from the except
    try:
        master_conn = False
        log.info('New master is detected as {slave}'.format(slave=new_master))

        # We use master_conn as a mysql connection to the master server, if
        # it is False, the master is dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, slaves)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if New master {new_master} is setup to write '
                 'replication logs'.format(new_master=new_master))
        mysql_lib.get_master_status(new_master)

        # The old master is never shut down during a dry run
        if kill_old_master and not dry_run:
            log.info('Killing old master {}, we hope you know what you are doing'.format(master))
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        # The lag tolerance depends on whether the master is alive
        if master_conn:
            log.info('Master {} is considered alive'.format(master))
            dead_master = False
            confirm_max_replica_lag(slaves,
                                    mysql_lib.REPLICATION_TOLERANCE_NORMAL,
                                    dead_master)
        else:
            log.info('Master {} is considered dead'.format(master))
            dead_master = True
            confirm_max_replica_lag(slaves,
                                    mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                    dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get catch in the giant try
            os._exit(0)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master {}'.format(master))
            mysql_lib.set_global_read_only(master)
            log.info('Confirming no writes to old master {}'.format(master))
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(slaves,
                                    mysql_lib.REPLICATION_TOLERANCE_NONE,
                                    dead_master,
                                    True,
                                    mysql_lib.NORMAL_HEARTBEAT_LAG)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=new_master))
            mysql_lib.setup_replication(new_master=new_master, new_replica=master)
        else:
            log.info('Confirming replica/new master {} has processed all replication '
                     ' logs'.format(new_master))
            confirm_no_writes(new_master)
            log.info('Looks like no writes being processed by replica {} via '
                     'replication or other means'.format(new_master))
            if len(slaves) > 1:
                log.info('Confirming replica servers are synced')
                confirm_max_replica_lag(slaves,
                                        mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                        dead_master,
                                        True)
    except:
        log.info('Starting rollback')
        try:
            if master_conn:
                log.info('Releasing read_only on old master {}'.format(master))
                mysql_lib.unset_global_read_only(master)

                log.info('Clearing replication settings on old master {}'.format(master))
                mysql_lib.reset_slave(master)
        finally:
            # The lock is released even if undoing the changes on the old
            # master raises
            if lock_identifier:
                log.info('Releasing promotion lock')
                release_promotion_lock(lock_identifier)
            log.info('Rollback complete, reraising exception')
        raise

    # From here on there is no rollback. Slaves that can not be pointed at
    # the new master are collected and handed to report().
    failed_replicas = []
    for s in replicas:
        log.info('Setting up replication {replication} from old master ({master}) '
         'to new master ({slave})'.format(master=master,
                                          slave=new_master,
                                          replication=s))
        try:
            mysql_lib.setup_replication(new_master=new_master, new_replica=s)
        except Exception as e:
            failed_replicas.append(s)
            log.error(e)
            log.error('Setting up replication on the slave {} failed. Failing forward!'.format(s))

    log.info('Updating config')
    host_utils.swap_master_and_slave(new_master)

    log.info('Removing read_only from new master {}'.format(new_master))
    mysql_lib.unset_global_read_only(new_master)
    log.info('Removing replication configuration from new master {}'.format(new_master))
    mysql_lib.reset_slave(new_master)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)
    report(master, slaves, new_master, replicas, failed_replicas)
Example #9
0
def mysql_failover(master, dry_run, skip_lock,
                   ignore_dr_slave, trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    The function runs in three phases:
    1. Sanity checks and promotion prep inside one large try block. Any
       failure there triggers a rollback (old master made writable again,
       its replication settings cleared, promotion lock released) and the
       original exception is re-raised.
    2. Point of no return: re-point the dr_slave (failures are logged and
       ignored), swap master and slave in zk, make the new master writable
       and clear its replication configuration.
    3. Best-effort follow-up: post to the change feed and, if the old
       master was not considered alive, launch a replacement host.

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existence of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Shut down MySQL on the old master before promoting
                      (ignored in dry_run mode)

    Returns:
    None. In dry_run mode the process is terminated with os._exit using
    environment_specific.DRY_RUN_EXIT_CODE once the sanity checks pass.

    Raises:
    Exception - if the slave and dr_slave are the same instance, or if a
                zk client can not be obtained while the master is dead.
                Anything raised by the helpers during phase 1 is re-raised
                after the rollback.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master, rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # Initialised before the try so the rollback handler can always read it.
    # A falsy value (False or None) means the master is treated as dead.
    master_conn = False

    # giant try. If there any problems we roll back from the except
    try:
        slave = zk.get_mysql_instance_from_replica_set(replica_set=replica_set,
                                                       repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(replica_set,
                                                              host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info('DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # We use master_conn as a mysql connection to the master server, if
        # it is falsy, the master is dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        mysql_lib.get_master_status(slave)

        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            # From here on the old master is handled as dead
            master_conn = None

        # A live master gets the normal lag tolerance, a dead one the loose
        # tolerance.
        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NORMAL,
                                    dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                    dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get catch in the giant try
            os._exit(environment_specific.DRY_RUN_EXIT_CODE)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NONE,
                                    dead_master,
                                    True,
                                    mysql_lib.NORMAL_HEARTBEAT_LAG)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            # The client itself is not used; this only verifies that zk is
            # reachable before we commit to the promotion.
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming replica servers are synced')
                confirm_max_replica_lag(replicas,
                                        mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                        dead_master,
                                        True)
    except:
        # Bare except is deliberate: roll back on anything, then re-raise.
        log.info('Starting rollback')
        try:
            if master_conn:
                log.info('Releasing read_only on old master')
                mysql_lib.set_global_variable(master, 'read_only', False)

                log.info('Clearing replication settings on old master')
                mysql_lib.reset_slave(master)
        finally:
            # Release the lock even if undoing the master changes failed,
            # otherwise the replica set would stay locked.
            if lock_identifier:
                log.info('Releasing promotion lock')
                release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # Past this point there is no rollback; failures are logged and we
    # keep going.
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    # Keep retrying the zk write until the failure counter exceeds
    # MAX_ZK_WRITE_ATTEMPTS, then give up and re-raise the last error.
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt += 1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    # we don't really care if this fails, but we'll print a message anyway.
    try:
        environment_specific.generic_json_post(
            environment_specific.CHANGE_FEED_URL,
            {'type': 'MySQL Failover',
             'environment': replica_set,
             'description': "Failover from {m} to {s}".format(m=master, s=slave),
             'author': host_utils.get_user(),
             'automation': False,
             'source': "mysql_failover.py on {}".format(host_utils.HOSTNAME)})
    except Exception as e:
        log.warning("Failover completed, but change feed "
                    "not updated: {}".format(e))

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)