def confirm_max_replica_lag(replicas, lag_tolerance, dead_master,
                            replicas_synced=False, timeout=0):
    """ Test replication lag

    Args:
    replicas - A set of hostaddr objects to be tested for replication lag
    lag_tolerance - Max computed replication lag in seconds. If 0 is
                    supplied, then exec position is compared from replica
                    servers to the master rather than using computed
                    seconds behind, as the heartbeat will be blocked by
                    read_only.
    replicas_synced - Replica servers must have executed to the same
                      position in the binary log.
    timeout - How long to wait for replication to be in the desired state
    """
    start = time.time()
    if dead_master:
        replication_checks = set([mysql_lib.CHECK_SQL_THREAD,
                                  mysql_lib.CHECK_CORRECT_MASTER])
    else:
        replication_checks = mysql_lib.ALL_REPLICATION_CHECKS

    while True:
        acceptable = True
        for replica in replicas:
            # Confirm threads are running, expected master
            try:
                mysql_lib.assert_replication_sanity(replica,
                                                    replication_checks)
            except Exception as e:
                log.warning(e)
                log.info('Trying to restart replication, then '
                         'sleep 20 seconds')
                mysql_lib.restart_replication(replica)
                time.sleep(20)
                mysql_lib.assert_replication_sanity(replica,
                                                    replication_checks)

            try:
                mysql_lib.assert_replication_unlagged(replica,
                                                      lag_tolerance,
                                                      dead_master)
            except Exception as e:
                log.warning(e)
                acceptable = False

        if replicas_synced and not confirm_replicas_in_sync(replicas):
            acceptable = False
            log.warning('Replica servers are not in sync and '
                        'replicas_synced is set')

        if acceptable:
            return
        elif (time.time() - start) > timeout:
            raise Exception('Replication is not in an acceptable state on '
                            'replica {r}'.format(r=replica))
        else:
            log.info('Sleeping for 5 seconds to allow replication '
                     'to catch up')
            time.sleep(5)

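# Illustrative only: a hypothetical helper (not part of this module) showing
# how confirm_max_replica_lag might be called before a planned promotion,
# while the master is still alive. The 30-second tolerance and 5-minute
# timeout are example values, not values this codebase prescribes.
def _example_wait_for_quiet_replicas(replicas):
    confirm_max_replica_lag(replicas,
                            lag_tolerance=30,
                            dead_master=False,
                            replicas_synced=False,
                            timeout=300)
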
def is_master_alive(master, replicas):
    """ Determine if the master is alive

    The function will:
    1. Attempt to connect to the master via the mysql protocol. If
       successful, the master is considered alive.
    2. If #1 fails, check the io thread of the replica instance(s). If the
       io thread is not running, the master will be considered dead. If
       step #1 fails and step #2 succeeds, we are in a weird state and
       will throw an exception.

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instances

    Returns:
    A mysql connection to the master if the master is alive,
    False otherwise.
    """
    if len(replicas) == 0:
        raise Exception('At least one replica must be present to determine '
                        'if a master is dead')
    try:
        master_conn = mysql_lib.connect_mysql(master)
        return master_conn
    except MySQLdb.OperationalError as detail:
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        master_conn = False
        log.info('Unable to connect to current master {master} from '
                 '{hostname}, will check replica servers before declaring '
                 'the master dead'.format(master=master,
                                          hostname=host_utils.HOSTNAME))
    except:
        log.info('This is an unknown connection error. If you are very sure '
                 'that the master is dead, please put a "return False" at '
                 'the top of is_master_alive and then send rwultsch a '
                 'stack trace')
        raise

    # We can not get a connection to the master, so poll the replica servers
    for replica in replicas:
        # If replication has not hit a timeout, a dead master can still have
        # a replica which thinks it is ok. "STOP SLAVE; START SLAVE" followed
        # by a sleep will get us truthiness.
        mysql_lib.restart_replication(replica)
        try:
            mysql_lib.assert_replication_sanity(replica)
        except:
            # The exception is expected in this case: the replica can not
            # reach the master either.
            log.info('Replica {replica} also can not connect to master '
                     '{master}.'.format(replica=replica,
                                        master=master))
            continue
        raise Exception('Replica {replica} thinks it can connect to '
                        'master {master}, but failover script can not. '
                        'Possible network partition!'
                        ''.format(replica=replica,
                                  master=master))

    return False

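# Illustrative only: a hypothetical snippet (not part of the failover
# script) showing how is_master_alive is typically consumed. It returns a
# live connection when the master answers, and False when both the direct
# connection attempt and every replica agree the master is gone, so the
# caller branches on the truthiness of the return value.
def _example_choose_failover_path(master, replicas):
    master_conn = is_master_alive(master, replicas)
    if master_conn:
        log.info('Master is alive; a planned promotion could demote it '
                 'cleanly using this connection.')
    else:
        log.info('Master is dead; a dead-master promotion would skip the '
                 'demotion steps.')
    return master_conn
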
def restore_instance(backup_type, restore_source, destination,
                     no_repl, date,
                     add_to_zk, skip_production_check):
    """ Restore a MySQL backup onto localhost

    Args:
    backup_type - Type of backup to restore
    restore_source - A hostaddr object for where to pull a backup from
    destination - A hostaddr object for where to restore the backup
    no_repl - Should replication not be started. It will always be set up.
    date - What date should the backup be from
    add_to_zk - Should the instance be added to zk. If so, the log from the
                host being launched will be consulted.
    skip_production_check - Do not check if the host is already in zk for
                            production use.
    """
    log.info('Supplied source is {source}'.format(source=restore_source))
    log.info('Supplied destination is {dest}'.format(dest=destination))
    log.info('Desired date of restore {date}'.format(date=date))
    zk = host_utils.MysqlZookeeper()

    # Try to prevent unintentional destruction of prod servers
    log.info('Confirming no prod instances running on destination')
    prod_check(destination, skip_production_check)

    # Take a lock to prevent multiple restores from running concurrently
    log.info('Taking a lock to block another restore from starting')
    lock_handle = host_utils.bind_lock_socket(backup.STD_BACKUP_LOCK_SOCKET)

    log.info('Looking for a backup to restore')
    if restore_source:
        possible_sources = [restore_source]
    else:
        possible_sources = get_possible_sources(destination, backup_type)
    backup_key = find_a_backup_to_restore(possible_sources, destination,
                                          backup_type, date)

    # Figure out what we will use as the master when we set up replication
    (restore_source,
     _) = backup.get_metadata_from_backup_file(backup_key.name)
    try:
        replica_set = restore_source.get_zk_replica_set()
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
    except:
        # ZK has no idea what this replica set is, probably a new replica set.
        master = restore_source

    # Start logging
    row_id = backup.start_restore_log(master, {
        'restore_source': restore_source,
        'restore_port': destination.port,
        'restore_file': backup_key.name,
        'source_instance': destination.hostname,
        'restore_date': date,
        'replication': no_repl,
        'zookeeper': add_to_zk})

    # Giant try to allow logging if anything goes wrong.
    try:
        # If we hit an exception, this status will be used. If not, it will
        # be overwritten
        restore_log_update = {'restore_status': 'BAD'}

        # This also ensures that all needed directories exist
        log.info('Rebuilding local mysql instance')
        lock_handle = mysql_init_server.mysql_init_server(
            destination, skip_production_check=True,
            skip_backup=True, lock_handle=lock_handle)

        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            xbstream_restore(backup_key, destination.port)
            if master == restore_source:
                log.info('Pulling replication info for restore from '
                         'backup source')
                (binlog_file, binlog_pos,
                 gtid_purged) = backup.parse_xtrabackup_binlog_info(
                    destination.port)
            else:
                log.info('Pulling replication info for restore from '
                         'master of backup source')
                # if our backup came from a GTID server, we won't have
                # a binlog_file and a binlog_pos, so we need to see if
                # we can get a set of purged GTIDs
                (binlog_file, binlog_pos,
                 gtid_purged) = backup.parse_xtrabackup_slave_info(
                    destination.port)

        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            log.info('Preparing replication')
            # We are importing a mysqldump which was created with
            # --master-data or --dump-slave so there will be a CHANGE MASTER
            # statement at the start of the dump. MySQL will basically just
            # ignore a CHANGE MASTER command if master_host is not already
            # setup. So we are setting master_host, username and password
            # here. We use BOGUS for master_log_file so that the IO thread is
            # intentionally broken. With no argument for master_log_file,
            # the IO thread would start downloading the first bin log and
            # the SQL thread would start executing...
            mysql_lib.change_master(destination, master, 'BOGUS', 0,
                                    no_start=True)
            # reset master on slave before we load anything to ensure that
            # we can set GTID info from the backup, if it exists.
            mysql_lib.reset_master(destination)
            logical_restore(backup_key, destination)
            host_utils.stop_mysql(destination.port)

        log.info('Running MySQL upgrade')
        host_utils.upgrade_auth_tables(destination.port)

        log.info('Starting MySQL')
        host_utils.start_mysql(
            destination.port,
            options=host_utils.DEFAULTS_FILE_EXTRA_ARG.format(
                defaults_file=host_utils.MYSQL_NOREPL_CNF_FILE))

        # Since we haven't started the slave yet, make sure we've got these
        # plugins installed, whether we use them or not.
        mysql_lib.setup_semisync_plugins(destination)
        mysql_lib.setup_audit_plugin(destination)
        mysql_lib.setup_response_time_metrics(destination)
        restore_log_update = {'restore_status': 'OK'}

        # Try to configure replication.
        log.info('Setting up MySQL replication')
        restore_log_update['replication'] = 'FAIL'
        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            # before we change master, reset master on the
            # slave to clear out any GTID errant transactions.
            mysql_lib.reset_master(destination)
            mysql_lib.change_master(destination,
                                    master,
                                    binlog_file,
                                    binlog_pos,
                                    gtid_purged=gtid_purged,
                                    no_start=(no_repl == 'SKIP'))
        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            if no_repl == 'SKIP':
                log.info('As requested, not starting replication.')
            else:
                mysql_lib.restart_replication(destination)
        if no_repl == 'REQ':
            mysql_lib.wait_for_catch_up(destination)
        restore_log_update['replication'] = 'OK'

        host_utils.manage_pt_daemons(destination.port)

    except Exception as e:
        log.error(e)
        if row_id is not None:
            restore_log_update['status_message'] = e
            restore_log_update['finished_at'] = True
        raise
    finally:
        # As with mysql_init_server, we have to do one more restart to
        # clear out lock ownership, but here we have to also do it with
        # the proper config file.
        if lock_handle:
            log.info('Releasing lock and restarting MySQL')
            host_utils.stop_mysql(destination.port)
            time.sleep(5)
            host_utils.release_lock_socket(lock_handle)
            if no_repl == 'SKIP':
                host_utils.start_mysql(
                    destination.port,
                    options=host_utils.DEFAULTS_FILE_EXTRA_ARG.format(
                        defaults_file=host_utils.MYSQL_NOREPL_CNF_FILE))
            else:
                host_utils.start_mysql(destination.port)
        backup.update_restore_log(master, row_id, restore_log_update)

    try:
        if add_to_zk == 'REQ':
            if no_repl == 'REQ':
                log.info('Waiting for replication again, as it may have '
                         'drifted due to restart.')
                mysql_lib.wait_for_catch_up(destination)
                log.info('Waiting for IO lag in case it is still too far '
                         'behind even after waiting for resync')
                mysql_lib.wait_for_catch_up(destination, io=True)
            log.info('Adding instance to zk.')
            modify_mysql_zk.auto_add_instance_to_zk(destination.port,
                                                    dry_run=False)
            backup.update_restore_log(master, row_id, {'zookeeper': 'OK'})
        else:
            log.info('add_to_zk is not set, therefore not adding to zk')
    except Exception as e:
        log.warning("An exception occurred: {}".format(e))
        log.warning("If this is a DB issue, that's fine. "
                    "Otherwise, you should check ZK.")

    backup.update_restore_log(master, row_id, {'finished_at': True})

    if no_repl == 'REQ':
        log.info('Starting a new backup')
        mysql_backup.mysql_backup(destination, initial_build=True)

def confirm_max_replica_lag(replicas, max_lag, dead_master,
                            replicas_synced=False, timeout=0):
    """ Test replication lag

    Args:
    replicas - A set of hostaddr objects to be tested for replication lag
    max_lag - Max computed replication lag in seconds. If 0 is supplied,
              then exec position is compared from replica servers to the
              master rather than using computed seconds behind, as the
              heartbeat will be blocked by read_only.
    replicas_synced - Replica servers must have executed to the same
                      position in the binary log.
    timeout - How long to wait for replication to be in the desired state
    """
    repl_checks = dict()
    start = time.time()
    while True:
        acceptable = True
        for replica in replicas:
            repl_check = mysql_lib.calc_slave_lag(replica,
                                                  dead_master=dead_master)
            repl_checks[replica.__str__()] = ':'.join(
                (repl_check['ss']['Relay_Master_Log_File'],
                 str(repl_check['ss']['Exec_Master_Log_Pos'])))

            # Basic sanity
            if repl_check['sbm'] is None:
                raise Exception(
                    'Computed replication lag is unavailable for {replica}, '
                    'perhaps restart pt-heartbeat '
                    'on the master?'.format(replica=replica))

            if repl_check['ss']['Slave_SQL_Running'] != 'Yes':
                log.info('SQL thread is not running, trying to restart, '
                         'then sleep 20 seconds')
                conn = mysql_lib.connect_mysql(replica)
                mysql_lib.restart_replication(conn)
                time.sleep(20)
                repl_check = mysql_lib.calc_slave_lag(
                    replica, dead_master=dead_master)
                if repl_check['ss']['Slave_SQL_Running'] != 'Yes':
                    raise Exception('SQL thread on {replica} has serious '
                                    'problems'.format(replica=replica))

            if max_lag == 0:
                if repl_check['sql_bytes'] != 0:
                    acceptable = False
                    log.warning('Unprocessed log on {replica} is '
                                '{sql_bytes} bytes > 0'
                                ''.format(replica=replica,
                                          sql_bytes=repl_check['sql_bytes']))
                else:
                    log.info('{replica} is in sync with the '
                             'master'.format(replica=replica))
            else:
                if repl_check['sbm'] > max_lag:
                    acceptable = False
                    log.warning('Lag on {replica} is {lag} seconds, which '
                                'is greater than the limit of '
                                '{limit}'.format(replica=replica,
                                                 limit=max_lag,
                                                 lag=repl_check['sbm']))
                else:
                    log.info('Lag on {replica} is {lag} seconds, which is '
                             '<= the limit of '
                             '{limit}'.format(replica=replica,
                                              limit=max_lag,
                                              lag=repl_check['sbm']))

        if replicas_synced and len(set(repl_checks.values())) != 1:
            acceptable = False
            raise Exception(
                'Replica servers are not in sync and replicas_synced '
                'is set. Replication status: '
                '{repl_checks}'.format(repl_checks=repl_checks))

        if acceptable:
            return
        elif (time.time() - start) > timeout:
            raise Exception('Replication is not in an acceptable state')
        else:
            log.info('Sleeping for 5 seconds to allow replication '
                     'to catch up')
            time.sleep(5)

def restore_instance(backup_type, restore_source, destination,
                     no_repl, date,
                     add_to_zk, skip_production_check):
    """ Restore a MySQL backup onto localhost

    Args:
    backup_type - Type of backup to restore
    restore_source - A hostaddr object for where to pull a backup from
    destination - A hostaddr object for where to restore the backup
    no_repl - Should replication not be started. It will always be set up.
    date - What date should the backup be from
    add_to_zk - Should the instance be added to zk. If so, the log from the
                host being launched will be consulted.
    skip_production_check - Do not check if the host is already in zk for
                            production use.
    """
    log.info('Supplied source is {source}'.format(source=restore_source))
    log.info('Supplied destination is {dest}'.format(dest=destination))
    log.info('Desired date of restore {date}'.format(date=date))
    zk = host_utils.MysqlZookeeper()

    # Try to prevent unintentional destruction of prod servers
    log.info('Confirming no prod instances running on destination')
    prod_check(destination, skip_production_check)

    # Take a lock to prevent multiple restores from running concurrently
    log.info('Taking a flock to block another restore from starting')
    lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

    log.info('Looking for a backup to restore')
    if restore_source:
        possible_sources = [restore_source]
    else:
        possible_sources = get_possible_sources(destination, backup_type)
    backup_key = find_a_backup_to_restore(possible_sources, destination,
                                          backup_type, date)

    # Figure out what we will use as the master when we set up replication
    (restore_source,
     _) = backup.get_metadata_from_backup_file(backup_key.name)
    if restore_source.get_zk_replica_set():
        replica_set = restore_source.get_zk_replica_set()[0]
        master = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_MASTER)
    else:
        # ZK has no idea what this replica set is, probably a new replica set.
        master = restore_source

    # Start logging
    row_id = backup.start_restore_log(master, {
        'restore_source': restore_source,
        'restore_port': destination.port,
        'restore_file': backup_key.name,
        'source_instance': destination.hostname,
        'restore_date': date,
        'replication': no_repl,
        'zookeeper': add_to_zk})

    # Giant try to allow logging if anything goes wrong.
    try:
        # If we hit an exception, this status will be used. If not, it will
        # be overwritten
        restore_log_update = {'restore_status': 'BAD'}

        # This also ensures that all needed directories exist
        log.info('Rebuilding local mysql instance')
        mysql_init_server.mysql_init_server(destination,
                                            skip_production_check=True,
                                            skip_backup=True,
                                            skip_locking=True)

        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            xbstream_restore(backup_key, destination.port)
            if master == restore_source:
                log.info('Pulling replication info from restore to '
                         'backup source')
                (binlog_file,
                 binlog_pos) = backup.parse_xtrabackup_binlog_info(
                    destination.port)
            else:
                log.info('Pulling replication info from restore to '
                         'master of backup source')
                (binlog_file,
                 binlog_pos) = backup.parse_xtrabackup_slave_info(
                    destination.port)
        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            logical_restore(backup_key, destination)
            host_utils.stop_mysql(destination.port)

        log.info('Running MySQL upgrade')
        host_utils.upgrade_auth_tables(destination.port)

        log.info('Starting MySQL')
        host_utils.start_mysql(
            destination.port,
            options=host_utils.DEFAULTS_FILE_EXTRA_ARG.format(
                defaults_file=host_utils.MYSQL_NOREPL_CNF_FILE))

        # Since we haven't started the slave yet, make sure we've got these
        # plugins installed, whether we use them or not.
        mysql_lib.setup_semisync_plugins(destination)
        restore_log_update = {'restore_status': 'OK'}

        # Try to configure replication.
        log.info('Setting up MySQL replication')
        restore_log_update['replication'] = 'FAIL'
        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            mysql_lib.change_master(destination,
                                    master,
                                    binlog_file,
                                    binlog_pos,
                                    no_start=(no_repl == 'SKIP'))
        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            if no_repl == 'SKIP':
                log.info('As requested, not starting replication.')
            else:
                mysql_lib.restart_replication(destination)
        if no_repl == 'REQ':
            mysql_lib.wait_replication_catch_up(destination)
        restore_log_update['replication'] = 'OK'

        host_utils.restart_pt_daemons(destination.port)
        mysql_lib.setup_response_time_metrics(destination)

    except Exception as e:
        log.error(e)
        if row_id is not None:
            restore_log_update['status_message'] = e
            restore_log_update['finished_at'] = True
        raise
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
        backup.update_restore_log(master, row_id, restore_log_update)

    try:
        if add_to_zk == 'REQ':
            log.info('Adding instance to zk')
            modify_mysql_zk.auto_add_instance_to_zk(destination.port,
                                                    dry_run=False)
            backup.update_restore_log(master, row_id, {'zookeeper': 'OK'})
        else:
            log.info('add_to_zk is not set, therefore not adding to zk')
    except Exception as e:
        log.warning("An exception occurred: {e}".format(e=e))
        log.warning("If this is a DB issue, that's fine. "
                    "Otherwise, you should check ZK.")

    backup.update_restore_log(master, row_id, {'finished_at': True})

    if no_repl == 'REQ':
        log.info('Starting a new backup')
        mysql_backup.mysql_backup(destination, initial_build=True)
