def main():
    parser = argparse.ArgumentParser(description='MySQL replication checker')
    parser.add_argument('replica',
                        help='Replica MySQL instance to sanity check '
                             'hostname[:port]')
    parser.add_argument('-w',
                        '--watch_for_catch_up',
                        help='Watch replication for catch up',
                        default=False,
                        action='store_true')
    args = parser.parse_args()
    slave_hostaddr = host_utils.HostAddr(args.replica)

    if args.watch_for_catch_up:
        mysql_lib.wait_replication_catch_up(slave_hostaddr)
    else:
        ret = mysql_lib.calc_slave_lag(slave_hostaddr)

        print "Heartbeat_seconds_behind: {sbm}".format(sbm=ret['sbm'])
        print "Slave_IO_Running: {Slave_IO_Running}".format(
            Slave_IO_Running=ret['ss']['Slave_IO_Running'])
        print "IO_lag_bytes: {io_bytes}".format(io_bytes=ret['io_bytes'])
        print "IO_lag_binlogs: {io_binlogs}".format(
            io_binlogs=ret['io_binlogs'])
        print "Slave_SQL_Running: {Slave_SQL_Running}".format(
            Slave_SQL_Running=ret['ss']['Slave_SQL_Running'])
        print "SQL_lag_bytes: {sql_bytes}".format(sql_bytes=ret['sql_bytes'])
        print "SQL_lag_binlogs: {sql_binlogs}".format(
            sql_binlogs=ret['sql_binlogs'])
def main():
    parser = argparse.ArgumentParser(description='MySQL replication checker')
    parser.add_argument('replica',
                        help='Replica MySQL instance to sanity check '
                             'hostname[:port]')
    parser.add_argument('-w',
                        '--watch_for_catch_up',
                        help='Watch replication for catch up',
                        default=False,
                        action='store_true')
    args = parser.parse_args()
    slave_hostaddr = host_utils.HostAddr(args.replica)

    if args.watch_for_catch_up:
        mysql_lib.wait_for_catch_up(slave_hostaddr)
    else:
        ret = mysql_lib.calc_slave_lag(slave_hostaddr)

        print "Heartbeat_seconds_behind: {sbm}".format(sbm=ret['sbm'])
        print "Slave_IO_Running: {Slave_IO_Running}".format(
            Slave_IO_Running=ret['ss']['Slave_IO_Running'])
        print "IO_lag_bytes: {io_bytes}".format(io_bytes=ret['io_bytes'])
        print "IO_lag_binlogs: {io_binlogs}".format(
            io_binlogs=ret['io_binlogs'])
        print "Slave_SQL_Running: {Slave_SQL_Running}".format(
            Slave_SQL_Running=ret['ss']['Slave_SQL_Running'])
        print "SQL_lag_bytes: {sql_bytes}".format(sql_bytes=ret['sql_bytes'])
        print "SQL_lag_binlogs: {sql_binlogs}".format(
            sql_binlogs=ret['sql_binlogs'])
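# A minimal sketch of the imports and entry point the CLI above assumes.
# argparse is stdlib; host_utils and mysql_lib are modules from this repo.
# The __main__ guard is an assumption about how the script is wired up,
# e.g. "script.py replicahost:3306 -w" to block until the replica catches up.
import argparse

import host_utils
import mysql_lib


if __name__ == '__main__':
    main()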
def collectReplicationStatus(db):
    """ Collect replication stats using mysql_lib.calc_slave_lag """
    instance = host_utils.HostAddr(':'.join((socket.gethostname(), db.port)))
    ret = mysql_lib.calc_slave_lag(instance)
    printmetric(db, "slave.seconds_behind_master", ret['sbm'])
    printmetric(db, "slave.io_bytes_behind", ret["io_bytes"])
    printmetric(db, "slave.sql_bytes_behind", ret["sql_bytes"])
    printmetric(db, "slave.thread_io_running",
                int('yes' == ret['ss']['Slave_IO_Running'].lower()))
    printmetric(db, "slave.thread_sql_running",
                int('yes' == ret['ss']['Slave_SQL_Running'].lower()))
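# printmetric() is not defined in this snippet. A minimal sketch of a
# tcollector-style implementation is below; the "mysql." metric prefix,
# the epoch timestamp, and the port tag are assumptions, not the repo's
# actual helper.
import time


def printmetric(db, metric, value):
    # Emit one "metric timestamp value tag=value" line on stdout, the
    # format tcollector/OpenTSDB collectors consume.
    print "mysql.{metric} {ts} {value} port={port}".format(
        metric=metric, ts=int(time.time()), value=value, port=db.port)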
def sanity_check_replica(instance):
    """ Make sure a slave is slaving and relatively caught up

    Args:
    instance - A hostaddr object

    Returns:
    A hostaddr object of the master of the instance argument
    """
    # Test to see if the slave is setup for replication. If not, we are hosed
    conn = mysql_lib.connect_mysql(instance)
    try:
        mysql_lib.get_master_status(conn)
    except mysql_lib.ReplicationError:
        raise Exception('{instance} is not setup to write replication '
                        'logs!'.format(instance=instance))

    replication = mysql_lib.calc_slave_lag(instance)
    if replication['ss']['Slave_SQL_Running'] != 'Yes':
        raise Exception('SQL thread is not running on {instance}'
                        ''.format(instance=instance))

    if replication['ss']['Slave_IO_Running'] != 'Yes':
        raise Exception('IO thread is not running on {instance}'
                        ''.format(instance=instance))

    if replication['sbm'] > mysql_lib.MAX_HEARTBEAT_LAG:
        raise Exception('Heartbeat lag {sbm} > {max_lag} seconds'
                        ''.format(sbm=replication['sbm'],
                                  max_lag=mysql_lib.MAX_HEARTBEAT_LAG))

    if replication['io_bytes'] > mysql_lib.MAX_IO_LAG:
        raise Exception('IO lag {io_bytes} > {max_io} bytes'
                        ''.format(io_bytes=replication['io_bytes'],
                                  max_io=mysql_lib.MAX_IO_LAG))

    master = host_utils.HostAddr(':'.join(
        (replication['ss']['Master_Host'],
         str(replication['ss']['Master_Port']))))
    return master
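# A short usage sketch for sanity_check_replica, assuming the caller is
# about to act on a replica (e.g. promote it or take a backup from it).
# The hostname below is hypothetical.
replica = host_utils.HostAddr('replica1:3306')
master = sanity_check_replica(replica)
log.info('Replica {replica} is healthy and replicating from master '
         '{master}'.format(replica=replica, master=master))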
def confirm_max_replica_lag(replicas, max_lag, dead_master,
                            replicas_synced=False, timeout=0):
    """ Test replication lag

    Args:
    replicas - A set of hostaddr objects to be tested for replication lag
    max_lag - Max computed replication lag in seconds. If 0 is supplied,
              then exec position is compared from replica servers to the
              master rather than using computed seconds behind, as the
              heartbeat will be blocked by read_only.
    dead_master - Passed through to mysql_lib.calc_slave_lag
    replicas_synced - Replica servers must have executed to the same
                      position in the binary log.
    timeout - How long to wait for replication to be in the desired state
    """
    repl_checks = dict()
    start = time.time()
    while True:
        acceptable = True
        for replica in replicas:
            repl_check = mysql_lib.calc_slave_lag(replica,
                                                  dead_master=dead_master)
            repl_checks[replica.__str__()] = ':'.join(
                (repl_check['ss']['Relay_Master_Log_File'],
                 str(repl_check['ss']['Exec_Master_Log_Pos'])))

            # Basic sanity
            if repl_check['sbm'] is None:
                raise Exception('Computed replication lag is unavailable for '
                                '{replica}, perhaps restart pt-heartbeat '
                                'on the master?'.format(replica=replica))

            if repl_check['ss']['Slave_SQL_Running'] != 'Yes':
                raise Exception('SQL thread on replica {replica} is not '
                                'running. Perhaps run start '
                                'slave?'.format(replica=replica))

            if max_lag == 0:
                if repl_check['sql_bytes'] != 0:
                    acceptable = False
                    log.warn('Unprocessed log on {replica} is {sql_bytes} '
                             'bytes > 0'
                             ''.format(replica=replica,
                                       sql_bytes=repl_check['sql_bytes']))
                else:
                    log.info('{replica} is in sync with the '
                             'master'.format(replica=replica))
            else:
                if repl_check['sbm'] > max_lag:
                    acceptable = False
                    log.warn('Lag on {replica} is {lag} seconds, greater '
                             'than limit of '
                             '{limit}'.format(replica=replica,
                                              limit=max_lag,
                                              lag=repl_check['sbm']))
                else:
                    log.info('Lag on {replica} is {lag} seconds, <= limit of '
                             '{limit}'.format(replica=replica,
                                              limit=max_lag,
                                              lag=repl_check['sbm']))

        if replicas_synced and len(set(repl_checks.values())) != 1:
            acceptable = False
            raise Exception('Replica servers are not in sync and '
                            'replicas_synced is set. Replication status: '
                            '{repl_checks}'.format(repl_checks=repl_checks))

        if acceptable:
            return
        elif (time.time() - start) > timeout:
            raise Exception('Replication is not in an acceptable state')
        else:
            log.info('Sleeping for 5 seconds to allow replication to '
                     'catch up')
            time.sleep(5)
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Available keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'classic_security_group',
                'instance_type', and 'mysql_major_version'.
    reason - A description of why the host is being replaced. If the
             instance is still accessible and reason is not supplied, an
             exception will be thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run. No ifs, ands,
        # or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        log.info('A replacement has already been requested: '
                 '{re}'.format(re=existing_replacement))
        if replace_again:
            log.info('Argument replace_again is set, continuing on.')
        else:
            age_of_replacement = (datetime.datetime.now() -
                                  existing_replacement['created_at'])
            if age_of_replacement.days < SERVER_BUILD_TIMEOUT:
                raise Exception('Argument replace_again is not True but a '
                                'replacement already exists.')
            else:
                log.info("A replacement already exists, but was launched "
                         "{days} ago. The timeout for server builds is "
                         "{timeout} so we are automatically setting "
                         "replace_again.".format(days=age_of_replacement.days,
                                                 timeout=SERVER_BUILD_TIMEOUT))
                replace_again = True

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    if 'aws_status.codes' in cmdb_data:
        reasons.add(cmdb_data['aws_status.codes'])

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))

    replacement_config = {'availability_zone': cmdb_data['location'],
                          'hostname': find_unused_server_name(
                              original_server.get_standardized_replica_set(),
                              reporting_conn, dry_run),
                          'instance_type': cmdb_data['config.instance_type'],
                          'mysql_major_version':
                              get_master_mysql_major_version(original_server),
                          'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
                          'dry_run': dry_run,
                          'skip_name_check': True}

    if cmdb_data.pop('cloud.aws.vpc_id', None):
        # Existing server is in VPC
        replacement_config['classic_security_group'] = None
        replacement_config['vpc_security_group'] = cmdb_data['security_groups']
    else:
        # Existing server is in Classic
        replacement_config['classic_security_group'] = \
            cmdb_data['security_groups']
        replacement_config['vpc_security_group'] = None

    # At this point, all our defaults should be good to go
    config_overridden = False
    if (replacement_config['classic_security_group'] and
            overrides['vpc_security_group']):
        # a VPC migration
        vpc_migration(replacement_config, overrides)
        reasons.add('vpc migration')
        config_overridden = True

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')

    if not dead_server:
        slave_status = mysql_lib.calc_slave_lag(original_server)
        if slave_status['ss']['Slave_SQL_Running'] != 'Yes':
            reasons.add('sql replication thread broken')
        if slave_status['ss']['Slave_IO_Running'] != 'Yes':
            reasons.add('io replication thread broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception('MySQL appears to be up and no reason for '
                        'replacement is supplied. You can specify a reason '
                        'with the --reason argument')

    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)
def confirm_max_replica_lag(replicas, max_lag, dead_master,
                            replicas_synced=False, timeout=0):
    """ Test replication lag

    Args:
    replicas - A set of hostaddr objects to be tested for replication lag
    max_lag - Max computed replication lag in seconds. If 0 is supplied,
              then exec position is compared from replica servers to the
              master rather than using computed seconds behind, as the
              heartbeat will be blocked by read_only.
    dead_master - Passed through to mysql_lib.calc_slave_lag
    replicas_synced - Replica servers must have executed to the same
                      position in the binary log.
    timeout - How long to wait for replication to be in the desired state
    """
    repl_checks = dict()
    start = time.time()
    while True:
        acceptable = True
        for replica in replicas:
            repl_check = mysql_lib.calc_slave_lag(replica,
                                                  dead_master=dead_master)
            repl_checks[replica.__str__()] = ':'.join(
                (repl_check['ss']['Relay_Master_Log_File'],
                 str(repl_check['ss']['Exec_Master_Log_Pos'])))

            # Basic sanity
            if repl_check['sbm'] is None:
                raise Exception('Computed replication lag is unavailable for '
                                '{replica}, perhaps restart pt-heartbeat '
                                'on the master?'.format(replica=replica))

            if repl_check['ss']['Slave_SQL_Running'] != 'Yes':
                log.info('SQL thread is not running, trying to restart, '
                         'then sleep 20 seconds')
                conn = mysql_lib.connect_mysql(replica)
                mysql_lib.restart_replication(conn)
                time.sleep(20)
                repl_check = mysql_lib.calc_slave_lag(replica,
                                                      dead_master=dead_master)
                if repl_check['ss']['Slave_SQL_Running'] != 'Yes':
                    raise Exception('SQL thread on {replica} has serious '
                                    'problems'.format(replica=replica))

            if max_lag == 0:
                if repl_check['sql_bytes'] != 0:
                    acceptable = False
                    log.warn('Unprocessed log on {replica} is {sql_bytes} '
                             'bytes > 0'
                             ''.format(replica=replica,
                                       sql_bytes=repl_check['sql_bytes']))
                else:
                    log.info('{replica} is in sync with the '
                             'master'.format(replica=replica))
            else:
                if repl_check['sbm'] > max_lag:
                    acceptable = False
                    log.warn('Lag on {replica} is {lag} seconds, greater '
                             'than limit of '
                             '{limit}'.format(replica=replica,
                                              limit=max_lag,
                                              lag=repl_check['sbm']))
                else:
                    log.info('Lag on {replica} is {lag} seconds, <= limit of '
                             '{limit}'.format(replica=replica,
                                              limit=max_lag,
                                              lag=repl_check['sbm']))

        if replicas_synced and len(set(repl_checks.values())) != 1:
            acceptable = False
            raise Exception('Replica servers are not in sync and '
                            'replicas_synced is set. Replication status: '
                            '{repl_checks}'.format(repl_checks=repl_checks))

        if acceptable:
            return
        elif (time.time() - start) > timeout:
            raise Exception('Replication is not in an acceptable state')
        else:
            log.info('Sleeping for 5 seconds to allow replication to '
                     'catch up')
            time.sleep(5)
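# A usage sketch for confirm_max_replica_lag, assuming a failover-style
# caller. The replica hostnames, the 60-second threshold, and the
# 300-second timeout are hypothetical values; dead_master is treated as a
# flag passed through to mysql_lib.calc_slave_lag.
replicas = set([host_utils.HostAddr('replica1:3306'),
                host_utils.HostAddr('replica2:3306')])

# First, require every replica to be reasonably caught up while the master
# is still serving heartbeats.
confirm_max_replica_lag(replicas, 60, dead_master=False, timeout=300)

# After writes have stopped, require the replicas to be fully executed and
# at identical binlog positions before promoting one of them.
confirm_max_replica_lag(replicas, 0, dead_master=True,
                        replicas_synced=True, timeout=300)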
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Available keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'classic_security_group',
                'instance_type', and 'mysql_major_version'.
    reason - A description of why the host is being replaced. If the
             instance is still accessible and reason is not supplied, an
             exception will be thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run. No ifs, ands,
        # or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        if replace_again:
            log.info('A replacement has already been requested: '
                     '{new_host}'.format(new_host=existing_replacement))
        else:
            raise Exception('A replacement already exists, but '
                            'replace_again is not True')

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))

    replacement_config = {'availability_zone': cmdb_data['location'],
                          'hostname': find_unused_server_name(
                              original_server.get_standardized_replica_set(),
                              reporting_conn, dry_run),
                          'instance_type': cmdb_data['config.instance_type'],
                          'mysql_major_version':
                              get_master_mysql_major_version(original_server),
                          'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
                          'dry_run': dry_run,
                          'skip_name_check': True}

    if cmdb_data.pop('cloud.aws.vpc_id', None):
        # Existing server is in VPC
        replacement_config['classic_security_group'] = None
        replacement_config['vpc_security_group'] = cmdb_data['security_groups']
    else:
        # Existing server is in Classic
        replacement_config['classic_security_group'] = \
            cmdb_data['security_groups']
        replacement_config['vpc_security_group'] = None

    # At this point, all our defaults should be good to go
    config_overridden = False
    if (replacement_config['classic_security_group'] and
            overrides['vpc_security_group']):
        # a VPC migration
        vpc_migration(replacement_config, overrides)
        reasons.add('vpc migration')
        config_overridden = True

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')

    if not dead_server:
        slave_status = mysql_lib.calc_slave_lag(original_server)
        if slave_status['ss']['Slave_SQL_Running'] != 'Yes':
            reasons.add('sql replication thread broken')
        if slave_status['ss']['Slave_IO_Running'] != 'Yes':
            reasons.add('io replication thread broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception('MySQL appears to be up and no reason for '
                        'replacement is supplied')

    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)