def swap_slave_and_dr_slave(instance, dry_run):
    """ Swap a slave and a dr_slave in zk

    The slave entry held in the replica set's zk node and the dr_slave entry
    held in the DR zk node (environment_specific.DR_ZK) are exchanged. Only
    zk is modified by this function.

    Args:
    instance - An instance that is either a slave or dr_slave
    dry_run - If set, log the proposed configuration but do not modify zk

    Raises:
    Exception - if a zk connection can not be made, if the replica set has
                no entry in the DR node, or if the DR node write fails
                after the replica set node was already written
    """
    zk_local = host_utils.MysqlZookeeper()
    kazoo_client = environment_specific.get_kazoo_client()
    if not kazoo_client:
        raise Exception('Could not get a zk connection')

    log.info('Instance is {}'.format(instance))
    replica_set = zk_local.get_replica_set_from_instance(instance)
    log.info('Detected replica_set as {}'.format(replica_set))
    (zk_node,
     parsed_data,
     version) = get_zk_node_for_replica_set(kazoo_client, replica_set)
    log.info('Replica set {replica_set} is held in zk_node '
             '{zk_node}'.format(zk_node=zk_node,
                                replica_set=replica_set))

    log.info('Existing config:')
    log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
    new_data = copy.deepcopy(parsed_data)

    dr_znode_data, dr_meta = kazoo_client.get(environment_specific.DR_ZK)
    dr_parsed_data = simplejson.loads(dr_znode_data)
    new_dr_data = copy.deepcopy(dr_parsed_data)
    # The DR data is what must contain the replica set here; parsed_data was
    # already indexed by replica_set above, so testing it could never fail.
    if replica_set not in dr_parsed_data:
        raise Exception('Replica set {replica_set} is not present '
                        'in dr_node'.format(replica_set=replica_set))
    log.info('Existing dr config:')
    log.info(pprint.pformat(remove_auth(dr_parsed_data[replica_set])))

    # Each node receives the entry that currently lives in the other node.
    new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE] = \
        dr_parsed_data[replica_set][host_utils.REPLICA_ROLE_DR_SLAVE]
    new_dr_data[replica_set][host_utils.REPLICA_ROLE_DR_SLAVE] = \
        parsed_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]

    log.info('New config:')
    log.info(pprint.pformat(remove_auth(new_data[replica_set])))

    log.info('New dr config:')
    log.info(pprint.pformat(remove_auth(new_dr_data[replica_set])))

    if dry_run:
        log.info('dry_run is set, therefore not modifying zk')
    else:
        log.info('Pushing new configuration for '
                 '{replica_set}:'.format(replica_set=replica_set))
        # Both writes pass the version that was read, so a concurrent
        # modification of either node makes the corresponding set() fail.
        kazoo_client.set(zk_node, simplejson.dumps(new_data), version)
        try:
            kazoo_client.set(environment_specific.DR_ZK,
                             simplejson.dumps(new_dr_data),
                             dr_meta.version)
        except:
            # The replica set node has already been rewritten at this point,
            # so any failure here leaves the two nodes out of step.
            raise Exception('DR node is incorrect due to a different change '
                            'blocking this change. Manual intervention '
                            'is required.')
def swap_slave_and_dr_slave(instance, dry_run):
    """ Swap a slave and a dr_slave in zk

    The slave entry held in the replica set's zk node and the dr_slave entry
    held in the DR zk node (environment_specific.DR_ZK) are exchanged. Only
    zk is modified by this function.

    Args:
    instance - An instance that is either a slave or dr_slave
    dry_run - If set, log the proposed configuration but do not modify zk

    Raises:
    Exception - if a zk connection can not be made, if the replica set has
                no entry in the DR node, or if the DR node write fails
                after the replica set node was already written
    """
    zk_local = host_utils.MysqlZookeeper()
    kazoo_client = environment_specific.get_kazoo_client()
    if not kazoo_client:
        raise Exception('Could not get a zk connection')

    log.info('Instance is {inst}'.format(inst=instance))
    (replica_set, _) = zk_local.get_replica_set_from_instance(instance)
    log.info('Detected replica_set as '
             '{replica_set}'.format(replica_set=replica_set))
    (zk_node,
     parsed_data,
     version) = get_zk_node_for_replica_set(kazoo_client, replica_set)
    log.info('Replica set {replica_set} is held in zk_node '
             '{zk_node}'.format(zk_node=zk_node,
                                replica_set=replica_set))

    log.info('Existing config:')
    log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
    new_data = copy.deepcopy(parsed_data)

    dr_znode_data, dr_meta = kazoo_client.get(environment_specific.DR_ZK)
    dr_parsed_data = simplejson.loads(dr_znode_data)
    new_dr_data = copy.deepcopy(dr_parsed_data)
    # The DR data is what must contain the replica set here; parsed_data was
    # already indexed by replica_set above, so testing it could never fail.
    if replica_set not in dr_parsed_data:
        raise Exception('Replica set {replica_set} is not present '
                        'in dr_node'.format(replica_set=replica_set))
    log.info('Existing dr config:')
    log.info(pprint.pformat(remove_auth(dr_parsed_data[replica_set])))

    # Each node receives the entry that currently lives in the other node.
    new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE] = \
        dr_parsed_data[replica_set][host_utils.REPLICA_ROLE_DR_SLAVE]
    new_dr_data[replica_set][host_utils.REPLICA_ROLE_DR_SLAVE] = \
        parsed_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]

    log.info('New config:')
    log.info(pprint.pformat(remove_auth(new_data[replica_set])))

    log.info('New dr config:')
    log.info(pprint.pformat(remove_auth(new_dr_data[replica_set])))

    if dry_run:
        log.info('dry_run is set, therefore not modifying zk')
    else:
        log.info('Pushing new configuration for '
                 '{replica_set}:'.format(replica_set=replica_set))
        # Both writes pass the version that was read, so a concurrent
        # modification of either node makes the corresponding set() fail.
        kazoo_client.set(zk_node, simplejson.dumps(new_data), version)
        try:
            kazoo_client.set(environment_specific.DR_ZK,
                             simplejson.dumps(new_dr_data),
                             dr_meta.version)
        except:
            # The replica set node has already been rewritten at this point,
            # so any failure here leaves the two nodes out of step.
            raise Exception('DR node is incorrect due to a different change '
                            'blocking this change. You need to fix it yourself')
def swap_master_and_slave(instance, dry_run):
    """ Exchange the master and slave entries of a replica set in zk.

    Warning: this performs no sanity checks and does nothing more than
    update zk. YOU HAVE BEEN WARNED!

    Args:
    instance - An instance in the replica set. This function will figure
               everything else out.
    dry_run - If set, do not modify configuration.

    Raises:
    Exception - if a zk connection can not be made or if the swap would
                leave the configuration unchanged
    """
    zk = host_utils.MysqlZookeeper()
    kazoo = environment_specific.get_kazoo_client()
    if not kazoo:
        raise Exception('Could not get a zk connection')

    log.info('Instance is {}'.format(instance))
    replica_set = zk.get_replica_set_from_instance(instance)
    log.info('Detected replica_set as {}'.format(replica_set))

    zk_node, current, version = get_zk_node_for_replica_set(kazoo,
                                                            replica_set)
    log.info('Replica set {replica_set} is held in zk_node '
             '{zk_node}'.format(zk_node=zk_node,
                                replica_set=replica_set))

    log.info('Existing config:')
    log.info(pprint.pformat(remove_auth(current[replica_set])))

    # Build the proposed config on a copy so it can be compared with, and
    # written in place of, the config that was read.
    proposed = copy.deepcopy(current)
    old_roles = current[replica_set]
    new_roles = proposed[replica_set]
    new_roles[host_utils.REPLICA_ROLE_MASTER] = \
        old_roles[host_utils.REPLICA_ROLE_SLAVE]
    new_roles[host_utils.REPLICA_ROLE_SLAVE] = \
        old_roles[host_utils.REPLICA_ROLE_MASTER]

    log.info('New config:')
    log.info(pprint.pformat(remove_auth(proposed[replica_set])))

    if proposed == current:
        raise Exception('No change would be made to zk, '
                        'will not write new config')

    if dry_run:
        log.info('dry_run is set, therefore not modifying zk')
        return

    log.info('Pushing new configuration for '
             '{replica_set}:'.format(replica_set=replica_set))
    # The version read with the node is passed back to set().
    kazoo.set(zk_node, simplejson.dumps(proposed), version)
def swap_master_and_slave(instance, dry_run):
    """ Swap a master and slave in zk.

    Warning: this performs no sanity checks and does nothing more than
    update zk. YOU HAVE BEEN WARNED!

    Args:
    instance - An instance in the replica set. This function will figure
               everything else out.
    dry_run - If set, do not modify configuration.

    Raises:
    Exception - if a zk connection can not be made or if the swap would
                leave the configuration unchanged
    """
    zk_local = host_utils.MysqlZookeeper()
    kazoo_client = environment_specific.get_kazoo_client()
    if not kazoo_client:
        raise Exception('Could not get a zk connection')

    log.info('Instance is {inst}'.format(inst=instance))
    # Only the replica set name is needed from this lookup. The second tuple
    # element used to be bound to `version`, which was then immediately
    # overwritten by the znode version below.
    (replica_set, _) = zk_local.get_replica_set_from_instance(instance)
    log.info('Detected replica_set as '
             '{replica_set}'.format(replica_set=replica_set))
    (zk_node,
     parsed_data,
     version) = get_zk_node_for_replica_set(kazoo_client, replica_set)
    log.info('Replica set {replica_set} is held in zk_node '
             '{zk_node}'.format(zk_node=zk_node,
                                replica_set=replica_set))

    log.info('Existing config:')
    log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))

    # Work on a copy so the result can be compared against what was read.
    new_data = copy.deepcopy(parsed_data)
    new_data[replica_set][host_utils.REPLICA_ROLE_MASTER] = \
        parsed_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]
    new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE] = \
        parsed_data[replica_set][host_utils.REPLICA_ROLE_MASTER]

    log.info('New config:')
    log.info(pprint.pformat(remove_auth(new_data[replica_set])))

    if new_data == parsed_data:
        raise Exception('No change would be made to zk, '
                        'will not write new config')
    elif dry_run:
        log.info('dry_run is set, therefore not modifying zk')
    else:
        log.info('Pushing new configuration for '
                 '{replica_set}:'.format(replica_set=replica_set))
        # The version read with the node is passed back to set().
        kazoo_client.set(zk_node, simplejson.dumps(new_data), version)
def mysql_failover(master, dry_run, skip_lock,
                   ignore_dr_slave, trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existence of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Shut down MySQL on the old master before promoting.
                      Ignored when dry_run is set.

    Returns:
    None. In dry_run mode the process exits through os._exit(0) once the
    preliminary sanity checks have passed.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master,
                                                        rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there are any problems we roll back from the except
    try:
        # Initialised first so the rollback handler can always test it.
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set,
            repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info('DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # let's make sure that what we think is the master, actually is
        confirm_replica_topology(master, replicas)

        # We use master_conn as a mysql connection to the master server, if
        # it is falsy, the master is treated as dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)
        slave_conn = mysql_lib.connect_mysql(slave)

        # Test to see if the slave is setup for replication. If not, we are
        # hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        try:
            mysql_lib.get_master_status(slave_conn)
        except mysql_lib.ReplicationError:
            log.error('New master {slave} is not setup to write replicaiton '
                      'logs!'.format(slave=slave))
            raise
        log.info('Slave/new master is setup to write replication logs')

        # Shutting down the master changes state, so it must not happen
        # during a dry run.
        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # os._exit is used because sys.exit raises SystemExit, which the
            # bare except of the giant try would catch and treat as a failure
            os._exit(0)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master_conn, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master_conn)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas, 0,
                                    timeout=MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave_conn)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming relpica servers in sync')
                confirm_max_replica_lag(replicas,
                                        MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                        replicas_synced=True,
                                        dead_master=dead_master)
    except:
        # Catches everything (including KeyboardInterrupt) so the rollback
        # runs; the original exception is always re-raised afterwards.
        log.info('Starting rollback')
        if master_conn:
            log.info('Releasing read_only on old master')
            mysql_lib.set_global_variable(master_conn, 'read_only', False)
            log.info('Clearing replication settings on old master')
            mysql_lib.reset_slave(master_conn)
        if lock_identifier:
            log.info('Releasing promotion lock')
            release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # From here on there is no rollback; failures are logged or propagate.
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    # Keep retrying the zk swap until the attempt counter exceeds
    # MAX_ZK_WRITE_ATTEMPTS, then let the last error propagate.
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt + 1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave_conn, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave_conn)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)
def mysql_failover(master, dry_run, skip_lock,
                   ignore_dr_slave, trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existence of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Shut down MySQL on the old master before promoting.
                      Has no effect when dry_run is set.

    Returns:
    None. In dry_run mode the process exits through os._exit with
    environment_specific.DRY_RUN_EXIT_CODE once the preliminary sanity
    checks have passed.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master,
                                                        rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there are any problems we roll back from the except
    try:
        # Initialised first so the rollback handler can always test it.
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set,
            repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info(
            'DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # master_conn holds the result of is_master_alive. Within this
        # function it is only ever tested for truthiness: a falsy value
        # means the master is treated as dead.
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)

        # Test to see if the slave is setup for replication. If not, we are
        # hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        mysql_lib.get_master_status(slave)

        # Shutting down the master changes state, so it is skipped in a
        # dry run.
        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        # The lag tolerance passed to the check depends on whether the
        # master is considered alive.
        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NORMAL,
                                    dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                    dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # os._exit is used because sys.exit raises SystemExit, which the
            # bare except of the giant try would catch and treat as a failure
            os._exit(environment_specific.DRY_RUN_EXIT_CODE)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NONE,
                                    dead_master,
                                    True,
                                    mysql_lib.NORMAL_HEARTBEAT_LAG)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            # Dead-master path: check zk is reachable now, before the zk
            # update that happens after this try block.
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming replica servers are synced')
                confirm_max_replica_lag(replicas,
                                        mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                        dead_master,
                                        True)
    except:
        # Catches everything (including KeyboardInterrupt) so the rollback
        # runs; the original exception is always re-raised afterwards.
        log.info('Starting rollback')
        if master_conn:
            log.info('Releasing read_only on old master')
            mysql_lib.set_global_variable(master, 'read_only', False)
            log.info('Clearing replication settings on old master')
            mysql_lib.reset_slave(master)
        if lock_identifier:
            log.info('Releasing promotion lock')
            release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # From here on there is no rollback; failures are logged or propagate.
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    # Keep retrying the zk swap until the attempt counter exceeds
    # MAX_ZK_WRITE_ATTEMPTS, then let the last error propagate.
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt + 1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    # Posting to the change feed is best-effort: a failure is only logged as
    # a warning and does not affect the completed failover.
    try:
        environment_specific.generic_json_post(
            environment_specific.CHANGE_FEED_URL,
            {
                'type': 'MySQL Failover',
                'environment': replica_set,
                'description': "Failover from {m} to {s}".format(m=master,
                                                                 s=slave),
                'author': host_utils.get_user(),
                'automation': False,
                'source': "mysql_failover.py on {}".format(host_utils.HOSTNAME)
            })
    except Exception as e:
        log.warning("Failover completed, but change feed "
                    "not updated: {}".format(e))

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)
def mysql_failover(master, dry_run, skip_lock,
                   ignore_dr_slave, trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existence of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Shut down MySQL on the old master before promoting.
                      Ignored when dry_run is set.

    Returns:
    None. In dry_run mode the process exits through os._exit(0) once the
    preliminary sanity checks have passed.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master,
                                                        rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there are any problems we roll back from the except
    try:
        # Initialised first so the rollback handler can always test it.
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set,
            repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info(
            'DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # let's make sure that what we think is the master, actually is
        confirm_replica_topology(master, replicas)

        # We use master_conn as a mysql connection to the master server, if
        # it is falsy, the master is treated as dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)
        slave_conn = mysql_lib.connect_mysql(slave)

        # Test to see if the slave is setup for replication. If not, we are
        # hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        try:
            mysql_lib.get_master_status(slave_conn)
        except mysql_lib.ReplicationError:
            log.error('New master {slave} is not setup to write replicaiton '
                      'logs!'.format(slave=slave))
            raise
        log.info('Slave/new master is setup to write replication logs')

        # Shutting down the master changes state, so it must not happen
        # during a dry run.
        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # os._exit is used because sys.exit raises SystemExit, which the
            # bare except of the giant try would catch and treat as a failure
            os._exit(0)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master_conn, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master_conn)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas, 0,
                                    timeout=MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave_conn)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming relpica servers in sync')
                confirm_max_replica_lag(replicas,
                                        MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                        replicas_synced=True,
                                        dead_master=dead_master)
    except:
        # Catches everything (including KeyboardInterrupt) so the rollback
        # runs; the original exception is always re-raised afterwards.
        log.info('Starting rollback')
        if master_conn:
            log.info('Releasing read_only on old master')
            mysql_lib.set_global_variable(master_conn, 'read_only', False)
            log.info('Clearing replication settings on old master')
            mysql_lib.reset_slave(master_conn)
        if lock_identifier:
            log.info('Releasing promotion lock')
            release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # From here on there is no rollback; failures are logged or propagate.
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    # Keep retrying the zk swap until the attempt counter exceeds
    # MAX_ZK_WRITE_ATTEMPTS, then let the last error propagate.
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt + 1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave_conn, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave_conn)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)
def add_replica_to_zk(instance, replica_type, dry_run):
    """ Add a replica to zk

    For a slave, the host and port of the slave entry in the replica set's
    zk node are overwritten. For a dr_slave, the replica set's entry in the
    DR zk node (environment_specific.DR_ZK) is replaced.

    Args:
    instance - A hostaddr object of the replica to add to zk
    replica_type - Either 'slave' or 'dr_slave'.
    dry_run - If set, do not modify zk

    Raises:
    Exception - if replica_type is invalid, a zk connection can not be
                made, the instance's master is not a master in zk, or the
                change would leave zk unchanged. Errors from the
                replication checks also propagate. Every exception is
                logged before being re-raised.
    """
    try:
        if replica_type not in [host_utils.REPLICA_ROLE_DR_SLAVE,
                                host_utils.REPLICA_ROLE_SLAVE]:
            # .format() must be applied to the message string; calling it on
            # the Exception object raised AttributeError instead.
            raise Exception('Invalid value "{replica_type}" for argument '
                            "replica_type".format(replica_type=replica_type))

        zk_local = host_utils.MysqlZookeeper()
        kazoo_client = environment_specific.get_kazoo_client()
        if not kazoo_client:
            raise Exception('Could not get a zk connection')

        log.info('Instance is {inst}'.format(inst=instance))
        mysql_lib.assert_replication_sanity(instance)
        mysql_lib.assert_replication_unlagged(
            instance, mysql_lib.REPLICATION_TOLERANCE_NORMAL)
        master = mysql_lib.get_master_from_instance(instance)
        if master not in zk_local.get_all_mysql_instances_by_type(
                host_utils.REPLICA_ROLE_MASTER):
            raise Exception('Instance {master} is not a master in zk'
                            ''.format(master=master))

        log.info('Detected master of {instance} '
                 'as {master}'.format(instance=instance,
                                      master=master))

        (replica_set, _) = zk_local.get_replica_set_from_instance(master)
        log.info('Detected replica_set as '
                 '{replica_set}'.format(replica_set=replica_set))

        if replica_type == host_utils.REPLICA_ROLE_SLAVE:
            (zk_node,
             parsed_data,
             version) = get_zk_node_for_replica_set(kazoo_client, replica_set)
            log.info('Replica set {replica_set} is held in zk_node '
                     '{zk_node}'.format(zk_node=zk_node,
                                        replica_set=replica_set))
            log.info('Existing config:')
            log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
            new_data = copy.deepcopy(parsed_data)
            new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]['host'] = \
                instance.hostname
            new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]['port'] = \
                instance.port
            log.info('New config:')
            log.info(pprint.pformat(remove_auth(new_data[replica_set])))

            if new_data == parsed_data:
                raise Exception('No change would be made to zk, '
                                'will not write new config')
            elif dry_run:
                log.info('dry_run is set, therefore not modifying zk')
            else:
                log.info('Pushing new configuration for '
                         '{replica_set}:'.format(replica_set=replica_set))
                kazoo_client.set(zk_node, simplejson.dumps(new_data), version)
        elif replica_type == host_utils.REPLICA_ROLE_DR_SLAVE:
            znode_data, dr_meta = kazoo_client.get(environment_specific.DR_ZK)
            parsed_data = simplejson.loads(znode_data)
            new_data = copy.deepcopy(parsed_data)
            if replica_set in parsed_data:
                log.info('Existing dr config:')
                log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
            else:
                log.info('Replica set did not previously have a dr slave')

            # The whole DR entry for the replica set is replaced, whether or
            # not one existed before.
            new_data[replica_set] = \
                {host_utils.REPLICA_ROLE_DR_SLAVE: {'host': instance.hostname,
                                                    'port': instance.port}}
            log.info('New dr config:')
            log.info(pprint.pformat(remove_auth(new_data[replica_set])))

            if new_data == parsed_data:
                raise Exception('No change would be made to zk, '
                                'will not write new config')
            elif dry_run:
                log.info('dry_run is set, therefore not modifying zk')
            else:
                log.info('Pushing new dr configuration for '
                         '{replica_set}:'.format(replica_set=replica_set))
                kazoo_client.set(environment_specific.DR_ZK,
                                 simplejson.dumps(new_data),
                                 dr_meta.version)
        else:
            # we should raise an exception above rather than getting to here
            pass
    except Exception as e:
        log.exception(e)
        raise
def add_replica_to_zk(instance, replica_type, dry_run):
    """ Add a replica to zk

    For a slave, the host and port of the slave entry in the replica set's
    zk node are overwritten. For a dr_slave, the replica set's entry in the
    DR zk node (environment_specific.DR_ZK) is replaced.

    Args:
    instance - A hostaddr object of the replica to add to zk
    replica_type - Either 'slave' or 'dr_slave'.
    dry_run - If set, do not modify zk

    Raises:
    Exception - if replica_type is invalid, a zk connection can not be
                made, the instance's master is not a master in zk, or the
                change would leave zk unchanged. Errors from the
                replication checks also propagate. Every exception is
                logged before being re-raised.
    """
    try:
        valid_types = [host_utils.REPLICA_ROLE_DR_SLAVE,
                       host_utils.REPLICA_ROLE_SLAVE]
        if replica_type not in valid_types:
            # .format() must be applied to the message string; calling it on
            # the Exception object raised AttributeError instead.
            raise Exception('Invalid value "{replica_type}" for argument '
                            "replica_type".format(replica_type=replica_type))

        zk_local = host_utils.MysqlZookeeper()
        kazoo_client = environment_specific.get_kazoo_client()
        if not kazoo_client:
            raise Exception('Could not get a zk connection')

        log.info('Instance is {inst}'.format(inst=instance))
        mysql_lib.assert_replication_sanity(instance)
        mysql_lib.assert_replication_unlagged(
            instance, mysql_lib.REPLICATION_TOLERANCE_NORMAL)
        master = mysql_lib.get_master_from_instance(instance)
        zk_masters = zk_local.get_all_mysql_instances_by_type(
            host_utils.REPLICA_ROLE_MASTER)
        if master not in zk_masters:
            raise Exception('Instance {master} is not a master in zk'
                            ''.format(master=master))

        log.info('Detected master of {instance} '
                 'as {master}'.format(instance=instance,
                                      master=master))

        (replica_set, _) = zk_local.get_replica_set_from_instance(master)
        log.info('Detected replica_set as '
                 '{replica_set}'.format(replica_set=replica_set))

        if replica_type == host_utils.REPLICA_ROLE_SLAVE:
            (zk_node,
             parsed_data,
             version) = get_zk_node_for_replica_set(kazoo_client, replica_set)
            log.info('Replica set {replica_set} is held in zk_node '
                     '{zk_node}'.format(zk_node=zk_node,
                                        replica_set=replica_set))
            log.info('Existing config:')
            log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
            new_data = copy.deepcopy(parsed_data)
            new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]['host'] = \
                instance.hostname
            new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]['port'] = \
                instance.port
            log.info('New config:')
            log.info(pprint.pformat(remove_auth(new_data[replica_set])))

            if new_data == parsed_data:
                raise Exception('No change would be made to zk, '
                                'will not write new config')
            elif dry_run:
                log.info('dry_run is set, therefore not modifying zk')
            else:
                log.info('Pushing new configuration for '
                         '{replica_set}:'.format(replica_set=replica_set))
                kazoo_client.set(zk_node, simplejson.dumps(new_data), version)
        elif replica_type == host_utils.REPLICA_ROLE_DR_SLAVE:
            znode_data, dr_meta = kazoo_client.get(environment_specific.DR_ZK)
            parsed_data = simplejson.loads(znode_data)
            new_data = copy.deepcopy(parsed_data)
            if replica_set in parsed_data:
                log.info('Existing dr config:')
                log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
            else:
                log.info('Replica set did not previously have a dr slave')

            # The whole DR entry for the replica set is replaced, whether or
            # not one existed before.
            new_data[replica_set] = \
                {host_utils.REPLICA_ROLE_DR_SLAVE: {'host': instance.hostname,
                                                    'port': instance.port}}
            log.info('New dr config:')
            log.info(pprint.pformat(remove_auth(new_data[replica_set])))

            if new_data == parsed_data:
                raise Exception('No change would be made to zk, '
                                'will not write new config')
            elif dry_run:
                log.info('dry_run is set, therefore not modifying zk')
            else:
                log.info('Pushing new dr configuration for '
                         '{replica_set}:'.format(replica_set=replica_set))
                kazoo_client.set(environment_specific.DR_ZK,
                                 simplejson.dumps(new_data),
                                 dr_meta.version)
        else:
            # we should raise an exception above rather than getting to here
            pass
    except Exception as e:
        log.exception(e)
        raise
def mysql_failover(master, dry_run, skip_lock,
                   ignore_dr_slave, trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit.
              Note that in this mode the process exits via os._exit and
              this function never returns.
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existence of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Send a mysqladmin kill command to the old master

    Returns:
    new_master - The new master server
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master,
                                                        rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there any problems we roll back from the except
    try:
        # Defined first so the rollback handler can always reference it.
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set,
            repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
            log.info('DR slave is detected as '
                     '{dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # We use master_conn as a mysql connection to the master server, if
        # it is False, the master is dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)

        # Test to see if the slave is setup for replication. If not, we are
        # hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        mysql_lib.get_master_status(slave)

        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NORMAL,
                                    dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                    dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get catch in the giant try
            os._exit(environment_specific.DRY_RUN_EXIT_CODE)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NONE,
                                    dead_master,
                                    True,
                                    mysql_lib.NORMAL_HEARTBEAT_LAG)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming replica servers are synced')
                confirm_max_replica_lag(replicas,
                                        mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                        dead_master,
                                        True)
    except:
        # Deliberately a bare except: the rollback must run for any kind of
        # exit from the block above, and the exception is always re-raised.
        log.info('Starting rollback')
        if master_conn:
            log.info('Releasing read_only on old master')
            mysql_lib.set_global_variable(master, 'read_only', False)

            log.info('Clearing replication settings on old master')
            mysql_lib.reset_slave(master)
        if lock_identifier:
            log.info('Releasing promotion lock')
            release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # From here on there is no rollback; failures either fail forward or
    # propagate to the caller.
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt and
            # SystemExit propagate immediately instead of being swallowed
            # by the retry loop.
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt+1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    # we don't really care if this fails, but we'll print a message anyway.
    try:
        environment_specific.generic_json_post(
            environment_specific.CHANGE_FEED_URL,
            {'type': 'MySQL Failover',
             'environment': replica_set,
             'description': "Failover from {m} to {s}".format(m=master,
                                                              s=slave),
             'author': host_utils.get_user(),
             'automation': False,
             'source': "mysql_failover.py on {}".format(host_utils.HOSTNAME)})
    except Exception as e:
        log.warning("Failover completed, but change feed "
                    "not updated: {}".format(e))

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)

    # The docstring has always promised the new master as the return value.
    return slave