def add_fence_to_host(hostname, dry_run, force=False):
    """ Add a host to the fence security group

    Args:
    hostname: A hostaddr object for the instance
    dry_run: If set, log what would happen but do not actually fence the host
    force: Fence the host even if it is a master in ZK
    """
    zk = host_utils.MysqlZookeeper()
    try:
        replica_type = zk.get_replica_type_from_instance(hostname)
    except:
        log.info("{} is not in zk".format(hostname))
        replica_type = None

    # We generally don't allow fencing a master, but there could be
    # cases where a failover has occurred and ZK is having issues,
    # so we do permit forcing it.
    if replica_type == host_utils.REPLICA_ROLE_MASTER and not force:
        raise Exception('Can not fence an instance which is a master in zk')

    conn = boto.ec2.connect_to_region(environment_specific.EC2_REGION)
    instance_id = environment_specific.get_server_metadata(
        hostname.hostname)['id']
    log.info("{hostname} with instance id {id} will be fenced".format(
        hostname=hostname, id=instance_id))

    if dry_run:
        log.info("In dry_run mode, not fencing; exiting now")
        os._exit(environment_specific.DRY_RUN_EXIT_CODE)

    conn.modify_instance_attribute(instance_id, 'groupSet', [SG_DB_FENCE_ID])
    log.info("Done.")
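
# --- Usage sketch (not part of the original module) ---
# A minimal example of a dry-run fence, assuming host_utils.HostAddr can be
# constructed from a plain hostname string; the hostname below is made up.
def _example_fence_dry_run():
    """ Hypothetical helper showing a dry-run call to add_fence_to_host. """
    target = host_utils.HostAddr('exampledb-001')  # hypothetical host
    # With dry_run=True the function logs the EC2 instance id and exits
    # before modify_instance_attribute is ever called.
    add_fence_to_host(target, dry_run=True)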
def auto_add_instance_to_zk(instance, dry_run):
    """ Try to do the right thing in adding a server to zk

    Args:
    instance - The replacement instance
    dry_run - If set, do not modify zk
    """
    try:
        conn = mysql_lib.get_mysqlops_connections()
        log.info('Determining replacement for '
                 '{hostname}'.format(hostname=instance.hostname))
        server_metadata = environment_specific.get_server_metadata(
            instance.hostname)
        if not server_metadata:
            raise Exception('CMDB lacks knowledge of replacement host')
        instance_id = server_metadata['id']
        role = determine_replacement_role(conn, instance_id)
        log.info('Adding server as role: {role}'.format(role=role))
    except Exception as e:
        log.exception(e)
        raise
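
# --- Usage sketch (not part of the original module) ---
# A minimal example of registering a replacement host, assuming the host is
# already present in the CMDB; the hostname below is made up.
def _example_auto_add_to_zk():
    """ Hypothetical helper showing a dry-run call to auto_add_instance_to_zk. """
    replacement = host_utils.HostAddr('exampledb-002')  # hypothetical host
    # Per the docstring, dry_run=True determines and logs the role without
    # modifying zk.
    auto_add_instance_to_zk(replacement, dry_run=True)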
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Available keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'instance_type', and
                'mysql_major_version'.
    reason - A description of why the host is being replaced. If the
             instance is still accessible and no reason is supplied, an
             exception will be thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run.
        # No ifs, ands, or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        log.info('A replacement has already been requested: '
                 '{re}'.format(re=existing_replacement))
        if replace_again:
            log.info('Argument replace_again is set, continuing on.')
        else:
            age_of_replacement = (datetime.datetime.now() -
                                  existing_replacement['created_at'])
            if age_of_replacement.days < SERVER_BUILD_TIMEOUT:
                raise Exception('Argument replace_again is not True but a '
                                'replacement already exists.')
            else:
                log.info('A replacement already exists, but was launched '
                         '{days} days ago. The timeout for server builds is '
                         '{timeout} days so we are automatically setting '
                         'replace_again.'.format(days=age_of_replacement.days,
                                                 timeout=SERVER_BUILD_TIMEOUT))
                replace_again = True

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
        version_server = original_server
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')
        version_server = zk.get_mysql_instance_from_replica_set(
            original_server.get_zk_replica_set()[0],
            repl_type=host_utils.REPLICA_ROLE_MASTER)

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about the server to be '
                        'replaced in the cmdb')

    if 'aws_status.codes' in cmdb_data:
        reasons.add(cmdb_data['aws_status.codes'])

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))

    replacement_config = {
        'availability_zone': cmdb_data['location'],
        'vpc_security_group': cmdb_data['security_groups'],
        'hostname': find_unused_server_name(
            original_server.get_standardized_replica_set(),
            reporting_conn, dry_run),
        'instance_type': cmdb_data['config.instance_type'],
        'mysql_major_version': mysql_lib.get_global_variables(
            version_server)['version'][0:3],
        'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
        'dry_run': dry_run,
        'skip_name_check': True
    }

    # At this point, all our defaults should be good to go
    config_overridden = False

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    if not dead_server:
        try:
            mysql_lib.assert_replication_sanity(original_server)
        except Exception as e:
            log.info('Replication problem: {e}'.format(e=e))
            reasons.add('replication broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception('MySQL appears to be up and no reason for '
                        'replacement is supplied. You can specify a reason '
                        'with the --reason argument')

    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)
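
# --- Usage sketch (not part of the original module) ---
# A minimal example of requesting a dry-run replacement while overriding the
# instance type, based on the documented signature above; the hostname and
# instance type below are made-up values.
def _example_replace_db_host():
    """ Hypothetical helper showing a dry-run call to launch_replacement_db_host. """
    broken_slave = host_utils.HostAddr('exampledb-003')  # hypothetical host
    # dry_run=True builds and logs the replacement config without launching
    # anything or recording a replacement row.
    launch_replacement_db_host(broken_slave,
                               dry_run=True,
                               overrides={'instance_type': 'i3.2xlarge'},
                               reason='upgrading instance type')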
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Available keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'classic_security_group',
                'instance_type', and 'mysql_major_version'.
    reason - A description of why the host is being replaced. If the
             instance is still accessible and no reason is supplied, an
             exception will be thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run.
        # No ifs, ands, or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        log.info('A replacement has already been requested: '
                 '{re}'.format(re=existing_replacement))
        if replace_again:
            log.info('Argument replace_again is set, continuing on.')
        else:
            age_of_replacement = (datetime.datetime.now() -
                                  existing_replacement['created_at'])
            if age_of_replacement.days < SERVER_BUILD_TIMEOUT:
                raise Exception('Argument replace_again is not True but a '
                                'replacement already exists.')
            else:
                log.info('A replacement already exists, but was launched '
                         '{days} days ago. The timeout for server builds is '
                         '{timeout} days so we are automatically setting '
                         'replace_again.'.format(days=age_of_replacement.days,
                                                 timeout=SERVER_BUILD_TIMEOUT))
                replace_again = True

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about the server to be '
                        'replaced in the cmdb')

    if 'aws_status.codes' in cmdb_data:
        reasons.add(cmdb_data['aws_status.codes'])

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))

    replacement_config = {
        'availability_zone': cmdb_data['location'],
        'hostname': find_unused_server_name(
            original_server.get_standardized_replica_set(),
            reporting_conn, dry_run),
        'instance_type': cmdb_data['config.instance_type'],
        'mysql_major_version': get_master_mysql_major_version(original_server),
        'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
        'dry_run': dry_run,
        'skip_name_check': True
    }

    if cmdb_data.pop('cloud.aws.vpc_id', None):
        # Existing server is in VPC
        replacement_config['classic_security_group'] = None
        replacement_config['vpc_security_group'] = cmdb_data['security_groups']
    else:
        # Existing server is in Classic
        replacement_config['classic_security_group'] = cmdb_data['security_groups']
        replacement_config['vpc_security_group'] = None

    # At this point, all our defaults should be good to go
    config_overridden = False

    # Use .get() so a missing override key does not raise a KeyError
    if (replacement_config['classic_security_group'] and
            overrides.get('vpc_security_group')):
        # a VPC migration
        vpc_migration(replacement_config, overrides)
        reasons.add('vpc migration')
        config_overridden = True

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')

    if not dead_server:
        slave_status = mysql_lib.calc_slave_lag(original_server)

        if slave_status['ss']['Slave_SQL_Running'] != 'Yes':
            reasons.add('sql replication thread broken')

        if slave_status['ss']['Slave_IO_Running'] != 'Yes':
            reasons.add('io replication thread broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception('MySQL appears to be up and no reason for '
                        'replacement is supplied. You can specify a reason '
                        'with the --reason argument')

    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Available keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'classic_security_group',
                'instance_type', and 'mysql_major_version'.
    reason - A description of why the host is being replaced. If the
             instance is still accessible and no reason is supplied, an
             exception will be thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run.
        # No ifs, ands, or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        if replace_again:
            log.info('A replacement has already been requested: '
                     '{new_host}'.format(new_host=existing_replacement))
        else:
            raise Exception('A replacement already exists, but '
                            'replace_again is not True')

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about the server to be '
                        'replaced in the cmdb')

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))

    replacement_config = {
        'availability_zone': cmdb_data['location'],
        'hostname': find_unused_server_name(
            original_server.get_standardized_replica_set(),
            reporting_conn, dry_run),
        'instance_type': cmdb_data['config.instance_type'],
        'mysql_major_version': get_master_mysql_major_version(original_server),
        'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
        'dry_run': dry_run,
        'skip_name_check': True
    }

    if cmdb_data.pop('cloud.aws.vpc_id', None):
        # Existing server is in VPC
        replacement_config['classic_security_group'] = None
        replacement_config['vpc_security_group'] = cmdb_data['security_groups']
    else:
        # Existing server is in Classic
        replacement_config['classic_security_group'] = cmdb_data['security_groups']
        replacement_config['vpc_security_group'] = None

    # At this point, all our defaults should be good to go
    config_overridden = False

    # Use .get() so a missing override key does not raise a KeyError
    if (replacement_config['classic_security_group'] and
            overrides.get('vpc_security_group')):
        # a VPC migration
        vpc_migration(replacement_config, overrides)
        reasons.add('vpc migration')
        config_overridden = True

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')

    if not dead_server:
        slave_status = mysql_lib.calc_slave_lag(original_server)

        if slave_status['ss']['Slave_SQL_Running'] != 'Yes':
            reasons.add('sql replication thread broken')

        if slave_status['ss']['Slave_IO_Running'] != 'Yes':
            reasons.add('io replication thread broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception('MySQL appears to be up and no reason for '
                        'replacement is supplied')

    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)