Example #1
0
def add_fence_to_host(hostname, dry_run, force=False):
    """ Add a host to fence SG group

        Args:
          hostname: A hostaddr object for the instance
          dry_run: Really do it or not?
          force: Force it, even if a master in ZK
    """
    zk = host_utils.MysqlZookeeper()
    try:
        replica_type = zk.get_replica_type_from_instance(hostname)
    except:
        log.info("{} is not in zk ".format(hostname))
        replica_type = None

    # We generally don't allow fencing a master, but there could be
    # cases where a failover has occurred and ZK is having issues,
    # so we do permit forcing it.
    if replica_type == host_utils.REPLICA_ROLE_MASTER and not force:
        raise Exception('Can not fence an instance which is a Master in zk')

    conn = boto.ec2.connect_to_region(environment_specific.EC2_REGION)
    instance_id = environment_specific.get_server_metadata(
        hostname.hostname)['id']
    log.info("{hostname} with instance id {id} will be fenced ".format(
        hostname=hostname, id=instance_id))

    if dry_run:
        log.info("Do not actually run, just exit now")
        os._exit(environment_specific.DRY_RUN_EXIT_CODE)
    conn.modify_instance_attribute(instance_id, 'groupSet', [SG_DB_FENCE_ID])
    log.info("Done.")
Example #2
0
def auto_add_instance_to_zk(instance, dry_run):
    """ Try to do right thing in adding a server to zk

    Args:
    instance - The replacement instance
    dry_run - If set, do not modify zk
    """
    try:
        conn = mysql_lib.get_mysqlops_connections()
        log.info('Determining replacement for '
                 '{hostname}'.format(hostname=instance.hostname))
        server_metadata = environment_specific.get_server_metadata(instance.hostname)
        if not server_metadata:
            raise Exception('CMDB lacks knowledge of replacement host')
        instance_id = server_metadata['id']
        role = determine_replacement_role(conn, instance_id)
        log.info('Adding server as role: {role}'.format(role=role))
    except Exception, e:
        log.exception(e)
        raise
def auto_add_instance_to_zk(instance, dry_run):
    """ Try to do right thing in adding a server to zk

    Args:
    instance - The replacement instance
    dry_run - If set, do not modify zk
    """
    try:
        conn = mysql_lib.get_mysqlops_connections()
        log.info('Determining replacement for '
                 '{hostname}'.format(hostname=instance.hostname))
        server_metadata = environment_specific.get_server_metadata(
            instance.hostname)
        if not server_metadata:
            raise Exception('CMDB lacks knowledge of replacement host')
        instance_id = server_metadata['id']
        role = determine_replacement_role(conn, instance_id)
        log.info('Adding server as role: {role}'.format(role=role))
    except Exception, e:
        log.exception(e)
        raise
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Availible keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'instance_type', and 'mysql_major_version'.
    reason - A description of why the host is being replaced. If the instance
             is still accessible and reason is not supply an exception will be
             thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set is {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance, we will refuse to run. No ifs, ands, or buts/
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        log.info('A replacement has already been requested: '
                 '{re}'.format(re=existing_replacement))
        if replace_again:
            log.info('Argument replace_again is set, continuing on.')
        else:
            age_of_replacement = datetime.datetime.now(
            ) - existing_replacement['created_at']
            if age_of_replacement.days < SERVER_BUILD_TIMEOUT:
                raise Exception('Argument replace_again is not True but a '
                                'replacement already exists.')
            else:
                log.info("A replacement already exists, but was launched "
                         "{days} days ago. The timeout for servers builds is "
                         "{timeout} days so we are automatically setting "
                         "replace_again.".format(days=age_of_replacement.days,
                                                 timeout=SERVER_BUILD_TIMEOUT))
                replace_again = True

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
        version_server = original_server
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')
        version_server = zk.get_mysql_instance_from_replica_set(
            original_server.get_zk_replica_set()[0],
            repl_type=host_utils.REPLICA_ROLE_MASTER)

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    if 'aws_status.codes' in cmdb_data:
        reasons.add(cmdb_data['aws_status.codes'])

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))
    replacement_config = {
        'availability_zone':
        cmdb_data['location'],
        'vpc_security_group':
        cmdb_data['security_groups'],
        'hostname':
        find_unused_server_name(original_server.get_standardized_replica_set(),
                                reporting_conn, dry_run),
        'instance_type':
        cmdb_data['config.instance_type'],
        'mysql_major_version':
        mysql_lib.get_global_variables(version_server)['version'][0:3],
        'mysql_minor_version':
        DEFAULT_MYSQL_MINOR_VERSION,
        'dry_run':
        dry_run,
        'skip_name_check':
        True
    }

    # At this point, all our defaults should be good to go
    config_overridden = False

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    if not dead_server:
        try:
            mysql_lib.assert_replication_sanity(original_server)
        except Exception as e:
            log.info('Replication problem: {e}'.format(e=e))
            reasons.add('replication broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception(('MySQL appears to be up and no reason for '
                         'replacement is supplied. You can specify a reason '
                         'with the --reason argument'))
    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Availible keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'classic_security_group',
                'instance_type', and 'mysql_major_version'.
    reason - A description of why the host is being replaced. If the instance
             is still accessible and reason is not supply an exception will be
             thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set is {replica_set}'.format(host=original_server.hostname,
                                                      replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance, we will refuse to run. No ifs, ands, or buts/
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        log.info('A replacement has already been requested: '
                 '{re}'.format(re=existing_replacement))
        if replace_again:
            log.info('Argument replace_again is set, continuing on.')
        else:
            age_of_replacement = datetime.datetime.now() - existing_replacement['created_at']
            if age_of_replacement.days < SERVER_BUILD_TIMEOUT:
                raise Exception('Argument replace_again is not True but a '
                                'replacement already exists.')
            else:
                log.info("A replacement already exists, but was launched "
                         "{days} ago. The timeout for servers builds is "
                         "{timeout} so we are automatically setting "
                         "replace_again.".format(days=age_of_replacement.days,
                                                 timeout=SERVER_BUILD_TIMEOUT))
                replace_again = True

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    if 'aws_status.codes' in cmdb_data:
        reasons.add(cmdb_data['aws_status.codes'])

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))
    replacement_config = {'availability_zone': cmdb_data['location'],
                          'hostname': find_unused_server_name(original_server.get_standardized_replica_set(),
                                                              reporting_conn, dry_run),
                          'instance_type': cmdb_data['config.instance_type'],
                          'mysql_major_version': get_master_mysql_major_version(original_server),
                          'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
                          'dry_run': dry_run,
                          'skip_name_check': True}

    if cmdb_data.pop('cloud.aws.vpc_id', None):
        # Existing server is in VPC
        replacement_config['classic_security_group'] = None
        replacement_config['vpc_security_group'] = cmdb_data['security_groups']
    else:
        # Existing server is in Classic
        replacement_config['classic_security_group'] = cmdb_data['security_groups']
        replacement_config['vpc_security_group'] = None

    # At this point, all our defaults should be good to go
    config_overridden = False
    if replacement_config['classic_security_group'] and overrides['vpc_security_group']:
        # a VPC migration
        vpc_migration(replacement_config, overrides)
        reasons.add('vpc migration')
        config_overridden = True

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                replacement_config[key] = overrides[key]
                reasons.add('changing {key} from {old} to '
                            '{old}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')

    if not dead_server:
        slave_status = mysql_lib.calc_slave_lag(original_server)
        if slave_status['ss']['Slave_SQL_Running'] != 'Yes':
            reasons.add('sql replication thread broken')

        if slave_status['ss']['Slave_IO_Running'] != 'Yes':
            reasons.add('io replication thread broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception(('MySQL appears to be up and no reason for '
                         'replacement is supplied. You can specify a reason'
                         'with the --reason argument'))
    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(**replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)
Example #6
0
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Availible keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'classic_security_group',
                'instance_type', and 'mysql_major_version'.
    reason - A description of why the host is being replaced. If the instance
             is still accessible and reason is not supply an exception will be
             thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set is {replica_set}'.format(host=original_server.hostname,
                                                      replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance, we will refuse to run. No ifs, ands, or buts/
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        if replace_again:
            log.info('A replacement has already been requested: '
                     '{new_host}'.format(new_host=existing_replacement))
        else:
            raise Exception('A replacement already exists, but '
                            'replace_again is not True')

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))
    replacement_config = {'availability_zone': cmdb_data['location'],
                          'hostname': find_unused_server_name(original_server.get_standardized_replica_set(),
                                                              reporting_conn, dry_run),
                          'instance_type': cmdb_data['config.instance_type'],
                          'mysql_major_version': get_master_mysql_major_version(original_server),
                          'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
                          'dry_run': dry_run,
                          'skip_name_check': True}

    if cmdb_data.pop('cloud.aws.vpc_id', None):
        # Existing server is in VPC
        replacement_config['classic_security_group'] = None
        replacement_config['vpc_security_group'] = cmdb_data['security_groups']
    else:
        # Existing server is in Classic
        replacement_config['classic_security_group'] = cmdb_data['security_groups']
        replacement_config['vpc_security_group'] = None

    # At this point, all our defaults should be good to go
    config_overridden = False
    if replacement_config['classic_security_group'] and overrides['vpc_security_group']:
        # a VPC migration
        vpc_migration(replacement_config, overrides)
        reasons.add('vpc migration')
        config_overridden = True

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                replacement_config[key] = overrides[key]
                reasons.add('changing {key} from {old} to '
                            '{old}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')

    if not dead_server:
        slave_status = mysql_lib.calc_slave_lag(original_server)
        if slave_status['ss']['Slave_SQL_Running'] != 'Yes':
            reasons.add('sql replication thread broken')

        if slave_status['ss']['Slave_IO_Running'] != 'Yes':
            reasons.add('io replication thread broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception(('MySQL appears to be up and no reason for '
                         'replacement is supplied'))
    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(**replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)