Esempio n. 1
0
def main():
    """ Print the GTID that corresponds to a timestamp on a replica set

    The instance given with -i/--instance is resolved to the master of its
    replica set via zk (if it is not already the master) and the positional
    timestamp is parsed using MYSQL_DT_FORMAT. Whatever
    find_gtid_for_timestamp() returns is printed to stdout.

    Exits with status 255 if the arguments can not be resolved or if no
    GTID is found.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('-i',
                        '--instance',
                        help='The instance to query.  This should '
                        'be the master of a replica set, but '
                        'if you supply a non-master, the script '
                        'will query the master anyway.')
    parser.add_argument('timestamp',
                        help='The timestamp to rewind to.  This must '
                        'be in MySQL format: YYYY-MM-DD HH:MM:SS')
    args = parser.parse_args()
    try:
        instance = host_utils.HostAddr(args.instance)
        zk = host_utils.MysqlZookeeper()
        rt = zk.get_replica_type_from_instance(instance)
        if rt != host_utils.REPLICA_ROLE_MASTER:
            instance = zk.get_mysql_instance_from_replica_set(
                zk.get_replica_set_from_instance(instance),
                host_utils.REPLICA_ROLE_MASTER)
            log.info('Detected master of {i} as {m}'.format(i=args.instance,
                                                            m=instance))
        timestamp = dt.datetime.strptime(args.timestamp, MYSQL_DT_FORMAT)
    except Exception as e:
        log.error("Error in argument parsing: {}".format(e))
        # instance and/or timestamp are unusable (possibly not even bound)
        # at this point, so stop here instead of failing later with a
        # NameError that hides the real problem.
        sys.exit(255)

    gtid = find_gtid_for_timestamp(instance, timestamp)
    if gtid:
        # Parenthesized form prints the same thing under the print
        # statement and the print function.
        print(gtid)
    else:
        sys.exit(255)
Esempio n. 2
0
def add_fence_to_host(hostname, dry_run, force=False):
    """ Replace the security groups of an instance with the fence group

        Args:
          hostname: A hostaddr object for the instance
          dry_run: If true, log what would be fenced and exit the process
                   before changing anything
          force: Fence the instance even if zk lists it as a master

        Raises:
          Exception: if the instance is a master in zk and force is not set
    """
    zk = host_utils.MysqlZookeeper()
    try:
        replica_type = zk.get_replica_type_from_instance(hostname)
    except:
        # A failed lookup is treated as "host is not in zk"
        log.info("{} is not in zk ".format(hostname))
        replica_type = None

    # Fencing a master is normally refused. After a failover zk may be
    # stale or unhealthy, which is why the caller is allowed to force it.
    is_master = (replica_type == host_utils.REPLICA_ROLE_MASTER)
    if is_master and not force:
        raise Exception('Can not fence an instance which is a Master in zk')

    conn = boto.ec2.connect_to_region(environment_specific.EC2_REGION)
    metadata = environment_specific.get_server_metadata(hostname.hostname)
    instance_id = metadata['id']
    log.info("{hostname} with instance id {id} will be fenced ".format(
        hostname=hostname, id=instance_id))

    if dry_run:
        log.info("Do not actually run, just exit now")
        os._exit(environment_specific.DRY_RUN_EXIT_CODE)

    conn.modify_instance_attribute(instance_id, 'groupSet', [SG_DB_FENCE_ID])
    log.info("Done.")
def main():
    """ Report whether csv backups (ETL) are running for a replica set

    The instance to inspect is taken from the command line and defaults to
    port 3306 on the local host. The outcome is reported through the
    exit_* helpers: an unknown error if the instance is neither a slave
    nor a dr_slave, "not running" if there is no counterpart slave in zk,
    if the check fails, or if no csv backup is running, and "running"
    otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Is ETL running on a different instance?")
    parser.add_argument('instance',
                        nargs='?',
                        help="Instance to inspect, default is localhost:3306",
                        default=host_utils.HOSTNAME + ':3306')
    instance = host_utils.HostAddr(parser.parse_args().instance)

    zk = host_utils.MysqlZookeeper()
    (replica_set, replica_type) = zk.get_replica_set_from_instance(instance)

    # A slave is paired with the dr_slave of its replica set and vice versa
    counterpart_role = {
        host_utils.REPLICA_ROLE_DR_SLAVE: host_utils.REPLICA_ROLE_SLAVE,
        host_utils.REPLICA_ROLE_SLAVE: host_utils.REPLICA_ROLE_DR_SLAVE,
    }
    if replica_type in counterpart_role:
        inst = zk.get_mysql_instance_from_replica_set(
            replica_set, counterpart_role[replica_type])
    else:
        exit_unknown_error()

    if not inst:
        # Without another slave in zk there is nothing that could be
        # running ETL elsewhere
        exit_other_slave_not_running_etl()

    try:
        running = mysql_backup_status.csv_backups_running(instance)
    except:
        exit_other_slave_not_running_etl()

    if not running:
        exit_other_slave_not_running_etl()

    exit_other_slave_running_etl()
def csv_backup_success_logged(instance, date):
    """ Look for a success row written by log_csv_backup_success

    Args:
    instance - A hostaddr object
    date - a string for the date

    Returns:
    True if a row exists for the date on the master of the replica set,
    False if there is none or if the log table does not exist
    """
    zk = host_utils.MysqlZookeeper()
    master = zk.get_mysql_instance_from_replica_set(
        zk.get_replica_set_from_instance(instance)[0])
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    log_table = environment_specific.CSV_BACKUP_LOG_TABLE
    table_present = mysql_lib.does_table_exist(master,
                                               mysql_lib.METADATA_DB,
                                               log_table)
    if not table_present:
        # Nothing can have been logged without the table
        return False

    query = ('SELECT COUNT(*) as "cnt" '
             'FROM {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
             'WHERE backup_date = %(date)s '
             ''.format(METADATA_DB=mysql_lib.METADATA_DB,
                       CSV_BACKUP_LOG_TABLE=log_table))
    cursor.execute(query, {'date': date})
    return bool(cursor.fetchone()["cnt"])
def prod_check(destination, skip_production_check):
    """ Make sure the destination may safely be overwritten by a restore

    Args:
    destination - Hostaddr object for where to restore the backup
    skip_production_check - If set, it is ok to run on slaves. A backup of
                            the destination is attempted first.

    Raises an Exception if the destination is a master in zk, or if it has
    any other role in zk and skip_production_check is not set.
    """
    zk = host_utils.MysqlZookeeper()
    try:
        replica_type = zk.get_replica_type_from_instance(destination)
    except:
        # A failed lookup is treated as "instance is not in production"
        replica_type = None

    # A master is never overwritten, no matter what the caller asked for
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        raise Exception('Restore script must never run on a master')

    if not replica_type:
        return

    if not skip_production_check:
        raise Exception("It appears {instance} is in use. This is"
                        " very dangerous!".format(instance=destination))

    log.info('Ignoring production check. We hope you know what you '
             'are doing and we will try to take a backup in case '
             'you are wrong.')
    try:
        mysql_backup.mysql_backup(destination)
    except Exception as e:
        log.error(e)
        log.warning('Unable to take a backup. We will give you {time} '
                    'seconds to change your mind and ^c.'
                    ''.format(time=SCARY_TIMEOUT))
        # Last chance for the operator to abort before data is overwritten
        time.sleep(SCARY_TIMEOUT)
Esempio n. 6
0
def check_schema(zk_prefix, tablename, tbl_hash):
    """Check the schema of a table on the master and slave of a whole tier

    Args:
    zk_prefix - Only entries of the ds mysql config whose key starts with
                this prefix are checked
    tablename - the name of the table to verify
    tbl_hash - the md5sum of the desired CREATE TABLE for the table

    Returns:
    A dictionary with keys that are the hash of the CREATE TABLE statement
    and the values are sets of hostname:port followed by a space and then the
    db on which the incorrect schema was found.
    """
    incorrect = dict()
    zk = host_utils.MysqlZookeeper()
    for (name, node) in zk.get_ds_mysql_config().iteritems():
        if not name.startswith(zk_prefix):
            continue

        # Both addresses are built before any instance is queried
        endpoints = [
            host_utils.HostAddr(node[role]['host'] + ':' +
                                str(node[role]['port']))
            for role in ('master', 'slave')
        ]
        for endpoint in endpoints:
            found = check_instance_table(endpoint, tablename, tbl_hash)
            for (schema_hash, locations) in found.iteritems():
                incorrect.setdefault(schema_hash, set()).update(locations)
    return incorrect
    def extend_backup_lock(self, lock_identifier, extend_lock_stop_event):
        """ Keep pushing out the expiry of a backup lock until told to stop.
            Meant to be run in its own thread.

        Args:
        lock_identifier - Corresponds to a lock identifier row in the
                          CSV_BACKUP_LOCK_TABLE_NAME.
        extend_lock_stop_event - An event that will be used to inform this
                                 thread to stop extending the lock
        """
        # The lock is assumed to have been created just before this call,
        # so "now" is used as the time of the most recent refresh.
        refreshed_at = time.time()
        while not extend_lock_stop_event.is_set():
            overdue = (time.time() - refreshed_at) > LOCK_EXTEND_FREQUENCY
            if overdue:
                # The master is looked up again for every refresh
                zk = host_utils.MysqlZookeeper()
                master = zk.get_mysql_instance_from_replica_set(
                    zk.get_replica_set_from_instance(self.instance),
                    host_utils.REPLICA_ROLE_MASTER)
                conn = mysql_lib.connect_mysql(master, role='dbascript')
                cursor = conn.cursor()

                statement = ('UPDATE {db}.{tbl} '
                             'SET expires = NOW() + INTERVAL {locks_held_time} '
                             'WHERE lock_identifier = %(lock_identifier)s'
                             '').format(db=mysql_lib.METADATA_DB,
                                        tbl=CSV_BACKUP_LOCK_TABLE_NAME,
                                        locks_held_time=LOCKS_HELD_TIME)
                cursor.execute(statement,
                               {'lock_identifier': lock_identifier})
                conn.commit()
                log.debug(cursor._executed)
                refreshed_at = time.time()
            # Wake up twice a second to check the stop event and the clock
            extend_lock_stop_event.wait(.5)
Esempio n. 8
0
def find_mysql_backup(replica_set, date, backup_type):
    """ Check whether or not a given replica set has a backup in S3

    The instances of the replica set are tried in the order of
    host_utils.REPLICA_TYPES.

    Args:
        replica_set: The replica set we're checking for.
        date: The date to search for.
        backup_type: The type of backup to search for; passed through to
                     backup.get_s3_backup.

    Returns:
        location: The location of the backup for this replica set.
                  Returns None if not found.
    """
    zk = host_utils.MysqlZookeeper()
    for repl_type in host_utils.REPLICA_TYPES:
        instance = zk.get_mysql_instance_from_replica_set(
            replica_set, repl_type)
        if not instance:
            continue

        try:
            backup_file = backup.get_s3_backup(instance, date, backup_type)
        except Exception:
            # we'll get a 404 if there was no s3 backup, but that's OK,
            # so we can just move on to the next one. Only Exception is
            # caught so that ^C and SystemExit still stop the search.
            continue

        if backup_file:
            return backup_file
        # A lookup that did not raise but came back empty ends the search
        break
    return None
Esempio n. 9
0
def check_schema(zk_prefix, tablename, tbl_hash):
    """Verify that a table across an entire tier has the expected schema

    Every role in host_utils.REPLICA_TYPES is checked for every replica set
    whose name starts with zk_prefix. Roles that have no instance in zk
    for a replica set are skipped.

    Args:
    zk_prefix - The prefix of the replica set names in ZK
    tablename - the name of the table to verify
    tbl_hash - the md5sum of the desired CREATE TABLE for the table

    Returns:
    A dictionary with keys that are the hash of the CREATE TABLE statement
    and the values are sets of hostname:port followed by a space and then the
    db on which the incorrect schema was found.
    """
    incorrect = dict()
    zk = host_utils.MysqlZookeeper()
    for replica_set in zk.get_all_mysql_replica_sets():
        if not replica_set.startswith(zk_prefix):
            continue

        for role in host_utils.REPLICA_TYPES:
            instance = zk.get_mysql_instance_from_replica_set(
                replica_set, role)
            if not instance:
                # Not every replica set has an instance in every role;
                # there is nothing to inspect for a missing one.
                continue

            hashes = check_instance_table(instance, tablename, tbl_hash)
            for (schema_hash, locations) in hashes.iteritems():
                incorrect.setdefault(schema_hash, set()).update(locations)
    return incorrect
def get_possible_sources(destination, backup_type):
    """ List the instances a backup could be restored from. The master is
        left out for logical backups because mysqldump 5.5 is not able to
        use both --master_data and --slave_data

    Args:
    destination - A hostAddr object
    backup_type - The backup type; compared against
                  backup.BACKUP_TYPE_LOGICAL

    Returns A list of hostAddr objects
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = destination.guess_zk_replica_set()

    # Lazily look up each eligible role, in REPLICA_TYPES order
    candidates = (
        zk.get_mysql_instance_from_replica_set(replica_set, role)
        for role in host_utils.REPLICA_TYPES
        if not (role == host_utils.REPLICA_ROLE_MASTER and
                backup_type == backup.BACKUP_TYPE_LOGICAL))

    # Roles without an instance in zk are dropped
    return [candidate for candidate in candidates if candidate]
def log_binlog_upload(instance, binlog):
    """ Record on the master that a binlog has been uploaded

    Args:
    instance - a hostAddr object
    binlog - the full path to the binlog file
    """
    zk = host_utils.MysqlZookeeper()
    # NOTE(review): the access time of the file is what gets stored as
    # binlog_creation - confirm that this is intended.
    accessed = os.stat(binlog).st_atime
    row = {
        'hostname': instance.hostname,
        'port': str(instance.port),
        'binlog': os.path.basename(binlog),
        'binlog_creation': datetime.datetime.fromtimestamp(accessed)
    }

    master = zk.get_mysql_instance_from_replica_set(
        zk.get_replica_set_from_instance(instance)[0])
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    statement = ("REPLACE INTO {metadata_db}.{tbl} "
                 "SET hostname = %(hostname)s, "
                 "    port = %(port)s, "
                 "    binlog = %(binlog)s, "
                 "    binlog_creation = %(binlog_creation)s, "
                 "    uploaded = NOW() ").format(
                     metadata_db=mysql_lib.METADATA_DB,
                     tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME)
    cursor.execute(statement, row)
    conn.commit()
def ensure_binlog_archiving_table_sanity(instance):
    """ Make sure the binlog archiving log table exists and drop old rows

    Args:
    instance - A hostAddr object. The work is done on the master of the
               replica set that the instance belongs to.
    """
    zk = host_utils.MysqlZookeeper()
    master = zk.get_mysql_instance_from_replica_set(
        zk.get_replica_set_from_instance(instance)[0])
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    table_name = environment_specific.BINLOG_ARCHIVING_TABLE_NAME
    table_present = mysql_lib.does_table_exist(master,
                                               mysql_lib.METADATA_DB,
                                               table_name)
    if not table_present:
        log.debug('Creating missing metadata table')
        create_sql = BINLOG_ARCHIVING_TABLE.format(db=mysql_lib.METADATA_DB,
                                                   tbl=table_name)
        cursor.execute(create_sql)

    # Rows are kept one day longer than the S3 retention setting
    keep_days = environment_specific.S3_BINLOG_RETENTION + 1
    purge_sql = ("DELETE FROM {metadata_db}.{tbl} "
                 "WHERE binlog_creation < now() - INTERVAL {d} DAY"
                 "").format(metadata_db=mysql_lib.METADATA_DB,
                            tbl=table_name,
                            d=keep_days)
    log.info(purge_sql)
    cursor.execute(purge_sql)
    conn.commit()
def find_mysql_backup(replica_set, date, backup_type):
    """ Check whether or not a given replica set has a backup in S3

    The instances of the replica set are tried in the order of
    host_utils.REPLICA_TYPES.

    Args:
        replica_set: The replica set we're checking for.
        date: The date to search for.
        backup_type: The type of backup to search for; passed through to
                     backup.get_s3_backup.

    Returns:
        location: The location of the backup for this replica set.
                  Returns None if not found.

    Raises:
        boto.exception.S3ResponseError: always propagated unchanged.
        Exception: any other error whose first argument does not contain
                   backup.NO_BACKUP.
    """
    zk = host_utils.MysqlZookeeper()
    for repl_type in host_utils.REPLICA_TYPES:
        instance = zk.get_mysql_instance_from_replica_set(
            replica_set, repl_type)
        if not instance:
            continue

        try:
            backup_file = backup.get_s3_backup(instance, date, backup_type)
        except boto.exception.S3ResponseError:
            raise
        except Exception as e:
            # e.args instead of e[0]: an exception without arguments must
            # be re-raised as itself, not replaced by an IndexError.
            detail = e.args[0] if e.args else ''
            if backup.NO_BACKUP not in detail:
                raise
            # No backup for this instance; try the next replica type
            continue

        if backup_file:
            return backup_file
        # A lookup that did not raise but came back empty ends the search
        break
    return None
Esempio n. 14
0
def config_read_only(host):
    """ Work out the read_only setting that belongs in the cnf file

    Args:
    host - a hostaddr object

    Returns:
    READ_ONLY_OFF for a master, READ_ONLY_ON for a slave or dr_slave. For
    a host with no usable role from zk, READ_ONLY_OFF if the
    TOUCH_FOR_WRITABLE_IF_NOT_IN_ZK file exists and READ_ONLY_ON otherwise.
    """
    zk = host_utils.MysqlZookeeper()
    try:
        _unused, replica_type = zk.get_replica_set_from_instance(host)
    except:
        # Whether the host is missing from zk or the lookup broke for some
        # other reason, the safest choice is to behave as if it is not in
        # zk, which leads to read_only ON unless the touch file exists.
        replica_type = None

    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        log.info('Server is considered a master, therefore read_only '
                 'should be OFF')
        return READ_ONLY_OFF

    replica_roles = (host_utils.REPLICA_ROLE_DR_SLAVE,
                     host_utils.REPLICA_ROLE_SLAVE)
    if replica_type in replica_roles:
        log.info('Server is considered a replica, therefore read_only '
                 'should be ON')
        return READ_ONLY_ON

    if os.path.isfile(TOUCH_FOR_WRITABLE_IF_NOT_IN_ZK):
        log.info('Server is not in zk and {path} exists, therefore read_only '
                 'should be OFF'
                 ''.format(path=TOUCH_FOR_WRITABLE_IF_NOT_IN_ZK))
        return READ_ONLY_OFF

    log.info('Server is not in zk and {path} does not exist, therefore '
             'read_only should be ON'
             ''.format(path=TOUCH_FOR_WRITABLE_IF_NOT_IN_ZK))
    return READ_ONLY_ON
def log_csv_backup_success(instance, date):
    """ Record that the CSV backup check for a date has succeeded, since
        the check itself can be expensive to repeat

    Args:
    instance - A hostaddr object
    date - a string for the date
    """
    zk = host_utils.MysqlZookeeper()
    master = zk.get_mysql_instance_from_replica_set(
        zk.get_replica_set_from_instance(instance)[0])
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    log_table = environment_specific.CSV_BACKUP_LOG_TABLE
    table_present = mysql_lib.does_table_exist(master,
                                               mysql_lib.METADATA_DB,
                                               log_table)
    if not table_present:
        print('Creating missing metadata table')
        create_sql = CSV_BACKUP_LOG_TABLE_DEFINITION.format(
            db=mysql_lib.METADATA_DB, tbl=log_table)
        cursor.execute(create_sql)

    # INSERT IGNORE: a date that is already logged is left untouched
    insert_sql = ('INSERT IGNORE INTO {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
                  'SET backup_date = %(date)s, '
                  'completion = NOW()'
                  ''.format(METADATA_DB=mysql_lib.METADATA_DB,
                            CSV_BACKUP_LOG_TABLE=log_table))
    cursor.execute(insert_sql, {'date': date})
    conn.commit()
def csv_backups_running(instance):
    """ Tell whether a csv dump is in progress on a slave of a replica set

    Args:
    instance - we will use this to determine the replica set

    Returns:
    True if the mysqldump role user is connected to the dr_slave or the
    slave of the replica set, False otherwise
    """
    dump_user, _unused = mysql_lib.get_mysql_user_for_role(
        backup.USER_ROLE_MYSQLDUMP)
    replica_set = instance.get_zk_replica_set()[0]
    zk = host_utils.MysqlZookeeper()

    # Evaluated lazily: the dr_slave is inspected first and the slave is
    # only looked up if the dump user was not found there.
    slaves = (zk.get_mysql_instance_from_replica_set(replica_set, role)
              for role in (host_utils.REPLICA_ROLE_DR_SLAVE,
                           host_utils.REPLICA_ROLE_SLAVE))
    return any(dump_user in mysql_lib.get_connected_users(slave)
               for slave in slaves if slave)
Esempio n. 17
0
def determine_replacement_role(conn, instance_id):
    """ Work out which replication role a replacement should take over

    The host being replaced is read from mysqlops.host_replacement_log and
    its role is looked up in zk.

    Args:
    conn - A connection to the reporting server
    instance_id - The id of the replacement instance

    Returns:
    The replication role of the host that is being replaced

    Raises an Exception if no replacement is logged for the instance, if
    the old host is a master, or if the old host has no role in zk.
    """
    zk = host_utils.MysqlZookeeper()
    cursor = conn.cursor()
    query = ("SELECT old_host "
             "FROM mysqlops.host_replacement_log "
             "WHERE new_instance = %(new_instance)s ")
    cursor.execute(query, {'new_instance': instance_id})
    log.info(cursor._executed)

    row = cursor.fetchone()
    if row is None:
        raise Exception('Could not determine replacement host')

    old_host = host_utils.HostAddr(row['old_host'])
    log.info('Host to be replaced is {old_host}'
             ''.format(old_host=old_host.hostname))

    _unused, repl_type = zk.get_replica_set_from_instance(old_host)

    if repl_type == host_utils.REPLICA_ROLE_MASTER:
        raise Exception('Corwardly refusing to replace a master!')
    if repl_type is None:
        raise Exception('Could not determine replacement role')
    return repl_type
    def ensure_backup_locks_sanity(self):
        """ Release every backup lock that is recorded for this instance's
            hostname and port, creating the lock table first if it does
            not exist. Such locks are considered not sane; the instance
            level lock should make that assumption correct.
        """
        zk = host_utils.MysqlZookeeper()
        master = zk.get_mysql_instance_from_replica_set(
            zk.get_replica_set_from_instance(self.instance),
            host_utils.REPLICA_ROLE_MASTER)
        conn = mysql_lib.connect_mysql(master, role='dbascript')
        cursor = conn.cursor()

        table_present = mysql_lib.does_table_exist(
            master, mysql_lib.METADATA_DB, CSV_BACKUP_LOCK_TABLE_NAME)
        if not table_present:
            log.debug('Creating missing metadata table')
            create_sql = CSV_BACKUP_LOCK_TABLE.format(
                db=mysql_lib.METADATA_DB, tbl=CSV_BACKUP_LOCK_TABLE_NAME)
            cursor.execute(create_sql)

        release_sql = ('UPDATE {db}.{tbl} '
                       'SET lock_active = NULL, released = NOW() '
                       'WHERE hostname = %(hostname)s AND '
                       '      port = %(port)s'
                       '').format(db=mysql_lib.METADATA_DB,
                                  tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        owner = {
            'hostname': self.instance.hostname,
            'port': self.instance.port
        }
        cursor.execute(release_sql, owner)
        conn.commit()
def manage_pt_heartbeat(instance):
    """
    Start pt-heartbeat on a master that is not running it and stop it
    on a non-master that is. Whether it is running is judged by the
    ptheartbeat user being among the connected users.

    Args:
        instance (host_utils.HostAddr): host to check for ptheartbeat

    Returns:
        None
    """
    connected_users = mysql_lib.get_connected_users(instance)
    zk = host_utils.MysqlZookeeper()
    try:
        replica_type = zk.get_replica_type_from_instance(instance)
    except:
        # A failed lookup is handled like any other non-master
        replica_type = None
    pthb_user, _unused = mysql_lib.get_mysql_user_for_role('ptheartbeat')

    is_master = (replica_type == host_utils.REPLICA_ROLE_MASTER)
    heartbeat_connected = pthb_user in connected_users

    if is_master and not heartbeat_connected:
        host_utils.manage_pt_heartbeat(instance.port)
        log.info('Started process pt-heartbeat')
    elif heartbeat_connected and not is_master:
        host_utils.manage_pt_heartbeat(instance.port, action='stop')
        log.info('Stopped pt-heartbeat on non-master replica')
    def partition_lock_exists(self, table_tuple):
        """ Check whether some other host already holds an active lock on
            a partition of a partitioned table. If it does, that table
            cannot be backed up here.
        Args:
            table_tuple - the tuple of table information; only its first
                          element (the table name) is used.

        Returns:
            True if there is such a lock, False if not.
        """
        zk = host_utils.MysqlZookeeper()
        master = zk.get_mysql_instance_from_replica_set(
            zk.get_replica_set_from_instance(self.instance),
            host_utils.REPLICA_ROLE_MASTER)
        conn = mysql_lib.connect_mysql(master, role='dbascript')
        cursor = conn.cursor()

        # Active locks on the table that were taken with a different
        # hostname but the same port as this instance
        query = ("SELECT COUNT(*) AS cnt FROM {db}.{tbl} WHERE "
                 "lock_active = %(active)s AND "
                 "table_name = %(table_name)s AND "
                 "hostname <> %(hostname)s AND "
                 "port = %(port)s").format(db=mysql_lib.METADATA_DB,
                                           tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        criteria = {
            'table_name': table_tuple[0],
            'hostname': self.instance.hostname,
            'port': self.instance.port,
            'active': ACTIVE
        }
        cursor.execute(query, criteria)
        lock_count = int(cursor.fetchone()['cnt'])
        return lock_count > 0
Esempio n. 21
0
def swap_slave_and_dr_slave(instance, dry_run):
    """ Swap a slave and a dr_slave in zk

    The slave entry lives in the zk node of the replica set and the
    dr_slave entry in the environment_specific.DR_ZK node; both nodes are
    rewritten, each guarded by the version that was read.

    Args:
    instance - An instance that is either a slave or dr_slave
    dry_run - If set, log the current and the new configuration but do not
              modify zk

    Raises an Exception if no zk connection is available, if the replica
    set is missing from the dr node, or if the dr node could not be
    written after the replica set node had already been updated.
    """
    zk_local = host_utils.MysqlZookeeper()
    kazoo_client = environment_specific.get_kazoo_client()
    if not kazoo_client:
        raise Exception('Could not get a zk connection')

    log.info('Instance is {}'.format(instance))
    replica_set = zk_local.get_replica_set_from_instance(instance)

    log.info('Detected replica_set as {}'.format(replica_set))
    (zk_node,
     parsed_data,
     version) = get_zk_node_for_replica_set(kazoo_client, replica_set)
    log.info('Replica set {replica_set} is held in zk_node '
             '{zk_node}'.format(zk_node=zk_node,
                                replica_set=replica_set))

    log.info('Existing config:')
    log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
    new_data = copy.deepcopy(parsed_data)

    dr_znode_data, dr_meta = kazoo_client.get(environment_specific.DR_ZK)
    dr_parsed_data = simplejson.loads(dr_znode_data)
    # It is the dr node that has to be checked here: parsed_data was
    # already indexed by replica_set above, so testing it again could
    # never fail, and a replica set missing from the dr node would surface
    # as a bare KeyError below.
    if replica_set not in dr_parsed_data:
        raise Exception('Replica set {replica_set} is not present '
                        'in dr_node'.format(replica_set=replica_set))
    new_dr_data = copy.deepcopy(dr_parsed_data)
    log.info('Existing dr config:')
    log.info(pprint.pformat(remove_auth(dr_parsed_data[replica_set])))

    # Cross over: the dr_slave becomes the slave and the other way round
    new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE] = \
        dr_parsed_data[replica_set][host_utils.REPLICA_ROLE_DR_SLAVE]
    new_dr_data[replica_set][host_utils.REPLICA_ROLE_DR_SLAVE] = \
        parsed_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]

    log.info('New config:')
    log.info(pprint.pformat(remove_auth(new_data[replica_set])))

    log.info('New dr config:')
    log.info(pprint.pformat(remove_auth(new_dr_data[replica_set])))

    if dry_run:
        log.info('dry_run is set, therefore not modifying zk')
    else:
        log.info('Pushing new configuration for '
                 '{replica_set}:'.format(replica_set=replica_set))
        kazoo_client.set(zk_node, simplejson.dumps(new_data), version)
        try:
            kazoo_client.set(environment_specific.DR_ZK,
                             simplejson.dumps(new_dr_data), dr_meta.version)
        except:
            # Catch-all kept on purpose: whatever interrupted this write,
            # the replica set node has already been changed, so the two
            # nodes are now out of step.
            raise Exception('DR node is incorrect due to a different change '
                            'blocking this change.  Manual intervention '
                            'is required.')
0
def terminate_instances(hostname=None, dry_run=False):
    """ Terminate the servers that are queued for termination

    A server is only terminated if it is not protected, is not present in
    zk and MySQL on it refuses connections. Servers failing one of the
    first two checks, or on which MySQL still answers, are removed from
    the retirement queue instead.

    Args:
    hostname - If supplied, only act on this host. If the host is not
               ready for termination this is logged and nothing is done.
    dry_run - If set, run all checks but neither terminate the instance
              nor log the termination to the retirement queue

    Raises MySQLdb.OperationalError if the connection test fails with any
    error other than mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR.
    """
    zk = host_utils.MysqlZookeeper()
    username, password = mysql_lib.get_mysql_user_for_role('admin')
    candidates = get_retirement_queue_servers(TERMINATE_INSTANCE)
    botoconn = boto.ec2.connect_to_region('us-east-1')

    if hostname:
        if hostname in candidates:
            log.info('Only acting on {hostname}'.format(hostname=hostname))
            candidates = {hostname: candidates[hostname]}
        else:
            log.info('Supplied host {hostname} is not ready '
                     'for termination'.format(hostname=hostname))
            return

    for host in candidates:
        if host in get_protected_hosts('set'):
            log.warning('Host {hostname} is protected from '
                        'retirement'.format(hostname=host))
            remove_from_retirement_queue(host)
            continue

        # A host that zk still knows about must never be terminated. The
        # check is done up front so that the `continue` skips the host
        # itself; inside a loop over the zk instances it would only move
        # on to the next zk instance and the host would still be
        # terminated further down.
        zk_instances = [instance for instance in zk.get_all_mysql_instances()
                        if instance.hostname == host]
        if zk_instances:
            for instance in zk_instances:
                log.warning("It appears {instance} is in zk. This is "
                            "very dangerous!".format(instance=instance))
            remove_from_retirement_queue(host)
            continue

        log.info('Confirming mysql is down on '
                 '{hostname}'.format(hostname=host))

        try:
            with timeout.timeout(3):
                conn = MySQLdb.connect(
                    host=candidates[host]['internal_ip'],
                    user=username,
                    passwd=password,
                    cursorclass=MySQLdb.cursors.DictCursor)
            # Being able to connect means MySQL is still up
            log.error('Did not get MYSQL_ERROR_CONN_HOST_ERROR, removing {} '
                      'from queue'.format(host))
            conn.close()
            remove_from_retirement_queue(host)
            continue
        except MySQLdb.OperationalError as detail:
            (error_code, msg) = detail.args
            if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
                raise
            log.info('MySQL is down')

        instance_id = candidates[host]['instance_id']
        log.info('Terminating instance '
                 '{instance}'.format(instance=instance_id))
        if dry_run:
            log.info('In dry_run mode, not changing state')
        else:
            botoconn.terminate_instances(instance_ids=[instance_id])
            log_to_retirement_queue(host, instance_id, TERMINATE_INSTANCE)
0
def verify_schema_for_migration(source_replica_set, destination_replica_set,
                                databases, confirm_row_counts):
    """ Confirm that source and destination have schema and row counts in sync

    Table existence and table definitions are compared on the masters of
    the two replica sets; row counts are compared on their slaves.

    Args:
    source_replica_set - The name of the source replica set
    destination_replica_set - The name of the destination replica set
    databases - A set of databases to check
    confirm_row_counts - If True, check that row counts are very close to
                         synchronized, otherwise do a very cursory check

    Raises an Exception listing every problem found, if there are any.
    """
    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(source_replica_set)
    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)
    source_slave = zk.get_mysql_instance_from_replica_set(
        source_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    destination_slave = zk.get_mysql_instance_from_replica_set(
        destination_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    problems = list()
    for db in databases:
        source_tables = mysql_lib.get_tables(source_master, db)
        destination_tables = mysql_lib.get_tables(destination_master, db)

        differences = source_tables.symmetric_difference(destination_tables)
        if differences:
            problems.append('Found table existence mismatch in db {db}: {dif}'
                            ''.format(db=db, dif=differences))

        for table in source_tables:
            if table not in destination_tables:
                # Already reported as an existence mismatch above. There
                # is no destination table to compare against, so skip the
                # definition and row count checks.
                continue

            source_def = mysql_lib.show_create_table(source_master,
                                                     db,
                                                     table,
                                                     standardize=True)

            destination_def = mysql_lib.show_create_table(destination_master,
                                                          db,
                                                          table,
                                                          standardize=True)

            if source_def != destination_def:
                problems.append('Table definition mismatch db {db} '
                                'table {table}'
                                ''.format(db=db, table=table))

            cnt_problem = check_row_counts(source_slave,
                                           destination_slave,
                                           db,
                                           table,
                                           exact=confirm_row_counts)
            if cnt_problem:
                problems.append(cnt_problem)

    if problems:
        raise Exception('. '.join(problems))

    log.info('Schema and data appear to be in *NSYNC')
Esempio n. 24
0
def mysql_backup(instance,
                 backup_type=backup.BACKUP_TYPE_XBSTREAM,
                 initial_build=False):
    """ Run a file based backup on a supplied local instance

    Replication sanity is asserted first for any instance that is found in
    ZooKeeper and is not a master. The run is recorded through
    mysql_lib.start_backup_log / mysql_lib.finalize_backup_log, and the
    backup itself runs while holding a flock on backup.BACKUP_LOCK_FILE.

    Args:
    instance - A hostaddr object
    backup_type - backup.BACKUP_TYPE_LOGICAL or backup.BACKUP_TYPE_XBSTREAM
    initial_build - Boolean, if this is being created right after the server
                    was built

    Raises:
    Exception - if backup_type is not one of the supported types
    """
    log.info('Confirming sanity of replication (if applicable)')
    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(instance)
    except Exception:
        # instance is not in production. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still stop the backup.
        replica_type = None

    if replica_type and replica_type != host_utils.REPLICA_ROLE_MASTER:
        mysql_lib.assert_replication_sanity(instance)

    log.info('Logging initial status to mysqlops')
    start_timestamp = time.localtime()
    lock_handle = None
    backup_id = mysql_lib.start_backup_log(instance, backup_type,
                                           start_timestamp)

    # Take a lock to prevent multiple backups from running concurrently
    try:
        log.info('Taking backup lock')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        # Actually run the backup
        log.info('Running backup')
        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            backup_file = backup.xtrabackup_instance(instance, start_timestamp,
                                                     initial_build)
        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            backup_file = backup.logical_backup_instance(
                instance, start_timestamp, initial_build)
        else:
            raise Exception('Unsupported backup type {backup_type}'
                            ''.format(backup_type=backup_type))
    finally:
        # lock_handle is still None if taking the lock itself failed, in
        # which case there is nothing to release.
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)

    # Update database with additional info now that backup is done.
    if backup_id:
        log.info("Updating database log entry with final backup info")
        mysql_lib.finalize_backup_log(backup_id, backup_file)
    else:
        log.info("The backup is complete, but we were not able to "
                 "write to the central log DB.")
Esempio n. 25
0
def launch_restores_as_needed(dry_run=True):
    """ Launch a bunch of hosts to test restore process

    Replica sets are considered from the oldest "days since last restore"
    to the newest. A replacement is launched for a replica set when its age
    exceeds AGE_START_TESTING, or while fewer than min_test_launches()
    replacements have been chosen in this run.

    Args:
    dry_run - Don't actually launch hosts (selections are still logged
              and counted)

    Raises:
    Exception - if the number of launches already exceeds MAX_LAUNCHED
    """
    zk = host_utils.MysqlZookeeper()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    try:
        results = pool.map(backup.get_age_last_restore,
                           zk.get_all_mysql_replica_sets())
    finally:
        # The workers are only needed for the map above; shut them down so
        # the processes do not linger for the rest of the run.
        pool.close()
        pool.join()

    # Group the results by their first element (used below as the age in
    # days); the second element is treated as the replica set.
    restore_age = dict()
    for result in results:
        if not result:
            continue
        restore_age.setdefault(result[0], set()).add(result[1])

    launched = 0
    min_launches = min_test_launches()
    log.info('Current restore age: {}'.format(pprint.pformat(restore_age)))
    for days in sorted(restore_age.keys(), reverse=True):
        for replica_set in restore_age[days]:
            launch = False
            if launched > MAX_LAUNCHED:
                raise Exception('Cowardly refusing to consider launching '
                                'servers as we have launched {launched} which '
                                'is greater than the limit of {max_launched}'
                                ''.format(launched=launched,
                                          max_launched=MAX_LAUNCHED))
            elif days > AGE_START_TESTING:
                launch = True
                log.info(
                    'Will replace a host in {rs} as days since last restore '
                    'is {days} days and we will always launch after '
                    '{always} days'
                    ''.format(rs=replica_set,
                              days=days,
                              always=AGE_START_TESTING))
            elif launched < min_launches:
                launch = True
                log.info('Will replace a host in {rs} as launched '
                         '{launched} < min {min}'
                         ''.format(rs=replica_set,
                                   launched=launched,
                                   min=min_launches))

            if launch:
                # Counted even on a dry run or a failed launch, so the
                # limits above apply to attempts rather than successes.
                launched = launched + 1
                if not dry_run:
                    try:
                        launch_a_slave_replacement(replica_set)
                    except Exception as e:
                        log.error('Could not launch replacement due to error: '
                                  '{e}'.format(e=e))
Esempio n. 26
0
def get_sharded_db_missing_uploads(args):
    """ Check to see if all backups are present

    Args: A tuple which can be expanded to:
        table_tuple - a tuple of (db.table, partition_name, partition_num)
        date - passed to backup.get_csv_backup_paths to build the expected
               S3 key for each shard
        shards -  a set of shards
        dev_bucket - check the dev bucket instead of the prod bucket?

    Returns: a dict holding the table name that was checked ('table') and
             the set of expected S3 keys which are not uploaded for the
             table in question ('missing_uploads').
    """
    (table_tuple, date, shards, dev_bucket) = args
    table_name = table_tuple[0].split('.')[1]

    if not shards:
        # Nothing is expected, so nothing can be missing. Returning here
        # also avoids listing (and cleaning zero-length keys from) the
        # bucket with no prefix to scope the listing to this table.
        return {'table': table_name, 'missing_uploads': set()}

    zk = host_utils.MysqlZookeeper()
    expected_s3_keys = set()
    prefix = None

    for shard in shards:
        (replica_set, db) = zk.map_shard_to_replica_and_db(shard)
        instance = zk.get_mysql_instance_from_replica_set(
            replica_set, repl_type=host_utils.REPLICA_ROLE_SLAVE)
        (_, data_path,
         _) = backup.get_csv_backup_paths(instance, db, table_name, date,
                                          table_tuple[2])
        expected_s3_keys.add(data_path)
        if not prefix:
            # The directory of the first expected key scopes the listing.
            prefix = os.path.dirname(data_path)

    boto_conn = boto.connect_s3()
    bucket_name = (environment_specific.S3_CSV_BUCKET_DEV if dev_bucket
                   else environment_specific.S3_CSV_BUCKET)
    bucket = boto_conn.get_bucket(bucket_name, validate=False)
    uploaded_keys = set()
    for key in bucket.list(prefix=prefix):
        if key.size > 0:
            uploaded_keys.add(key.name)
            continue

        # If we have a zero-length file that doesn't start with
        # an underscore, it shouldn't be here. Keys whose name ends in '/'
        # have an empty basename and are left alone rather than crashing
        # on the first-character lookup.
        basename = key.name.split('/')[-1]
        if basename and not basename.startswith('_'):
            key.delete()

    missing_uploads = expected_s3_keys.difference(uploaded_keys)

    # Iterate over a snapshot since entries are discarded along the way.
    for entry in list(missing_uploads):
        # The list API occasionally has issues, so we will recheck any missing
        # entries. If any are actually missing we will quit checking because
        # there is definitely work that needs to be done
        k = bucket.get_key(entry)
        if not (k and k.size > 0):
            break
        print('List method did not return data for key:{}'.format(entry))
        missing_uploads.discard(entry)

    return {'table': table_name, 'missing_uploads': missing_uploads}
Esempio n. 27
0
def min_test_launches():
    """ Figure out what is the least number of test launches we should run

    Returns an int of the fewest test launches we should run: the number of
    replica sets known to ZooKeeper divided by AGE_ALARM, rounded down.
    """
    zk = host_utils.MysqlZookeeper()
    # So the idea here is that often an upgrade will cause a large burst of
    # replacements which will then potentially cause not many servers to be
    # launched for a while. This will smooth out the number of servers
    # launched.
    # Floor division keeps the result an int regardless of the interpreter's
    # division semantics.
    return len(zk.get_all_mysql_replica_sets()) // AGE_ALARM
Esempio n. 28
0
def check_replication_for_migration(source_replica_set,
                                    destination_replica_set):
    """ Confirm that replication is sane for finishing a shard migration

    Args:
    source_replica_set - Where shards are coming from
    destination_replica_set - Where shards are being sent

    Raises:
    Exception - if the source master has a slave status, or if the
                destination master is not replicating from the source
                master. The lag assertions from mysql_lib may raise too.
    """
    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(source_replica_set)
    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)
    source_slave = zk.get_mysql_instance_from_replica_set(
        source_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    destination_slave = zk.get_mysql_instance_from_replica_set(
        destination_replica_set, host_utils.REPLICA_ROLE_SLAVE)

    # First we will confirm that the slave of the source is caught up
    # this is important for row count comparisons
    mysql_lib.assert_replication_unlagged(
        source_slave, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # Next, the slave of the destination replica set for the same reason
    mysql_lib.assert_replication_unlagged(
        destination_slave, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # Next, the destination master is relatively caught up to the source master
    mysql_lib.assert_replication_unlagged(
        destination_master, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # We will also verify that the source master is not replicating. A scary
    # scenario is if there is some sort of ring replication going and db
    # drops of blackhole db's would propagate to the source db.
    try:
        source_slave_status = mysql_lib.get_slave_status(source_master)
    except mysql_lib.ReplicationError:
        # A ReplicationError is taken to mean no replication is configured,
        # which is the desired state for the source master.
        source_slave_status = None

    if source_slave_status:
        raise Exception('Source master is setup for replication '
                        'this is super dangerous!')

    # We will also verify that the destination master is replicating from the
    # source master
    slave_status = mysql_lib.get_slave_status(destination_master)
    master_of_destination_master = host_utils.HostAddr(':'.join(
        (slave_status['Master_Host'], str(slave_status['Master_Port']))))
    if source_master != master_of_destination_master:
        # The expected upstream is the source master; report that rather
        # than echoing the destination master back in the message.
        raise Exception('Master of destination {d} is {actual} rather than '
                        'expected {expected} '
                        ''.format(d=destination_master,
                                  actual=master_of_destination_master,
                                  expected=source_master))
    log.info('Replication looks ok for migration')
def create_maxwell_config(client_id,
                          instance,
                          exclude_dbs=None,
                          target='kafka',
                          gtid_mode='true'):
    """ Render the Maxwell template and write it to MAXWELL_CONF_FILE.

    Args:
        client_id = The server_uuid
        instance = The instance the config is being written for
        exclude_dbs = Extra databases to exclude; appended to the always
                      excluded mysql and test
        target = Where Maxwell should output: kafka or a file
        gtid_mode = True if this is a GTID cluster, false otherwise
    Returns:
        Nothing
    """
    with open(os.path.join(RELATIVE_DIR, MAXWELL_TEMPLATE), 'r') as src:
        template = src.read()

    (username, password) = mysql_lib.get_mysql_user_for_role('maxwell')
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)

    # Hosts whose prefix is listed in either of these settings use the
    # hostname prefix as namespace; everything else uses the replica set.
    prefix = instance.hostname_prefix
    prefix_is_listed = (
        prefix in environment_specific.FLEXSHARD_DBS
        or prefix in environment_specific.SHARDED_DBS_PREFIX)
    namespace = prefix if prefix_is_listed else replica_set

    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    log.info('Writing file {}'.format(MAXWELL_CONF_FILE))

    if exclude_dbs:
        excluded = ','.join(['mysql', 'test', exclude_dbs])
    else:
        excluded = 'mysql,test'

    # The output settings are looked up by the master's hostname prefix.
    target_map = environment_specific.MAXWELL_TARGET_MAP[
        master.hostname_prefix]
    with open(MAXWELL_CONF_FILE, "w") as dst:
        fields = {
            'master_host': master.hostname,
            'master_port': master.port,
            'instance_host': instance.hostname,
            'instance_port': instance.port,
            'username': username,
            'password': password,
            'kafka_topic': target_map['kafka_topic'],
            'kafka_servers': target_map['kafka_servers'],
            'generator': target_map['generator'],
            'zen_service': target_map['zen_service'],
            'client_id': client_id,
            'output': target,
            'excludes': excluded,
            'gtid_mode': gtid_mode,
            'namespace': namespace,
        }
        dst.write(template.format(**fields))
def restart_maxwell_if_not_exists(instance):
    """ Start Maxwell if it isn't currently running.

    The Maxwell config is rewritten on every call, whether or not Maxwell
    ends up being started.

    Args:
        instance: (host_utils.HostAddr): host to check
    Returns:
        none
    """
    zk = host_utils.MysqlZookeeper()
    replica_type = zk.get_replica_type_from_instance(instance)
    gvars = mysql_lib.get_global_variables(instance)

    client_id = gvars['server_uuid']
    gtid_mode = gvars.get('gtid_mode') == 'ON'
    # NOTE(review): the returned credentials are not used here; the lookup
    # is kept in case callers rely on it failing when the role is missing
    # -- confirm and drop if not.
    mysql_lib.get_mysql_user_for_role('maxwell')

    output_target = 'file'

    # master writes to kafka, everything else writes to /dev/null,
    # at least for now.
    if instance.hostname_prefix in environment_specific.MAXWELL_TARGET_MAP \
            and replica_type == host_utils.REPLICA_ROLE_MASTER:
        output_target = 'kafka'

    # we need to rewrite the config each time, because something may
    # have changed - i.e., a failover.  this is just a stopgap solution
    # pending resolution of LP-809
    mysql_cnf_builder.create_maxwell_config(client_id, instance, None,
                                            output_target, gtid_mode)

    # Check for the Maxwell PID file and then see if it belongs to Maxwell.
    maxwell_running = False
    try:
        with open(environment_specific.MAXWELL_PID, "r") as f:
            pid = f.read()

        proc = psutil.Process(int(pid))
        cmdline = proc.cmdline()

        if 'java' in cmdline and 'com.zendesk.maxwell.Maxwell' in cmdline:
            maxwell_running = True

    except (IOError, ValueError, psutil.NoSuchProcess, psutil.ZombieProcess):
        # No PID file, a PID file that is empty or does not hold a number,
        # or no process matching said PID, so maxwell is definitely
        # not running. If maxwell is a zombie then it's not running either.
        pass

    if maxwell_running:
        log.debug('Maxwell is already running')
        return

    if instance.hostname_prefix in environment_specific.MAXWELL_TARGET_MAP:
        host_utils.manage_maxwell(instance.port)
        log.info('Started Maxwell process')