def main():
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('-i',
                        '--instance',
                        help='The instance to query. This should '
                             'be the master of a replica set, but '
                             'if you supply a non-master, the script '
                             'will query the master anyway.')
    parser.add_argument('timestamp',
                        help='The timestamp to rewind to. This must '
                             'be in MySQL format: YYYY-MM-DD HH:MM:SS')
    args = parser.parse_args()

    try:
        instance = host_utils.HostAddr(args.instance)
        zk = host_utils.MysqlZookeeper()
        rt = zk.get_replica_type_from_instance(instance)
        if rt != host_utils.REPLICA_ROLE_MASTER:
            instance = zk.get_mysql_instance_from_replica_set(
                zk.get_replica_set_from_instance(instance),
                host_utils.REPLICA_ROLE_MASTER)
            log.info('Detected master of {i} as {m}'.format(i=args.instance,
                                                            m=instance))
        timestamp = dt.datetime.strptime(args.timestamp, MYSQL_DT_FORMAT)
    except Exception as e:
        log.error("Error in argument parsing: {}".format(e))
        # Parsing failed, so instance/timestamp are unusable; bail out
        # rather than falling through to undefined variables.
        sys.exit(255)

    gtid = find_gtid_for_timestamp(instance, timestamp)
    if gtid:
        print gtid
    else:
        sys.exit(255)

def add_fence_to_host(hostname, dry_run, force=False):
    """ Add a host to fence SG group

    Args:
    hostname: A hostaddr object for the instance
    dry_run: Really do it or not?
    force: Force it, even if a master in ZK
    """
    zk = host_utils.MysqlZookeeper()
    try:
        replica_type = zk.get_replica_type_from_instance(hostname)
    except:
        log.info("{} is not in zk".format(hostname))
        replica_type = None

    # We generally don't allow fencing a master, but there could be
    # cases where a failover has occurred and ZK is having issues,
    # so we do permit forcing it.
    if replica_type == host_utils.REPLICA_ROLE_MASTER and not force:
        raise Exception('Can not fence an instance which is a Master in zk')

    conn = boto.ec2.connect_to_region(environment_specific.EC2_REGION)
    instance_id = environment_specific.get_server_metadata(
        hostname.hostname)['id']

    log.info("{hostname} with instance id {id} will be fenced".format(
        hostname=hostname, id=instance_id))

    if dry_run:
        log.info("Do not actually run, just exit now")
        os._exit(environment_specific.DRY_RUN_EXIT_CODE)

    conn.modify_instance_attribute(instance_id, 'groupSet', [SG_DB_FENCE_ID])
    log.info("Done.")

def main():
    parser = argparse.ArgumentParser(description="Is ETL running on a "
                                                 "different instance?")
    parser.add_argument('instance',
                        nargs='?',
                        help="Instance to inspect, default is localhost:3306",
                        default=''.join((host_utils.HOSTNAME, ':3306')))
    args = parser.parse_args()
    instance = host_utils.HostAddr(args.instance)

    zk = host_utils.MysqlZookeeper()
    (replica_set, replica_type) = zk.get_replica_set_from_instance(instance)
    if replica_type == host_utils.REPLICA_ROLE_DR_SLAVE:
        inst = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_SLAVE)
    elif replica_type == host_utils.REPLICA_ROLE_SLAVE:
        inst = zk.get_mysql_instance_from_replica_set(
            replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
    else:
        exit_unknown_error()

    if not inst:
        # If there is no other slave in zk, ETL cannot be running
        # elsewhere, so this is ok.
        exit_other_slave_not_running_etl()

    try:
        running = mysql_backup_status.csv_backups_running(instance)
    except:
        exit_other_slave_not_running_etl()

    if not running:
        exit_other_slave_not_running_etl()

    exit_other_slave_running_etl()

def csv_backup_success_logged(instance, date):
    """ Check for log entries created by log_csv_backup_success

    Args:
    instance - A hostaddr object
    date - a string for the date

    Returns:
    True if already backed up, False otherwise
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    if not mysql_lib.does_table_exist(
            master, mysql_lib.METADATA_DB,
            environment_specific.CSV_BACKUP_LOG_TABLE):
        return False

    sql = ('SELECT COUNT(*) as "cnt" '
           'FROM {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
           'WHERE backup_date = %(date)s '
           ''.format(
               METADATA_DB=mysql_lib.METADATA_DB,
               CSV_BACKUP_LOG_TABLE=environment_specific.CSV_BACKUP_LOG_TABLE))
    cursor.execute(sql, {'date': date})
    if cursor.fetchone()["cnt"]:
        return True
    else:
        return False

def prod_check(destination, skip_production_check):
    """ Confirm it is ok to overwrite the destination instance

    Args:
    destination - Hostaddr object for where to restore the backup
    skip_production_check - If set, it is ok to run on slaves
    """
    zk = host_utils.MysqlZookeeper()
    try:
        replica_type = zk.get_replica_type_from_instance(destination)
    except:
        # instance is not in production
        replica_type = None
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run.
        # No ifs, ands, or buts.
        raise Exception('Restore script must never run on a master')
    if replica_type:
        if skip_production_check:
            log.info('Ignoring production check. We hope you know what you '
                     'are doing and we will try to take a backup in case '
                     'you are wrong.')
            try:
                mysql_backup.mysql_backup(destination)
            except Exception as e:
                log.error(e)
                log.warning('Unable to take a backup. We will give you '
                            '{time} seconds to change your mind and ^c.'
                            ''.format(time=SCARY_TIMEOUT))
                time.sleep(SCARY_TIMEOUT)
        else:
            raise Exception("It appears {instance} is in use. This is"
                            " very dangerous!".format(instance=destination))

def check_schema(zk_prefix, tablename, tbl_hash):
    """Verify that a table across an entire tier has the expected schema

    Args:
    zk_prefix - The prefix of the key in the DS ZK node
    tablename - the name of the table to verify
    tbl_hash - the md5sum of the desired CREATE TABLE for the table

    Returns:
    A dictionary whose keys are the hashes of the CREATE TABLE statements
    and whose values are sets of hostname:port followed by a space and then
    the db on which the incorrect schema was found.
    """
    incorrect = dict()
    zk = host_utils.MysqlZookeeper()
    config = zk.get_ds_mysql_config()
    for db in config.iteritems():
        if db[0].startswith(zk_prefix):
            master = host_utils.HostAddr(''.join((db[1]['master']['host'],
                                                  ':',
                                                  str(db[1]['master']['port']))))
            slave = host_utils.HostAddr(''.join((db[1]['slave']['host'],
                                                 ':',
                                                 str(db[1]['slave']['port']))))
            master_hashes = check_instance_table(master, tablename, tbl_hash)
            slave_hashes = check_instance_table(slave, tablename, tbl_hash)
            for entry in master_hashes.iteritems():
                if entry[0] not in incorrect:
                    incorrect[entry[0]] = set()
                incorrect[entry[0]] = incorrect[entry[0]].union(entry[1])

            for entry in slave_hashes.iteritems():
                if entry[0] not in incorrect:
                    incorrect[entry[0]] = set()
                incorrect[entry[0]] = incorrect[entry[0]].union(entry[1])
    return incorrect

def extend_backup_lock(self, lock_identifier, extend_lock_stop_event):
    """ Extend a backup lock. This is to be used by a thread

    Args:
    lock_identifier - Corresponds to a lock identifier row in the
                      CSV_BACKUP_LOCK_TABLE_NAME.
    extend_lock_stop_event - An event that will be used to inform this
                             thread to stop extending the lock
    """
    # Assumption is that this is called right after creating the lock
    last_update = time.time()
    while not extend_lock_stop_event.is_set():
        if (time.time() - last_update) > LOCK_EXTEND_FREQUENCY:
            zk = host_utils.MysqlZookeeper()
            replica_set = zk.get_replica_set_from_instance(self.instance)
            master = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_MASTER)
            master_conn = mysql_lib.connect_mysql(master, role='dbascript')
            cursor = master_conn.cursor()

            params = {'lock_identifier': lock_identifier}
            sql = ('UPDATE {db}.{tbl} '
                   'SET expires = NOW() + INTERVAL {locks_held_time} '
                   'WHERE lock_identifier = %(lock_identifier)s'
                   '').format(db=mysql_lib.METADATA_DB,
                              tbl=CSV_BACKUP_LOCK_TABLE_NAME,
                              locks_held_time=LOCKS_HELD_TIME)
            cursor.execute(sql, params)
            master_conn.commit()
            log.debug(cursor._executed)
            last_update = time.time()
        extend_lock_stop_event.wait(.5)

def find_mysql_backup(replica_set, date, backup_type):
    """ Check whether or not a given replica set has a backup in S3

    Args:
    replica_set: The replica set we're checking for.
    date: The date to search for.

    Returns:
    location: The location of the backup for this replica set.
              Returns None if not found.
    """
    zk = host_utils.MysqlZookeeper()
    for repl_type in host_utils.REPLICA_TYPES:
        instance = zk.get_mysql_instance_from_replica_set(replica_set,
                                                          repl_type)
        if instance:
            try:
                backup_file = backup.get_s3_backup(instance, date,
                                                   backup_type)
                if backup_file:
                    return backup_file
            except:
                # we'll get a 404 if there was no s3 backup, but that's OK,
                # so we can just move on to the next one.
                pass
    return None

def check_schema(zk_prefix, tablename, tbl_hash):
    """Verify that a table across an entire tier has the expected schema

    Args:
    zk_prefix - The prefix of the key in ZK
    tablename - the name of the table to verify
    tbl_hash - the md5sum of the desired CREATE TABLE for the table

    Returns:
    A dictionary whose keys are the hashes of the CREATE TABLE statements
    and whose values are sets of hostname:port followed by a space and then
    the db on which the incorrect schema was found.
    """
    incorrect = dict()
    zk = host_utils.MysqlZookeeper()
    for replica_set in zk.get_all_mysql_replica_sets():
        if not replica_set.startswith(zk_prefix):
            continue

        for role in host_utils.REPLICA_TYPES:
            instance = zk.get_mysql_instance_from_replica_set(replica_set,
                                                              role)
            hashes = check_instance_table(instance, tablename, tbl_hash)
            for entry in hashes.iteritems():
                if entry[0] not in incorrect:
                    incorrect[entry[0]] = set()
                incorrect[entry[0]] = incorrect[entry[0]].union(entry[1])
    return incorrect

def get_possible_sources(destination, backup_type):
    """ Get possible sources to restore a backup from. This is required
        due to mysqldump 5.5 not being able to use both --master_data
        and --slave_data

    Args:
    destination - A hostAddr object
    backup_type - backup.BACKUP_TYPE_LOGICAL or
                  backup.BACKUP_TYPE_XTRABACKUP

    Returns:
    A list of hostAddr objects
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = destination.guess_zk_replica_set()
    possible_sources = []
    for role in host_utils.REPLICA_TYPES:
        if (role == host_utils.REPLICA_ROLE_MASTER and
                backup_type == backup.BACKUP_TYPE_LOGICAL):
            continue
        else:
            instance = zk.get_mysql_instance_from_replica_set(replica_set,
                                                              role)
            if instance:
                possible_sources.append(instance)
    return possible_sources

def log_binlog_upload(instance, binlog):
    """ Log to the master that a binlog has been uploaded

    Args:
    instance - a hostAddr object
    binlog - the full path to the binlog file
    """
    zk = host_utils.MysqlZookeeper()
    binlog_creation = datetime.datetime.fromtimestamp(
        os.stat(binlog).st_atime)
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()
    sql = ("REPLACE INTO {metadata_db}.{tbl} "
           "SET hostname = %(hostname)s, "
           "    port = %(port)s, "
           "    binlog = %(binlog)s, "
           "    binlog_creation = %(binlog_creation)s, "
           "    uploaded = NOW() ").format(
               metadata_db=mysql_lib.METADATA_DB,
               tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME)
    metadata = {'hostname': instance.hostname,
                'port': str(instance.port),
                'binlog': os.path.basename(binlog),
                'binlog_creation': binlog_creation}
    cursor.execute(sql, metadata)
    conn.commit()

def ensure_binlog_archiving_table_sanity(instance):
    """ Create binlog archiving log table if missing, purge old data

    Args:
    instance - A hostAddr object. Note: this function will find the
               master of the instance if the instance is not a master
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()
    if not mysql_lib.does_table_exist(
            master, mysql_lib.METADATA_DB,
            environment_specific.BINLOG_ARCHIVING_TABLE_NAME):
        log.debug('Creating missing metadata table')
        cursor.execute(BINLOG_ARCHIVING_TABLE.format(
            db=mysql_lib.METADATA_DB,
            tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME))
    sql = ("DELETE FROM {metadata_db}.{tbl} "
           "WHERE binlog_creation < now() - INTERVAL {d} DAY"
           "").format(metadata_db=mysql_lib.METADATA_DB,
                      tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME,
                      d=(environment_specific.S3_BINLOG_RETENTION + 1))
    log.info(sql)
    cursor.execute(sql)
    conn.commit()

def find_mysql_backup(replica_set, date, backup_type):
    """ Check whether or not a given replica set has a backup in S3

    Args:
    replica_set: The replica set we're checking for.
    date: The date to search for.

    Returns:
    location: The location of the backup for this replica set.
              Returns None if not found.
    """
    zk = host_utils.MysqlZookeeper()
    for repl_type in host_utils.REPLICA_TYPES:
        instance = zk.get_mysql_instance_from_replica_set(replica_set,
                                                          repl_type)
        if instance:
            try:
                backup_file = backup.get_s3_backup(instance, date,
                                                   backup_type)
                if backup_file:
                    return backup_file
            except boto.exception.S3ResponseError:
                raise
            except Exception as e:
                if backup.NO_BACKUP not in e[0]:
                    raise
    return None

def config_read_only(host):
    """ Determine how read_only should be set in the cnf file

    Args:
    host - a hostaddr object

    Returns:
    The string value of READ_ONLY_OFF or READ_ONLY_ON.
    """
    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(host)
    except:
        # If it is not in zk OR there is any other error, the safest thing is
        # to treat it as if it was not in zk and therefore read_only set
        # to ON
        replica_type = None

    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        log.info('Server is considered a master, therefore read_only '
                 'should be OFF')
        return READ_ONLY_OFF
    elif replica_type in (host_utils.REPLICA_ROLE_DR_SLAVE,
                          host_utils.REPLICA_ROLE_SLAVE):
        log.info('Server is considered a replica, therefore read_only '
                 'should be ON')
        return READ_ONLY_ON
    elif os.path.isfile(TOUCH_FOR_WRITABLE_IF_NOT_IN_ZK):
        log.info('Server is not in zk and {path} exists, therefore '
                 'read_only should be OFF'
                 ''.format(path=TOUCH_FOR_WRITABLE_IF_NOT_IN_ZK))
        return READ_ONLY_OFF
    else:
        log.info('Server is not in zk and {path} does not exist, therefore '
                 'read_only should be ON'
                 ''.format(path=TOUCH_FOR_WRITABLE_IF_NOT_IN_ZK))
        return READ_ONLY_ON

def log_csv_backup_success(instance, date):
    """ The CSV backup check can be expensive, so let's log that it is done

    Args:
    instance - A hostaddr object
    date - a string for the date
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()
    if not mysql_lib.does_table_exist(
            master, mysql_lib.METADATA_DB,
            environment_specific.CSV_BACKUP_LOG_TABLE):
        print 'Creating missing metadata table'
        cursor.execute(CSV_BACKUP_LOG_TABLE_DEFINITION.format(
            db=mysql_lib.METADATA_DB,
            tbl=environment_specific.CSV_BACKUP_LOG_TABLE))
    sql = ('INSERT IGNORE INTO {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
           'SET backup_date = %(date)s, '
           'completion = NOW()'
           ''.format(
               METADATA_DB=mysql_lib.METADATA_DB,
               CSV_BACKUP_LOG_TABLE=environment_specific.CSV_BACKUP_LOG_TABLE))
    cursor.execute(sql, {'date': date})
    conn.commit()

def csv_backups_running(instance):
    """ Check to see if csv dumps are running

    Args:
    instance - we will use this to determine the replica set

    Returns:
    True if backups are running, False otherwise
    """
    (dump_user, _) = mysql_lib.get_mysql_user_for_role(
        backup.USER_ROLE_MYSQLDUMP)
    replica_set = instance.get_zk_replica_set()[0]
    zk = host_utils.MysqlZookeeper()

    for slave_role in [host_utils.REPLICA_ROLE_DR_SLAVE,
                       host_utils.REPLICA_ROLE_SLAVE]:
        slave_instance = zk.get_mysql_instance_from_replica_set(replica_set,
                                                                slave_role)
        if not slave_instance:
            continue

        if dump_user in mysql_lib.get_connected_users(slave_instance):
            return True

    return False

def determine_replacement_role(conn, instance_id):
    """ Try to determine the role an instance should be placed into

    Args:
    conn - A connection to the reporting server
    instance_id - The id of the replacement instance

    Returns:
    The replication role which should be either 'slave' or 'dr_slave'
    """
    zk = host_utils.MysqlZookeeper()
    cursor = conn.cursor()
    sql = ("SELECT old_host "
           "FROM mysqlops.host_replacement_log "
           "WHERE new_instance = %(new_instance)s ")
    params = {'new_instance': instance_id}
    cursor.execute(sql, params)
    log.info(cursor._executed)
    result = cursor.fetchone()
    if result is None:
        raise Exception('Could not determine replacement host')
    old_host = host_utils.HostAddr(result['old_host'])
    log.info('Host to be replaced is {old_host}'
             ''.format(old_host=old_host.hostname))

    (_, repl_type) = zk.get_replica_set_from_instance(old_host)

    if repl_type == host_utils.REPLICA_ROLE_MASTER:
        raise Exception('Cowardly refusing to replace a master!')
    elif repl_type is None:
        raise Exception('Could not determine replacement role')
    else:
        return repl_type

def ensure_backup_locks_sanity(self):
    """ Release any backup locks that aren't sane. This means locks
        created by the same host as the caller. The instance level lock
        should allow this assumption to be correct.
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(self.instance)
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='dbascript')
    cursor = master_conn.cursor()

    if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                      CSV_BACKUP_LOCK_TABLE_NAME):
        log.debug('Creating missing metadata table')
        cursor.execute(CSV_BACKUP_LOCK_TABLE.format(
            db=mysql_lib.METADATA_DB,
            tbl=CSV_BACKUP_LOCK_TABLE_NAME))

    params = {'hostname': self.instance.hostname,
              'port': self.instance.port}
    sql = ('UPDATE {db}.{tbl} '
           'SET lock_active = NULL, released = NOW() '
           'WHERE hostname = %(hostname)s AND '
           '      port = %(port)s'
           '').format(db=mysql_lib.METADATA_DB,
                      tbl=CSV_BACKUP_LOCK_TABLE_NAME)
    cursor.execute(sql, params)
    master_conn.commit()

def manage_pt_heartbeat(instance):
    """ Start pt-heartbeat if it isn't currently running and the replica
        role type is master, or stop it if it is running on a non-master.

    Args:
    instance (host_utils.HostAddr): host to check for pt-heartbeat

    Returns:
    None
    """
    connected_users = mysql_lib.get_connected_users(instance)
    zk = host_utils.MysqlZookeeper()
    try:
        replica_type = zk.get_replica_type_from_instance(instance)
    except:
        replica_type = None
    pthb_user, pthb_pass = mysql_lib.get_mysql_user_for_role('ptheartbeat')
    if replica_type == host_utils.REPLICA_ROLE_MASTER and \
            pthb_user not in connected_users:
        host_utils.manage_pt_heartbeat(instance.port)
        log.info('Started process pt-heartbeat')
    elif replica_type != host_utils.REPLICA_ROLE_MASTER and \
            pthb_user in connected_users:
        host_utils.manage_pt_heartbeat(instance.port, action='stop')
        log.info('Stopped pt-heartbeat on non-master replica')

def partition_lock_exists(self, table_tuple):
    """ Find out if there is already a lock on one partition of a
        partitioned table from a host other than us. If so, we cannot
        backup that table here.

    Args:
    table_tuple - the tuple of table information.

    Returns:
    True if there is such a lock, False if not.
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(self.instance)
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='dbascript')
    cursor = master_conn.cursor()

    params = {'table_name': table_tuple[0],
              'hostname': self.instance.hostname,
              'port': self.instance.port,
              'active': ACTIVE}
    sql = ("SELECT COUNT(*) AS cnt FROM {db}.{tbl} WHERE "
           "lock_active = %(active)s AND "
           "table_name = %(table_name)s AND "
           "hostname <> %(hostname)s AND "
           "port = %(port)s").format(db=mysql_lib.METADATA_DB,
                                     tbl=CSV_BACKUP_LOCK_TABLE_NAME)
    cursor.execute(sql, params)
    row = int(cursor.fetchone()['cnt'])
    return (row > 0)

def swap_slave_and_dr_slave(instance, dry_run):
    """ Swap a slave and a dr_slave in zk

    Args:
    instance - An instance that is either a slave or dr_slave
    """
    zk_local = host_utils.MysqlZookeeper()
    kazoo_client = environment_specific.get_kazoo_client()
    if not kazoo_client:
        raise Exception('Could not get a zk connection')

    log.info('Instance is {}'.format(instance))
    replica_set = zk_local.get_replica_set_from_instance(instance)
    log.info('Detected replica_set as {}'.format(replica_set))
    (zk_node,
     parsed_data,
     version) = get_zk_node_for_replica_set(kazoo_client, replica_set)
    log.info('Replica set {replica_set} is held in zk_node '
             '{zk_node}'.format(zk_node=zk_node, replica_set=replica_set))
    log.info('Existing config:')
    log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
    new_data = copy.deepcopy(parsed_data)

    dr_znode_data, dr_meta = kazoo_client.get(environment_specific.DR_ZK)
    dr_parsed_data = simplejson.loads(dr_znode_data)
    new_dr_data = copy.deepcopy(dr_parsed_data)
    if replica_set not in dr_parsed_data:
        raise Exception('Replica set {replica_set} is not present '
                        'in dr_node'.format(replica_set=replica_set))
    log.info('Existing dr config:')
    log.info(pprint.pformat(remove_auth(dr_parsed_data[replica_set])))

    new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE] = \
        dr_parsed_data[replica_set][host_utils.REPLICA_ROLE_DR_SLAVE]
    new_dr_data[replica_set][host_utils.REPLICA_ROLE_DR_SLAVE] = \
        parsed_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]

    log.info('New config:')
    log.info(pprint.pformat(remove_auth(new_data[replica_set])))
    log.info('New dr config:')
    log.info(pprint.pformat(remove_auth(new_dr_data[replica_set])))

    if dry_run:
        log.info('dry_run is set, therefore not modifying zk')
    else:
        log.info('Pushing new configuration for '
                 '{replica_set}:'.format(replica_set=replica_set))
        kazoo_client.set(zk_node, simplejson.dumps(new_data), version)
        try:
            kazoo_client.set(environment_specific.DR_ZK,
                             simplejson.dumps(new_dr_data),
                             dr_meta.version)
        except:
            raise Exception('DR node is incorrect due to a different change '
                            'blocking this change. Manual intervention '
                            'is required.')

def terminate_instances(hostname=None, dry_run=False):
    zk = host_utils.MysqlZookeeper()
    username, password = mysql_lib.get_mysql_user_for_role('admin')
    terminate_instances = get_retirement_queue_servers(TERMINATE_INSTANCE)
    botoconn = boto.ec2.connect_to_region('us-east-1')

    if hostname:
        if hostname in terminate_instances:
            log.info('Only acting on {hostname}'.format(hostname=hostname))
            terminate_instances = {hostname: terminate_instances[hostname]}
        else:
            log.info('Supplied host {hostname} is not ready '
                     'for termination'.format(hostname=hostname))
            return

    for hostname in terminate_instances:
        if hostname in get_protected_hosts('set'):
            log.warning('Host {hostname} is protected from '
                        'retirement'.format(hostname=hostname))
            remove_from_retirement_queue(hostname)
            continue
        for instance in zk.get_all_mysql_instances():
            if instance.hostname == hostname:
                log.warning("It appears {instance} is in zk. This is "
                            "very dangerous!".format(instance=instance))
                remove_from_retirement_queue(hostname)
                continue

        log.info('Confirming mysql is down on '
                 '{hostname}'.format(hostname=hostname))

        try:
            with timeout.timeout(3):
                conn = MySQLdb.connect(
                    host=terminate_instances[hostname]['internal_ip'],
                    user=username,
                    passwd=password,
                    cursorclass=MySQLdb.cursors.DictCursor)
            log.error('Did not get MYSQL_ERROR_CONN_HOST_ERROR, removing {} '
                      'from queue'.format(hostname))
            conn.close()
            remove_from_retirement_queue(hostname)
            continue
        except MySQLdb.OperationalError as detail:
            (error_code, msg) = detail.args
            if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
                raise
            log.info('MySQL is down')

        log.info('Terminating instance '
                 '{instance}'.format(
                     instance=terminate_instances[hostname]['instance_id']))
        if dry_run:
            log.info('In dry_run mode, not changing state')
        else:
            botoconn.terminate_instances(
                instance_ids=[terminate_instances[hostname]['instance_id']])
            log_to_retirement_queue(
                hostname,
                terminate_instances[hostname]['instance_id'],
                TERMINATE_INSTANCE)

def verify_schema_for_migration(source_replica_set, destination_replica_set,
                                databases, confirm_row_counts):
    """ Confirm that source and destination have schema and row counts
        in sync

    Args:
    source_replica_set - The replica set shards are coming from
    destination_replica_set - The replica set shards are being sent to
    databases - A set of databases to check
    confirm_row_counts - If True, check that row counts are very close to
                         synchronized, otherwise do a very cursory check
    """
    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(source_replica_set)
    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)
    source_slave = zk.get_mysql_instance_from_replica_set(
        source_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    destination_slave = zk.get_mysql_instance_from_replica_set(
        destination_replica_set, host_utils.REPLICA_ROLE_SLAVE)

    problems = list()
    for db in databases:
        source_tables = mysql_lib.get_tables(source_master, db)
        destination_tables = mysql_lib.get_tables(destination_master, db)
        differences = source_tables.symmetric_difference(destination_tables)
        if differences:
            problems.append('Found table existence mismatch in db {db}: '
                            '{dif}'.format(db=db, dif=differences))

        for table in source_tables:
            if table not in destination_tables:
                # Already reported above as an existence mismatch
                continue

            source_def = mysql_lib.show_create_table(source_master,
                                                     db,
                                                     table,
                                                     standardize=True)
            destination_def = mysql_lib.show_create_table(destination_master,
                                                          db,
                                                          table,
                                                          standardize=True)
            if source_def != destination_def:
                problems.append('Table definition mismatch db {db} '
                                'table {table}'
                                ''.format(db=db, table=table))

            cnt_problem = check_row_counts(source_slave,
                                           destination_slave,
                                           db,
                                           table,
                                           exact=confirm_row_counts)
            if cnt_problem:
                problems.append(cnt_problem)

    if problems:
        raise Exception('. '.join(problems))
    log.info('Schema and data appear to be in *NSYNC')

def mysql_backup(instance,
                 backup_type=backup.BACKUP_TYPE_XBSTREAM,
                 initial_build=False):
    """ Run a file based backup on a supplied local instance

    Args:
    instance - A hostaddr object
    backup_type - backup.BACKUP_TYPE_LOGICAL or backup.BACKUP_TYPE_XBSTREAM
    initial_build - Boolean, if this is being created right after the
                    server was built
    """
    log.info('Confirming sanity of replication (if applicable)')
    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(instance)
    except:
        # instance is not in production
        replica_type = None

    if replica_type and replica_type != host_utils.REPLICA_ROLE_MASTER:
        mysql_lib.assert_replication_sanity(instance)

    log.info('Logging initial status to mysqlops')
    start_timestamp = time.localtime()
    lock_handle = None
    backup_id = mysql_lib.start_backup_log(instance, backup_type,
                                           start_timestamp)

    # Take a lock to prevent multiple backups from running concurrently
    try:
        log.info('Taking backup lock')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        # Actually run the backup
        log.info('Running backup')
        if backup_type == backup.BACKUP_TYPE_XBSTREAM:
            backup_file = backup.xtrabackup_instance(instance,
                                                     start_timestamp,
                                                     initial_build)
        elif backup_type == backup.BACKUP_TYPE_LOGICAL:
            backup_file = backup.logical_backup_instance(instance,
                                                         start_timestamp,
                                                         initial_build)
        else:
            raise Exception('Unsupported backup type {backup_type}'
                            ''.format(backup_type=backup_type))
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)

    # Update database with additional info now that backup is done.
    if backup_id:
        log.info("Updating database log entry with final backup info")
        mysql_lib.finalize_backup_log(backup_id, backup_file)
    else:
        log.info("The backup is complete, but we were not able to "
                 "write to the central log DB.")

def launch_restores_as_needed(dry_run=True):
    """ Launch a bunch of hosts to test restore process

    Args:
    dry_run - Don't actually launch hosts
    """
    zk = host_utils.MysqlZookeeper()
    launched = 0
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    results = pool.map(backup.get_age_last_restore,
                       zk.get_all_mysql_replica_sets())
    restore_age = dict()
    for result in results:
        if not result:
            continue
        if result[0] not in restore_age:
            restore_age[result[0]] = set()
        restore_age[result[0]].add(result[1])

    launched = 0
    min_launches = min_test_launches()
    log.info('Current restore age: {}'
             ''.format(pprint.pformat(restore_age)))
    for days in sorted(restore_age.keys(), reverse=True):
        for replica_set in restore_age[days]:
            launch = False
            if launched > MAX_LAUNCHED:
                raise Exception('Cowardly refusing to consider launching '
                                'servers as we have launched {launched} '
                                'which is greater than the limit of '
                                '{max_launched}'
                                ''.format(launched=launched,
                                          max_launched=MAX_LAUNCHED))
            elif days > AGE_START_TESTING:
                launch = True
                log.info('Will replace a host in {rs} as days since last '
                         'restore is {days} days and we will always launch '
                         'after {always} days'
                         ''.format(rs=replica_set,
                                   days=days,
                                   always=AGE_START_TESTING))
            elif launched < min_launches:
                launch = True
                log.info('Will replace a host in {rs} as launched '
                         '{launched} < min {min}'
                         ''.format(rs=replica_set,
                                   launched=launched,
                                   min=min_launches))

            if launch:
                launched = launched + 1
                if not dry_run:
                    try:
                        launch_a_slave_replacement(replica_set)
                    except Exception as e:
                        log.error('Could not launch replacement due to '
                                  'error: {e}'.format(e=e))

def get_sharded_db_missing_uploads(args):
    """ Check to see if all backups are present

    Args: A tuple which can be expanded to:
    table_tuple - a tuple of (db.table, partition_name, partition_num)
    date - the date to check
    shards - a set of shards
    dev_bucket - check the dev bucket instead of the prod bucket?

    Returns:
    The table name that was checked and a set of shards which are not
    backed up for the table in question.
    """
    (table_tuple, date, shards, dev_bucket) = args
    zk = host_utils.MysqlZookeeper()
    expected_s3_keys = set()
    prefix = None
    table_name = table_tuple[0].split('.')[1]

    for shard in shards:
        (replica_set, db) = zk.map_shard_to_replica_and_db(shard)
        instance = zk.get_mysql_instance_from_replica_set(
            replica_set, repl_type=host_utils.REPLICA_ROLE_SLAVE)
        (_, data_path, _) = backup.get_csv_backup_paths(instance, db,
                                                        table_name, date,
                                                        table_tuple[2])
        expected_s3_keys.add(data_path)
        if not prefix:
            prefix = os.path.dirname(data_path)

    boto_conn = boto.connect_s3()
    bucket_name = environment_specific.S3_CSV_BUCKET_DEV if dev_bucket \
        else environment_specific.S3_CSV_BUCKET
    bucket = boto_conn.get_bucket(bucket_name, validate=False)
    uploaded_keys = set()
    for key in bucket.list(prefix=prefix):
        if key.size > 0:
            uploaded_keys.add(key.name)
        elif key.name.split('/')[-1][0] != '_':
            # If we have a zero-length file that doesn't start with
            # an underscore, it shouldn't be here.
            key.delete()
    missing_uploads = expected_s3_keys.difference(uploaded_keys)

    for entry in copy.copy(missing_uploads):
        # The list API occasionally has issues, so we will recheck any
        # missing entries. If any are actually missing we will quit checking
        # because there is definitely work that needs to be done.
        k = bucket.get_key(entry)
        if k and k.size > 0:
            print 'List method did not return data for key:{}'.format(entry)
            missing_uploads.discard(entry)
        else:
            return ({'table': table_name,
                     'missing_uploads': missing_uploads})

    return ({'table': table_name, 'missing_uploads': missing_uploads})

def min_test_launches():
    """ Figure out the minimum number of test launches we should run

    Returns an int of the minimum number of test launches we should run
    """
    zk = host_utils.MysqlZookeeper()
    # So the idea here is that often an upgrade will cause a large burst of
    # replacements which will then potentially cause not many servers to be
    # launched for a while. This will smooth out the number of servers
    # launched.
    return len(zk.get_all_mysql_replica_sets()) / AGE_ALARM

def check_replication_for_migration(source_replica_set,
                                    destination_replica_set):
    """ Confirm that replication is sane for finishing a shard migration

    Args:
    source_replica_set - Where shards are coming from
    destination_replica_set - Where shards are being sent
    """
    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(source_replica_set)
    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)
    source_slave = zk.get_mysql_instance_from_replica_set(
        source_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    destination_slave = zk.get_mysql_instance_from_replica_set(
        destination_replica_set, host_utils.REPLICA_ROLE_SLAVE)

    # First we will confirm that the slave of the source is caught up;
    # this is important for row count comparisons
    mysql_lib.assert_replication_unlagged(
        source_slave, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # Next, the slave of the destination replica set for the same reason
    mysql_lib.assert_replication_unlagged(
        destination_slave, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # Next, the destination master is relatively caught up to the source
    # master
    mysql_lib.assert_replication_unlagged(
        destination_master, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # We will also verify that the source master is not replicating. A scary
    # scenario is if there is some sort of ring replication going on and
    # drops of blackhole dbs would propagate to the source db.
    try:
        source_slave_status = mysql_lib.get_slave_status(source_master)
    except mysql_lib.ReplicationError:
        source_slave_status = None

    if source_slave_status:
        raise Exception('Source master is setup for replication; '
                        'this is super dangerous!')

    # We will also verify that the destination master is replicating from
    # the source master
    slave_status = mysql_lib.get_slave_status(destination_master)
    master_of_destination_master = host_utils.HostAddr(':'.join(
        (slave_status['Master_Host'], str(slave_status['Master_Port']))))
    if source_master != master_of_destination_master:
        raise Exception('Master of destination {d} is {actual} rather than '
                        'expected {expected} '
                        ''.format(d=destination_master,
                                  actual=master_of_destination_master,
                                  expected=source_master))

    log.info('Replication looks ok for migration')

def create_maxwell_config(client_id,
                          instance,
                          exclude_dbs=None,
                          target='kafka',
                          gtid_mode='true'):
    """ Create the maxwell config file.

    Args:
    client_id = The server_uuid
    instance = What instance is this?
    exclude_dbs = Exclude these databases (in addition to mysql and test)
    target = Output to kafka or a file (which will be /dev/null)
    gtid_mode = True if this is a GTID cluster, false otherwise

    Returns: Nothing
    """
    template_path = os.path.join(RELATIVE_DIR, MAXWELL_TEMPLATE)
    with open(template_path, 'r') as f:
        template = f.read()

    (username, password) = mysql_lib.get_mysql_user_for_role('maxwell')
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)

    hostname_prefix = instance.hostname_prefix
    if hostname_prefix in environment_specific.FLEXSHARD_DBS or \
            hostname_prefix in environment_specific.SHARDED_DBS_PREFIX:
        namespace = hostname_prefix
    else:
        namespace = replica_set

    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)

    log.info('Writing file {}'.format(MAXWELL_CONF_FILE))
    excluded = ','.join(['mysql', 'test', exclude_dbs]) if exclude_dbs \
        else 'mysql,test'
    target_map = environment_specific.MAXWELL_TARGET_MAP[
        master.hostname_prefix]

    with open(MAXWELL_CONF_FILE, "w") as f:
        f.write(template.format(master_host=master.hostname,
                                master_port=master.port,
                                instance_host=instance.hostname,
                                instance_port=instance.port,
                                username=username,
                                password=password,
                                kafka_topic=target_map['kafka_topic'],
                                kafka_servers=target_map['kafka_servers'],
                                generator=target_map['generator'],
                                zen_service=target_map['zen_service'],
                                client_id=client_id,
                                output=target,
                                excludes=excluded,
                                gtid_mode=gtid_mode,
                                namespace=namespace))

def restart_maxwell_if_not_exists(instance):
    """ Start Maxwell if it isn't currently running.

    Args:
    instance: (host_utils.HostAddr): host to check

    Returns: none
    """
    zk = host_utils.MysqlZookeeper()
    replica_type = zk.get_replica_type_from_instance(instance)
    gvars = mysql_lib.get_global_variables(instance)
    client_id = gvars['server_uuid']
    gtid_mode = True if gvars.get('gtid_mode') == 'ON' else False
    (username, _) = mysql_lib.get_mysql_user_for_role('maxwell')
    output_target = 'file'

    # master writes to kafka, everything else writes to /dev/null,
    # at least for now.
    if instance.hostname_prefix in environment_specific.MAXWELL_TARGET_MAP \
            and replica_type == host_utils.REPLICA_ROLE_MASTER:
        output_target = 'kafka'

    # we need to rewrite the config each time, because something may
    # have changed - i.e., a failover. this is just a stopgap solution
    # pending resolution of LP-809
    mysql_cnf_builder.create_maxwell_config(client_id, instance, None,
                                            output_target, gtid_mode)

    # Check for the Maxwell PID file and then see if it belongs to Maxwell.
    maxwell_running = False
    try:
        with open(environment_specific.MAXWELL_PID, "r") as f:
            pid = f.read()
        proc = psutil.Process(int(pid))
        cmdline = proc.cmdline()
        if 'java' in cmdline and 'com.zendesk.maxwell.Maxwell' in cmdline:
            maxwell_running = True
    except (IOError, psutil.NoSuchProcess, psutil.ZombieProcess):
        # No PID file or no process matching said PID, so maxwell is
        # definitely not running. If maxwell is a zombie then it's not
        # running either.
        pass

    if maxwell_running:
        log.debug('Maxwell is already running')
        return

    if instance.hostname_prefix in environment_specific.MAXWELL_TARGET_MAP:
        host_utils.manage_maxwell(instance.port)
        log.info('Started Maxwell process')