def is_master_alive(master, replicas):
    """ Decide whether the master of a replica set is still reachable

    A direct MySQL connection to the master is attempted first. When that
    fails with MYSQL_ERROR_CONN_HOST_ERROR, replication is restarted on every
    replica and the state of its IO thread is inspected.

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instances

    Returns:
    A mysql connection to the master if the master is alive, False if no
    replica reports a running IO thread.

    Raises:
    Exception - if no replicas are supplied, or if a replica still reports a
                running IO thread while this host can not connect.
    Any connection error other than MYSQL_ERROR_CONN_HOST_ERROR is re-raised.
    """
    if not len(replicas):
        raise Exception('At least one replica must be present to determine '
                        'a master is dead')

    try:
        return mysql_lib.connect_mysql(master)
    except MySQLdb.OperationalError as conn_err:
        (err_no, _) = conn_err.args
        if err_no != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        unreachable_msg = ('Unable to connect to current master {master} from '
                           '{hostname}, will check replica servers beforce '
                           'declaring the master dead')
        log.info(unreachable_msg.format(master=master,
                                        hostname=host_utils.HOSTNAME))
    except:
        log.info('This is an unknown connection error. If you are very sure '
                 'that the master is dead, please put a "return False" at the '
                 'top of is_master_alive and then send rwultsch a stack trace')
        raise

    # The master could not be reached from here, so ask every replica
    for replica in replicas:
        replica_conn = mysql_lib.connect_mysql(replica)

        # Until replication hits a timeout, a replica of a dead master can
        # still believe everything is fine. Restarting replication forces
        # the replica to re-evaluate its connection to the master.
        mysql_lib.restart_replication(replica_conn)
        io_state = mysql_lib.get_slave_status(replica_conn)['Slave_IO_Running']

        if io_state != 'Yes':
            log.info('Replica {replica} also can not connect to master '
                     '{master}.'.format(replica=replica, master=master))
            continue

        raise Exception('Replica {replica} thinks it can connect to '
                        'master {master}, but failover script can not. '
                        'Possible network partition!'
                        ''.format(replica=replica, master=master))

    return False
def extend_backup_lock(self, lock_identifier, extend_lock_stop_event):
    """ Keep a backup lock alive until told to stop. Meant to run in a thread

    Args:
    lock_identifier - Corresponds to a lock identifier row in the
                      CSV_BACKUP_LOCK_TABLE_NAME.
    extend_lock_stop_event - An event that will be used to inform this
                             thread to stop extending the lock
    """
    extend_sql = ('UPDATE {db}.{tbl} '
                  'SET expires = NOW() + INTERVAL {locks_held_time} '
                  'WHERE lock_identifier = %(lock_identifier)s'
                  '').format(db=mysql_lib.METADATA_DB,
                             tbl=CSV_BACKUP_LOCK_TABLE_NAME,
                             locks_held_time=LOCKS_HELD_TIME)

    # The lock is assumed to have been created just before this call
    last_update = time.time()
    while not extend_lock_stop_event.is_set():
        since_update = time.time() - last_update
        if since_update > LOCK_EXTEND_FREQUENCY:
            # Look the master up on every extension
            zk = host_utils.MysqlZookeeper()
            replica_set = zk.get_replica_set_from_instance(self.instance)
            master = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_MASTER)
            master_conn = mysql_lib.connect_mysql(master, role='dbascript')
            cursor = master_conn.cursor()
            cursor.execute(extend_sql, {'lock_identifier': lock_identifier})
            master_conn.commit()
            log.debug(cursor._executed)
            last_update = time.time()

        # Poll the stop event twice a second
        extend_lock_stop_event.wait(.5)
def csv_backup_success_logged(instance, date):
    """ Look for a log entry written by log_csv_backup_success

    The lookup runs against the master of the replica set the instance
    belongs to.

    Args:
    instance - A hostaddr object
    date - a string for the date

    Returns:
    True if already backed up, False otherwise (including when the log
    table does not exist)
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    log_table = environment_specific.CSV_BACKUP_LOG_TABLE
    table_present = mysql_lib.does_table_exist(master,
                                               mysql_lib.METADATA_DB,
                                               log_table)
    if not table_present:
        return False

    count_sql = ('SELECT COUNT(*) as "cnt" '
                 'FROM {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
                 'WHERE backup_date = %(date)s '
                 '').format(METADATA_DB=mysql_lib.METADATA_DB,
                            CSV_BACKUP_LOG_TABLE=log_table)
    cursor.execute(count_sql, {'date': date})
    return bool(cursor.fetchone()["cnt"])
def log_csv_backup_success(instance, date):
    """ Record that the CSV backup for a date is done

    The CSV backup check can be expensive, so the result is written to a
    table on the master. The table is created if it is missing.

    Args:
    instance - A hostaddr object
    date - a string for the date
    """
    metadata_db = mysql_lib.METADATA_DB
    log_table = environment_specific.CSV_BACKUP_LOG_TABLE

    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    if not mysql_lib.does_table_exist(master, metadata_db, log_table):
        print('Creating missing metadata table')
        create_sql = CSV_BACKUP_LOG_TABLE_DEFINITION.format(db=metadata_db,
                                                            tbl=log_table)
        cursor.execute(create_sql)

    insert_sql = ('INSERT IGNORE INTO {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
                  'SET backup_date = %(date)s, '
                  'completion = NOW()'
                  '').format(METADATA_DB=metadata_db,
                             CSV_BACKUP_LOG_TABLE=log_table)
    cursor.execute(insert_sql, {'date': date})
    conn.commit()
def csv_backup_success_logged(instance, date):
    """ Report whether log_csv_backup_success already logged this date

    The query is sent to the master of the instance's replica set.

    Args:
    instance - A hostaddr object
    date - a string for the date

    Returns:
    True if already backed up, False otherwise (a missing log table also
    yields False)
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    if mysql_lib.does_table_exist(master,
                                  mysql_lib.METADATA_DB,
                                  environment_specific.CSV_BACKUP_LOG_TABLE):
        query = ('SELECT COUNT(*) as "cnt" '
                 'FROM {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
                 'WHERE backup_date = %(date)s '
                 '')
        query = query.format(
            METADATA_DB=mysql_lib.METADATA_DB,
            CSV_BACKUP_LOG_TABLE=environment_specific.CSV_BACKUP_LOG_TABLE)
        cursor.execute(query, {'date': date})
        matches = cursor.fetchone()["cnt"]
        return True if matches else False

    # No log table means nothing has ever been logged
    return False
def extend_backup_lock(self, lock_identifier, extend_lock_stop_event):
    """ Periodically push out a backup lock's expiry. Meant to run in a thread

    Args:
    lock_identifier - Corresponds to a lock identifier row in the
                      CSV_BACKUP_LOCK_TABLE_NAME.
    extend_lock_stop_event - An event that will be used to inform this
                             thread to stop extending the lock
    """
    extend_sql = ('UPDATE {db}.{tbl} '
                  'SET expires = NOW() + INTERVAL {locks_held_time} '
                  'WHERE lock_identifier = %(lock_identifier)s'
                  '').format(db=mysql_lib.METADATA_DB,
                             tbl=CSV_BACKUP_LOCK_TABLE_NAME,
                             locks_held_time=LOCKS_HELD_TIME)

    # The lock is assumed to have been created right before this call
    last_update = time.time()
    while not extend_lock_stop_event.is_set():
        extension_due = (time.time() - last_update) > LOCK_EXTEND_FREQUENCY
        if extension_due:
            # The master is resolved again for every extension
            zk = host_utils.MysqlZookeeper()
            (replica_set, _) = self.instance.get_zk_replica_set()
            master = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_MASTER)
            master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
            cursor = master_conn.cursor()
            cursor.execute(extend_sql, {'lock_identifier': lock_identifier})
            master_conn.commit()
            log.debug(cursor._executed)
            last_update = time.time()

        # Check the stop event twice a second
        extend_lock_stop_event.wait(.5)
def get_db_size_from_log(instance, db):
    """ Get yesterdays db size for an instance

    Args:
    instance - A hostaddr object
    db - A database that exists on the instance

    Returns:
    size in MB

    Raises:
    Exception - if no size rows were logged yesterday for the db, or the
                number of logged tables differs from the number of tables
                (views excluded) currently in the db.
    """
    conn = mysql_lib.connect_mysql(instance, 'dbascript')
    cursor = conn.cursor()
    sql = ("SELECT SUM(size_mb) as 'mb', "
           " COUNT(1) as 'table_count' "
           "FROM {metadata_db}.{tbl} "
           "WHERE db = %(db)s "
           " AND reported_at=CURDATE() - INTERVAL 1 DAY "
           " AND hostname=%(hostname)s and port=%(port)s "
           "GROUP BY db;")
    params = {'hostname': instance.hostname,
              'port': instance.port,
              'db': db}
    cursor.execute(sql.format(metadata_db=mysql_lib.METADATA_DB,
                              tbl=TABLE_SIZE_TBL),
                   params)
    ret = cursor.fetchone()

    expected_tables = mysql_lib.get_tables(instance, db, skip_views=True)
    # With GROUP BY, a query that matches no rows returns an empty result
    # set, so fetchone() gives None. Treat that as missing size data
    # rather than failing with a TypeError on the subscript below.
    if ret is None or ret['table_count'] != len(expected_tables):
        raise Exception('Size data appears to be missing for {db} on {inst}'
                        ''.format(db=db, inst=instance))
    return ret['mb']
def ensure_backup_locks_sanity(self):
    """ Release any backup locks that aren't sane

    That means every lock row recorded for this instance's hostname and
    port. The instance level flock should make it safe to assume such locks
    are stale. The lock table is created on the master if it is missing.
    """
    lock_db = mysql_lib.METADATA_DB
    lock_tbl = CSV_BACKUP_LOCK_TABLE_NAME

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = self.instance.get_zk_replica_set()
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
    cursor = master_conn.cursor()

    if not mysql_lib.does_table_exist(master, lock_db, lock_tbl):
        log.debug('Creating missing metadata table')
        cursor.execute(CSV_BACKUP_LOCK_TABLE.format(db=lock_db, tbl=lock_tbl))

    release_sql = ('UPDATE {db}.{tbl} '
                   'SET lock_active = NULL, released = NOW() '
                   'WHERE hostname = %(hostname)s AND '
                   ' port = %(port)s'
                   '').format(db=lock_db, tbl=lock_tbl)
    cursor.execute(release_sql, {'hostname': self.instance.hostname,
                                 'port': self.instance.port})
    master_conn.commit()
def check_one_replica(slave_instance, db, tbl):
    """ Read the checksum results for one table from one replica

    Args:
    slave_instance - A hostaddr object for the replica to poll
    db - the database name to filter test.checksum on
    tbl - the table name to filter test.checksum on

    Returns:
    A tuple of (elapsed_time_ms, diff_count). Either value stays at -1 when
    its query returns no row.

    Raises:
    Exception - wrapping any error hit while connecting or querying
    """
    diff_count = -1
    elapsed_time_ms = -1
    params = {'db': db, 'tbl': tbl}

    try:
        conn = mysql_lib.connect_mysql(slave_instance, 'scriptro')
        # Close the connection even when one of the queries fails; without
        # this an error part way through left the connection open.
        try:
            cursor = conn.cursor()

            # first, count the diffs
            sql = ("SELECT COUNT(*) AS diffs FROM test.checksum "
                   "WHERE (master_cnt <> this_cnt "
                   "OR master_crc <> this_crc "
                   "OR ISNULL(master_crc) <> ISNULL(this_crc)) "
                   "AND (db=%(db)s AND tbl=%(tbl)s)")
            cursor.execute(sql, params)
            row = cursor.fetchone()
            if row is not None:
                diff_count = row['diffs']

            # second, sum up the elapsed time.
            sql = ("SELECT ROUND(SUM(chunk_time)*1000) AS time_ms "
                   "FROM test.checksum WHERE db=%(db)s AND tbl=%(tbl)s")
            cursor.execute(sql, params)
            row = cursor.fetchone()
            if row is not None:
                elapsed_time_ms = row['time_ms']
            cursor.close()
        finally:
            conn.close()
    except Exception as e:
        raise Exception("An error occurred polling the "
                        "replica: {e}".format(e=e))

    return elapsed_time_ms, diff_count
def log_binlog_upload(instance, binlog):
    """ Log to the master that a binlog has been uploaded

    The value stored as binlog_creation is taken from the st_atime of the
    binlog file.

    Args:
    instance - a hostAddr object
    binlog - the full path to the binlog file
    """
    zk = host_utils.MysqlZookeeper()
    binlog_stat = os.stat(binlog)
    binlog_creation = datetime.datetime.fromtimestamp(binlog_stat.st_atime)

    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    replace_sql = ("REPLACE INTO {metadata_db}.{tbl} "
                   "SET hostname = %(hostname)s, "
                   " port = %(port)s, "
                   " binlog = %(binlog)s, "
                   " binlog_creation = %(binlog_creation)s, "
                   " uploaded = NOW() ")
    replace_sql = replace_sql.format(
        metadata_db=mysql_lib.METADATA_DB,
        tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME)

    cursor.execute(replace_sql, {'hostname': instance.hostname,
                                 'port': str(instance.port),
                                 'binlog': os.path.basename(binlog),
                                 'binlog_creation': binlog_creation})
    conn.commit()
def log_csv_backup_success(instance, date):
    """ Note on the master that the CSV backup for a date has completed

    The CSV backup check can be expensive, so its success is logged. The
    log table is created first when it does not exist yet.

    Args:
    instance - A hostaddr object
    date - a string for the date
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    names = {'db': mysql_lib.METADATA_DB,
             'tbl': environment_specific.CSV_BACKUP_LOG_TABLE}

    table_present = mysql_lib.does_table_exist(master,
                                               names['db'],
                                               names['tbl'])
    if not table_present:
        print('Creating missing metadata table')
        cursor.execute(CSV_BACKUP_LOG_TABLE_DEFINITION.format(**names))

    statement = ('INSERT IGNORE INTO {METADATA_DB}.{CSV_BACKUP_LOG_TABLE} '
                 'SET backup_date = %(date)s, '
                 'completion = NOW()'
                 '').format(METADATA_DB=names['db'],
                            CSV_BACKUP_LOG_TABLE=names['tbl'])
    cursor.execute(statement, {'date': date})
    conn.commit()
def partition_lock_exists(self, table_tuple):
    """ Check for an active lock on this table held by a different host

    The lock table on the master is searched for active rows with the same
    table name and port but a different hostname. If such a lock exists we
    cannot backup that table here.

    Args:
    table_tuple - the tuple of table information; element 0 is used as the
                  table name.

    Returns:
    True if there is such a lock, False if not.
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(self.instance)
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='dbascript')
    cursor = master_conn.cursor()

    count_sql = ("SELECT COUNT(*) AS cnt FROM {db}.{tbl} WHERE "
                 "lock_active = %(active)s AND "
                 "table_name = %(table_name)s AND "
                 "hostname <> %(hostname)s AND "
                 "port = %(port)s").format(db=mysql_lib.METADATA_DB,
                                           tbl=CSV_BACKUP_LOCK_TABLE_NAME)
    cursor.execute(count_sql, {'table_name': table_tuple[0],
                               'hostname': self.instance.hostname,
                               'port': self.instance.port,
                               'active': ACTIVE})
    foreign_locks = int(cursor.fetchone()['cnt'])
    return foreign_locks > 0
def ensure_binlog_archiving_table_sanity(instance):
    """ Create binlog archiving log table if missing, purge old data

    Rows whose binlog_creation is older than S3_BINLOG_RETENTION + 1 days
    are deleted.

    Args:
    instance - A hostAddr object. Note: this function will find the master
               of the instance if the instance is not a master
    """
    metadata_db = mysql_lib.METADATA_DB
    archive_tbl = environment_specific.BINLOG_ARCHIVING_TABLE_NAME

    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    if not mysql_lib.does_table_exist(master, metadata_db, archive_tbl):
        log.debug('Creating missing metadata table')
        cursor.execute(BINLOG_ARCHIVING_TABLE.format(db=metadata_db,
                                                     tbl=archive_tbl))

    retention_days = environment_specific.S3_BINLOG_RETENTION + 1
    purge_sql = ("DELETE FROM {metadata_db}.{tbl} "
                 "WHERE binlog_creation < now() - INTERVAL {d} DAY"
                 "").format(metadata_db=metadata_db,
                            tbl=archive_tbl,
                            d=retention_days)
    log.info(purge_sql)
    cursor.execute(purge_sql)
    conn.commit()
def check_instance_table(hostaddr, table, desired_hash):
    """ Check that a table on a MySQL instance has the expected schema

    The CREATE TABLE statement of the table is hashed for every db returned
    by mysql_lib.get_dbs and compared with the desired hash.

    Args:
    hostaddr - object describing which mysql instance to connect to
    table - the name of the table to verify
    desired_hash - the md5sum of the desired CREATE TABLE for the table

    Returns:
    A dictionary with keys that are the hash of the CREATE TABLE statement
    and the values are sets of hostname:port followed by a space and then
    the db on which the incorrect schema was found.
    """
    mismatches = dict()
    conn = mysql_lib.connect_mysql(hostaddr)
    for db in mysql_lib.get_dbs(conn):
        create_stmt = mysql_lib.show_create_table(conn, db, table)
        found_hash = hashlib.md5(create_stmt).hexdigest()
        if found_hash == desired_hash:
            continue
        location = hostaddr.__str__() + ' ' + db
        mismatches.setdefault(found_hash, set()).add(location)
    return mismatches
def log_binlog_upload(instance, binlog):
    """ Record on the master that a binlog has been uploaded

    binlog_creation is derived from the file's st_atime.

    Args:
    instance - a hostAddr object
    binlog - the full path to the binlog file
    """
    zk = host_utils.MysqlZookeeper()
    binlog_creation = datetime.datetime.fromtimestamp(
        os.stat(binlog).st_atime)
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    row = dict(hostname=instance.hostname,
               port=str(instance.port),
               binlog=os.path.basename(binlog),
               binlog_creation=binlog_creation)
    template = ("REPLACE INTO {metadata_db}.{tbl} "
                "SET hostname = %(hostname)s, "
                " port = %(port)s, "
                " binlog = %(binlog)s, "
                " binlog_creation = %(binlog_creation)s, "
                " uploaded = NOW() ")
    statement = template.format(
        metadata_db=mysql_lib.METADATA_DB,
        tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME)
    cursor.execute(statement, row)
    conn.commit()
def ensure_backup_locks_sanity(self):
    """ Release any backup locks that aren't sane

    Every lock row stored for this instance's hostname and port is marked
    released. The instance level lock should make that assumption correct.
    The lock table is created on the master when it is missing.
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(self.instance)
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='dbascript')
    cursor = master_conn.cursor()

    table_present = mysql_lib.does_table_exist(master,
                                               mysql_lib.METADATA_DB,
                                               CSV_BACKUP_LOCK_TABLE_NAME)
    if not table_present:
        log.debug('Creating missing metadata table')
        create_sql = CSV_BACKUP_LOCK_TABLE.format(
            db=mysql_lib.METADATA_DB, tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(create_sql)

    own_locks = dict(hostname=self.instance.hostname,
                     port=self.instance.port)
    release_sql = ('UPDATE {db}.{tbl} '
                   'SET lock_active = NULL, released = NOW() '
                   'WHERE hostname = %(hostname)s AND '
                   ' port = %(port)s'
                   '').format(db=mysql_lib.METADATA_DB,
                              tbl=CSV_BACKUP_LOCK_TABLE_NAME)
    cursor.execute(release_sql, own_locks)
    master_conn.commit()
def get_logged_binlog_uploads(instance):
    """ Get all binlogs that have been logged as uploaded

    Args:
    instance - a hostAddr object to run against and check

    Returns:
    A set of binlog file names
    """
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    cursor = conn.cursor()

    select_sql = ("SELECT binlog "
                  "FROM {metadata_db}.{tbl} "
                  "WHERE hostname = %(hostname)s AND "
                  " port = %(port)s "
                  "").format(
                      metadata_db=mysql_lib.METADATA_DB,
                      tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME)
    params = {'hostname': instance.hostname,
              'port': str(instance.port)}
    cursor.execute(select_sql, params)

    return set(row['binlog'] for row in cursor.fetchall())
def ensure_binlog_archiving_table_sanity(instance):
    """ Make sure the binlog archiving log table exists and trim old rows

    Rows with a binlog_creation older than S3_BINLOG_RETENTION + 1 days are
    removed.

    Args:
    instance - A hostAddr object. Note: this function will find the master
               of the instance if the instance is not a master
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    master = zk.get_mysql_instance_from_replica_set(replica_set)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')
    cursor = conn.cursor()

    table_present = mysql_lib.does_table_exist(
        master,
        mysql_lib.METADATA_DB,
        environment_specific.BINLOG_ARCHIVING_TABLE_NAME)
    if not table_present:
        log.debug('Creating missing metadata table')
        create_sql = BINLOG_ARCHIVING_TABLE.format(
            db=mysql_lib.METADATA_DB,
            tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME)
        cursor.execute(create_sql)

    delete_template = ("DELETE FROM {metadata_db}.{tbl} "
                       "WHERE binlog_creation < now() - INTERVAL {d} DAY"
                       "")
    delete_sql = delete_template.format(
        metadata_db=mysql_lib.METADATA_DB,
        tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME,
        d=(environment_specific.S3_BINLOG_RETENTION + 1))
    log.info(delete_sql)
    cursor.execute(delete_sql)
    conn.commit()
def get_binlog_start(binlog_file, instance, username, password):
    """ Read the first event in a binlog so that its timestamp can be
    extracted. This should help skip over binlogs that can't possibly
    contain the GTID being searched for.

    Args:
    binlog_file: the binlog to examine
    instance: a hostaddr object
    username: the user to connect as
    password: the password to connect as

    Returns:
    A datetime parsed with BINLOG_DT_FORMAT from the mysqlbinlog output.
    An exception is raised if the positions can not be read or the output
    can not be parsed.
    """
    # Positions of the first event. Most likely these are always 4 and 120,
    # but that can differ between MySQL versions, so ask the server.
    start_pos = 4
    stop_pos = 120
    try:
        conn = mysql_lib.connect_mysql(instance)
        cursor = conn.cursor()
        cursor.execute('SHOW BINLOG EVENTS in %(binlog)s LIMIT 0,1',
                       {'binlog': binlog_file})
        first_event = cursor.fetchone()
        start_pos = first_event['Pos']
        stop_pos = first_event['End_log_pos']
    except Exception as e:
        log.error('Unable to retrieve binlog positions: {}'.format(e))
        raise

    # mysqlbinlog is run through a shell and its output filtered by egrep
    mysqlbinlog_cmd = ' '.join([
        '/usr/bin/mysqlbinlog',
        '--read-from-remote-server',
        '--host={}'.format(instance.hostname),
        '--user={}'.format(username),
        '--password={}'.format(password),
        '--start-position="{}"'.format(start_pos),
        '--stop-position="{}"'.format(stop_pos),
        binlog_file,
        '2>/dev/null',
    ])
    pipeline = [
        {'args': mysqlbinlog_cmd, 'shell': True},
        {'args': '/bin/egrep created', 'shell': True},
    ]
    results = pipe_wait(pipe_runner(pipeline))

    try:
        # The last two whitespace separated fields are the date and time
        (day_part, clock_part) = results.split()[-2:]
        return dt.datetime.strptime('{} {}'.format(day_part, clock_part),
                                    BINLOG_DT_FORMAT)
    except Exception as e:
        log.error("Invalid value/format for binlog create time: {}".format(e))
        raise
def rename_db_to_drop(instance, dbs, verbose=False, dry_run=False):
    """ Create a new empty db and move the contents of the original db there

    The process exits with status 1 if no orphans are detected, if any of
    the requested dbs is not an orphan of the instance, or if a table of
    the same name already exists in the destination db.

    Args:
    instance - a hostaddr object
    dbs - a set of database names
    verbose - bool, will direct sql to stdout
    dry_run - bool, will make no changes to
    """
    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    if not orphaned:
        print("Detected no orphans")
        sys.exit(1)

    unexpected = dbs.difference(orphaned[instance.__str__()])
    if unexpected:
        print("Cowardly refusing to act on the following dbs: " +
              ','.join(unexpected))
        sys.exit(1)

    # Finds tables that exist under the same name in both schemas
    blocker_sql = ("SELECT CONCAT(t2.TABLE_SCHEMA, \n"
                   " '.', t2.TABLE_NAME) as tbl \n"
                   "FROM information_schema.tables t1 \n"
                   "INNER JOIN information_schema.tables t2 \n"
                   " USING(TABLE_NAME) \n"
                   "WHERE t1.TABLE_SCHEMA = %(old_db)s AND \n"
                   " t2.TABLE_SCHEMA = %(new_db)s;")

    conn = mysql_lib.connect_mysql(instance)
    for db in dbs:
        renamed_db = DB_PREPEND + db

        # confirm that renames would not be blocked by an existing table
        cursor = conn.cursor()
        cursor.execute(blocker_sql, {'old_db': db, 'new_db': renamed_db})
        blockers = cursor.fetchall()
        if blockers:
            for blocker in blockers:
                print("Table rename blocked by {tbl}".format(
                    tbl=blocker['tbl']))
            sys.exit(1)

        # We should be safe to create the new db and rename
        if not dry_run:
            mysql_lib.create_db(conn, renamed_db)
        mysql_lib.move_db_contents(conn=conn,
                                   old_db=db,
                                   new_db=renamed_db,
                                   verbose=verbose,
                                   dry_run=dry_run)
def rename_db_to_drop(instance, dbs, verbose=False, dry_run=False):
    """ Create a new empty db and move the contents of the original db there

    Exits with status 1 when no orphans are detected, when a requested db
    is not an orphan of the instance, or when the destination db already
    holds a table with the same name.

    Args:
    instance - a hostaddr object
    dbs - a set of database names
    verbose - bool, will direct sql to stdout
    dry_run - bool, will make no changes to
    """
    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    if not orphaned:
        print("Detected no orphans")
        sys.exit(1)

    instance_orphans = orphaned[instance.__str__()]
    not_orphans = dbs.difference(instance_orphans)
    if not_orphans:
        print("Cowardly refusing to act on the following dbs: " +
              ','.join(not_orphans))
        sys.exit(1)

    # Lists tables present under the same name in the old and new schema
    conflict_sql = ("SELECT CONCAT(t2.TABLE_SCHEMA, \n"
                    " '.', t2.TABLE_NAME) as tbl \n"
                    "FROM information_schema.tables t1 \n"
                    "INNER JOIN information_schema.tables t2 \n"
                    " USING(TABLE_NAME) \n"
                    "WHERE t1.TABLE_SCHEMA = %(old_db)s AND \n"
                    " t2.TABLE_SCHEMA = %(new_db)s;")

    conn = mysql_lib.connect_mysql(instance)
    for db in dbs:
        renamed_db = DB_PREPEND + db

        # confirm that renames would not be blocked by an existing table
        cursor = conn.cursor()
        cursor.execute(conflict_sql, {'old_db': db, 'new_db': renamed_db})
        conflicts = cursor.fetchall()
        if conflicts:
            for conflict in conflicts:
                print("Table rename blocked by {tbl}".format(
                    tbl=conflict['tbl']))
            sys.exit(1)

        # We should be safe to create the new db and rename
        if not dry_run:
            mysql_lib.create_db(instance, renamed_db)
        mysql_lib.move_db_contents(instance,
                                   old_db=db,
                                   new_db=renamed_db,
                                   verbose=verbose,
                                   dry_run=dry_run)
def main():
    """ Command line entry point for the MySQL schema verifier

    The table definition is fetched from a seed instance/db and hashed.
    check_schema is asked for every db whose definition hashes differently,
    and a diff against one representative of each differing definition is
    printed. Exits 0 when everything matches, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description='MySQL schema verifier')
    parser.add_argument('instance_type',
                        help='Type of MySQL instance to verify',
                        choices=('sharddb', 'modsharddb'))
    parser.add_argument('table',
                        help='Table to check',)
    parser.add_argument('seed_instance',
                        help=('Which host from which to fetch a table '
                              ' definition. (format hostname[:port])'),)
    parser.add_argument('seed_db',
                        help=('Which db on --seed_instance from which to fetch'
                              ' a table definition. (ex pbdata012345)'))
    args = parser.parse_args()

    # argparse restricts instance_type to exactly these two choices
    zk_prefix = {'sharddb': SHARDDB_PREFIX,
                 'modsharddb': MODSHARDDB_PREFIX}[args.instance_type]

    seed_conn = mysql_lib.connect_mysql(
        host_utils.HostAddr(args.seed_instance))
    desired = mysql_lib.show_create_table(seed_conn, args.seed_db, args.table)
    desired_hash = hashlib.md5(desired).hexdigest()
    print("Desired table definition:\n{desired}".format(desired=desired))

    incorrect = check_schema(zk_prefix, args.table, desired_hash)
    if not len(incorrect):
        print("It appears that all schema is synced")
        sys.exit(0)

    differ = difflib.Differ()
    for _, locations in incorrect.iteritems():
        # Each location is "hostname:port db"; diff against the first one
        location_list = list(locations)
        host_part, db_part = location_list[0].split(' ')[:2]
        conn = mysql_lib.connect_mysql(host_utils.HostAddr(host_part))
        create = mysql_lib.show_create_table(conn, db_part, args.table)
        diff = differ.compare(desired.splitlines(), create.splitlines())
        print('The following difference has been found:')
        print('\n'.join(diff))
        print("It is present on the following db's:")
        print('\n'.join(location_list))
    sys.exit(1)
def main():
    """ Command line entry point for the MySQL shard cleanup utility

    Parses the instance, action, db list and flags, then dispatches to
    rename_db_to_drop, mysql_lib.move_db_contents (for revert_rename) or
    drop_db_after_rename.
    """
    action_desc = ("Action description rename - after checking no recent "
                   "changes and shard not in zk, create a db with the old "
                   "name appended to 'dropme_'. Then copy all tables to the "
                   "new db revert_rename - Copy all tables back from a "
                   "'dropme_' to their original table drop - This should be "
                   "run a few days after a rename. Drop the empty original "
                   "db, and drop the 'dropme_' db. ")

    parser = argparse.ArgumentParser(
        description='MySQL shard cleanup utility',
        epilog=action_desc,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i',
                        '--instance',
                        help='Instance to act on if other than localhost:3306',
                        default=socket.getfqdn() + ':3306')
    parser.add_argument('-a',
                        '--action',
                        choices=('rename', 'revert_rename', 'drop',),
                        required=True)
    parser.add_argument('-d',
                        '--dbs',
                        help=("Comma seperated list of db's to act upon"),
                        required=True)
    parser.add_argument('-r',
                        '--dry_run',
                        help=("Do not change any state"),
                        default=False,
                        action='store_true')
    parser.add_argument('-v',
                        '--verbose',
                        default=False,
                        action='store_true')
    args = parser.parse_args()

    dbs = set(args.dbs.split(','))
    instance = host_utils.HostAddr(args.instance)

    if args.action == 'rename':
        rename_db_to_drop(instance, dbs, args.verbose, args.dry_run)
        return

    if args.action == 'drop':
        drop_db_after_rename(instance, dbs, args.verbose, args.dry_run)
        return

    if args.action == 'revert_rename':
        # Move everything back out of the DB_PREPEND prefixed db
        conn = mysql_lib.connect_mysql(instance)
        for db in dbs:
            mysql_lib.move_db_contents(conn=conn,
                                       old_db=DB_PREPEND + db,
                                       new_db=db,
                                       verbose=args.verbose,
                                       dry_run=args.dry_run)
def create_table_size_table(instance):
    """ Create the table_size_historic table

    Executes TABLE_DEF for TABLE_SIZE_TBL in the metadata db, then closes
    the cursor and connection.

    Args:
    instance - a hostAddr object for the master of the replica set
    """
    create_sql = TABLE_DEF.format(db=mysql_lib.METADATA_DB,
                                  tbl=TABLE_SIZE_TBL)
    conn = mysql_lib.connect_mysql(instance, 'scriptrw')
    cursor = conn.cursor()
    cursor.execute(create_sql)
    cursor.close()
    conn.close()
def find_shard_mismatches(instance=False):
    """ Find shards that are missing or unexpected in modsharddb and sharddb

    The host to shard map from zookeeper is compared with the dbs actually
    present on each master.

    Args:
    instance - If supplied, only check this instance.

    Returns:
    orphaned - A dict of unexpected and (according to table statistics)
               unused shards. Key is master instance, value is a set.
    orphaned_but_used - A dict of unexpected but used shards (ROWS_CHANGED
                        is not 0). Data structure is the same as orphaned.
    missing - A dict of expected but missing shards. Data structure is the
              same as orphaned.
    """
    orphaned = dict()
    orphaned_but_used = dict()
    missing_shards = dict()

    zk = host_utils.MysqlZookeeper()
    host_shard_map = zk.get_host_shard_map()

    if instance:
        # Restrict the map to the single requested instance
        wanted = instance.__str__()
        host_shard_map = {wanted: host_shard_map[wanted]}

    for master, expected_shards in host_shard_map.items():
        conn = mysql_lib.connect_mysql(host_utils.HostAddr(master))
        activity = mysql_lib.get_dbs_activity(conn)
        actual_shards = mysql_lib.get_dbs(conn)

        missing = expected_shards.difference(actual_shards)
        if missing:
            missing_shards[master] = missing

        for db in actual_shards.difference(expected_shards):
            if activity[db]['ROWS_CHANGED'] == 0:
                bucket = orphaned
            else:
                bucket = orphaned_but_used
            bucket.setdefault(master, set()).add(db)

    return orphaned, orphaned_but_used, missing_shards
def find_shard_mismatches(instance=False):
    """ Find shards that are missing or unexpected in modsharddb and sharddb

    For every master in the zookeeper host to shard map, the expected
    shards are compared with the dbs found on that master.

    Args:
    instance - If supplied, only check this instance.

    Returns:
    orphaned - A dict of unexpected and (according to table statistics)
               unused shards. Key is master instance, value is a set.
    orphaned_but_used - A dict of unexpected but used shards (ROWS_CHANGED
                        is not 0). Data structure is the same as orphaned.
    missing - A dict of expected but missing shards. Data structure is the
              same as orphaned.
    """
    orphaned = dict()
    orphaned_but_used = dict()
    missing_shards = dict()

    zk = host_utils.MysqlZookeeper()
    host_shard_map = zk.get_host_shard_map()

    if instance:
        # Only the requested instance is of interest
        only_key = instance.__str__()
        narrowed = dict()
        narrowed[only_key] = host_shard_map[only_key]
        host_shard_map = narrowed

    for master in host_shard_map:
        expected_shards = host_shard_map[master]
        master_addr = host_utils.HostAddr(master)
        conn = mysql_lib.connect_mysql(master_addr)
        activity = mysql_lib.get_dbs_activity(conn)
        actual_shards = mysql_lib.get_dbs(conn)

        absent = expected_shards.difference(actual_shards)
        if absent:
            missing_shards[master] = absent

        extra_shards = actual_shards.difference(expected_shards)
        for db in extra_shards:
            in_use = activity[db]['ROWS_CHANGED'] != 0
            target = orphaned_but_used if in_use else orphaned
            if master not in target:
                target[master] = set()
            target[master].add(db)

    return orphaned, orphaned_but_used, missing_shards
def get_master_mysql_major_version(instance):
    """ Given an instance, determine the mysql major version for the master
    of the replica set.

    Args:
    instance - a hostaddr object

    Returns - A string similar to '5.5' or '5.6'
    """
    zk = host_utils.MysqlZookeeper()
    master = zk.get_mysql_instance_from_replica_set(
        instance.get_zk_replica_set()[0],
        repl_type=host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master)
    full_version = mysql_lib.get_global_variables(master_conn)['version']

    # Keep the first two dot separated components. Slicing the first three
    # characters only works while both components are a single digit; it
    # would turn '10.1.21' into '10.'.
    return '.'.join(full_version.split('.')[:2])
def release_expired_locks(self):
    """ Release any expired locks

    Every row in the lock table on the master whose expiry has passed gets
    lock_active cleared and released set to the current time.
    """
    release_sql = ('UPDATE {db}.{tbl} '
                   'SET lock_active = NULL, released = NOW() '
                   'WHERE expires < NOW()'
                   '').format(db=mysql_lib.METADATA_DB,
                              tbl=CSV_BACKUP_LOCK_TABLE_NAME)

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = self.instance.get_zk_replica_set()
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
    cursor = master_conn.cursor()
    cursor.execute(release_sql)
    master_conn.commit()
    log.debug(cursor._executed)
def purge_old_expired_locks(self):
    """ Delete any locks older than a week

    Only rows that expired more than one week ago and whose lock_active is
    not NULL are deleted from the lock table on the master.
    """
    purge_sql = ('DELETE FROM {db}.{tbl} '
                 'WHERE expires < NOW() - INTERVAL 1 WEEK AND '
                 ' lock_active is NOT NULL '
                 '').format(db=mysql_lib.METADATA_DB,
                            tbl=CSV_BACKUP_LOCK_TABLE_NAME)

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = self.instance.get_zk_replica_set()
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
    cursor = master_conn.cursor()
    cursor.execute(purge_sql)
    master_conn.commit()
    log.debug(cursor._executed)
def purge_old_expired_locks(self):
    """ Delete any locks older than 2 days

    Rows that expired more than two days ago are removed from the lock
    table on the master.
    """
    purge_sql = ('DELETE FROM {db}.{tbl} '
                 'WHERE expires < NOW() - INTERVAL 2 DAY'
                 '').format(db=mysql_lib.METADATA_DB,
                            tbl=CSV_BACKUP_LOCK_TABLE_NAME)

    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(self.instance)
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='dbascript')
    cursor = master_conn.cursor()
    cursor.execute(purge_sql)
    master_conn.commit()
    log.debug(cursor._executed)
def create_checksum_detail_table(instance):
    """ Create the checksum detail table on the given instance

    Args:
    instance: the master instance for this replica set

    Returns:
    Nothing. If this fails, throw an exception.

    Raises:
    Exception - wrapping whatever error occurred while connecting or
                while running the table definition
    """
    conn = None
    try:
        conn = mysql_lib.connect_mysql(instance, 'scriptrw')
        cursor = conn.cursor()
        cursor.execute(TABLE_DEF.format(db=mysql_lib.METADATA_DB,
                                        tbl=CHECKSUM_TBL))
        cursor.close()
    except Exception as e:
        raise Exception("Failed to create checksum detail "
                        "table: {e}".format(e=e))
    finally:
        # Always give the connection back, including when the DDL failed;
        # conn is still None when the connect itself failed.
        if conn is not None:
            conn.close()
def release_expired_locks(self):
    """ Mark every still-active lock whose expiry has passed as released

    The update is run against the master of this instance's replica set
    and committed immediately.
    """
    zk = host_utils.MysqlZookeeper()
    master = zk.get_mysql_instance_from_replica_set(
        zk.get_replica_set_from_instance(self.instance),
        host_utils.REPLICA_ROLE_MASTER)

    conn = mysql_lib.connect_mysql(master, role='dbascript')
    release_cursor = conn.cursor()

    release_template = ('UPDATE {db}.{tbl} '
                        'SET lock_active = NULL, released = NOW() '
                        'WHERE expires < NOW() AND lock_active IS NOT NULL'
                        '')
    release_cursor.execute(
        release_template.format(db=mysql_lib.METADATA_DB,
                                tbl=CSV_BACKUP_LOCK_TABLE_NAME))
    conn.commit()
    log.debug(release_cursor._executed)
def purge_old_expired_locks(self):
    """ Delete any locks older than a week

    Connects to the master of this instance's replica set and removes
    every lock row that expired more than one week ago.

    Releasing a lock sets lock_active to NULL, so the previous
    'lock_active is NOT NULL' filter skipped exactly the released rows and
    left them in the table forever. The filter is dropped so that all
    week-old rows are purged, released or not.
    """
    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = self.instance.get_zk_replica_set()
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
    cursor = master_conn.cursor()

    sql = ('DELETE FROM {db}.{tbl} '
           'WHERE expires < NOW() - INTERVAL 1 WEEK'
           '').format(db=mysql_lib.METADATA_DB,
                      tbl=CSV_BACKUP_LOCK_TABLE_NAME)
    cursor.execute(sql)
    master_conn.commit()
    log.debug(cursor._executed)
def drop_db_after_rename(instance, dbs, verbose, dry_run):
    """ Drop each original (empty) db together with its renamed copy

    Exits the process with status 1 if any requested db is not reported
    as orphaned for this instance, or if any original db still has tables.

    Args:
    instance - a hostaddr object
    dbs - a set of database names
    verbose - bool, will direct sql to stdout
    dry_run - bool, will make no changes to the instance
    """
    # only dbs that are orphaned on this instance may be dropped
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    unexpected = dbs.difference(orphaned[instance.__str__()])
    if unexpected:
        print("Cowardly refusing to act on the following dbs: " +
              ','.join(unexpected))
        sys.exit(1)

    # the original db must not contain any tables
    for candidate in dbs:
        if mysql_lib.get_tables(instance, candidate):
            print("Cowardly refusing to drop non-empty db:" + candidate)
            sys.exit(1)

    conn = mysql_lib.connect_mysql(instance)
    cursor = conn.cursor()
    drop_template = 'DROP DATABASE IF EXISTS `{db}`;'
    for candidate in dbs:
        # first the old empty db, then its DB_PREPEND prefixed rename
        for target in (candidate, ''.join((DB_PREPEND, candidate))):
            statement = drop_template.format(db=target)
            if verbose:
                print(statement)
            if not dry_run:
                cursor.execute(statement)
def log_table_sizes(port):
    """ Determine and record the size of tables on a MySQL instance

    The sizes of the local instance on the given port are written, one row
    per db/table/partition, to the table size table on the master of the
    instance's replica set. The table is created first if it is missing.

    Args:
    port - int (a string holding the port number is accepted as well)
    """
    # str.join only accepts strings, so the documented int port has to be
    # converted before the host:port address can be assembled.
    instance = host_utils.HostAddr(':'.join((host_utils.HOSTNAME,
                                             str(port))))
    zk = host_utils.MysqlZookeeper()

    replica_set = zk.get_replica_set_from_instance(instance)
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                      TABLE_SIZE_TBL):
        create_table_size_table(master)

    sizes = get_all_table_sizes(instance)
    conn = mysql_lib.connect_mysql(master, 'dbascript')

    # The statement text is the same for every row; only the bound
    # parameters change.
    sql = ('REPLACE INTO {metadata_db}.{tbl} '
           'SET '
           'hostname = %(hostname)s, '
           'port = %(port)s, '
           'db = %(db)s, '
           'table_name = %(table)s, '
           'partition_name = %(partition)s, '
           'reported_at = CURDATE(), '
           'size_mb = %(size)s ').format(metadata_db=mysql_lib.METADATA_DB,
                                         tbl=TABLE_SIZE_TBL)
    for db in sizes:
        for table in sizes[db]:
            for partition in sizes[db][table]:
                cursor = conn.cursor()
                cursor.execute(sql,
                               {'hostname': instance.hostname,
                                'port': instance.port,
                                'db': db,
                                'table': table,
                                'partition': partition,
                                'size': sizes[db][table][partition]})
                conn.commit()
                log.info(cursor._executed)
                cursor.close()
def drop_db_after_rename(instance, dbs, verbose, dry_run):
    """ Drop each original (empty) db together with its renamed copy

    Exits the process with status 1 if any requested db is not reported
    as orphaned for this instance, or if any original db still has tables.

    Args:
    instance - a hostaddr object
    dbs - a set of database names
    verbose - bool, will direct sql to stdout
    dry_run - bool, will make no changes to the instance
    """
    # only dbs that are orphaned on this instance may be dropped
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    unexpected = dbs.difference(orphaned[instance.__str__()])
    if unexpected:
        print("Cowardly refusing to act on the following dbs: " +
              ','.join(unexpected))
        sys.exit(1)

    conn = mysql_lib.connect_mysql(instance)
    cursor = conn.cursor()

    # the original db must not contain any tables
    for candidate in dbs:
        if mysql_lib.get_tables(conn, candidate):
            print("Cowardly refusing to drop non-empty db:" + candidate)
            sys.exit(1)

    drop_template = 'DROP DATABASE IF EXISTS `{db}`;'
    for candidate in dbs:
        # first the old empty db, then its DB_PREPEND prefixed rename
        for target in (candidate, ''.join((DB_PREPEND, candidate))):
            statement = drop_template.format(db=target)
            if verbose:
                print(statement)
            if not dry_run:
                cursor.execute(statement)
def mysql_backup_csv_tables(self):
    """ Worker for backing up a queue of tables

    Opens a read only consistent snapshot on the instance and then drains
    tables_to_retry and tables_to_backup, preferring the retry queue.
    A table whose dump fails is put on the retry queue. The worker gives
    up once more than MAX_THREAD_ERROR failures have been seen.
    """
    proc_id = multiprocessing.current_process().name
    conn = mysql_lib.connect_mysql(self.instance,
                                   backup.USER_ROLE_MYSQLDUMP)
    mysql_lib.start_consistent_snapshot(conn, read_only=True,
                                        session_id=self.session_id)
    pitr_data = mysql_lib.get_pitr_data(self.instance)
    err_count = 0
    while not (self.tables_to_backup.empty() and
               self.tables_to_retry.empty()):
        table_tuple = self.tables_to_retry.get() \
            if not self.tables_to_retry.empty() \
            else self.tables_to_backup.get()
        try:
            # if this is a partitioned table, and it is already
            # being backed up on some other host, we do not want to attempt
            # to back it up here.
            if table_tuple[1] and self.partition_lock_exists(table_tuple):
                log.debug('Partitioned table {} is already being '
                          'backed up elsewhere, so we cannot do it '
                          'here.'.format(table_tuple[0]))
            else:
                self.mysql_backup_csv_table_wrapper(table_tuple, conn,
                                                    pitr_data)

            self.table_count = self.table_count + 1
            if (self.table_count % 50) == 0:
                self.release_expired_locks()
        except Exception:
            # Only real errors are retried. KeyboardInterrupt and
            # SystemExit must be able to stop the worker instead of being
            # swallowed and turned into another retry.
            self.tables_to_retry.put(table_tuple)
            log.error('{proc_id}: Could not dump {tbl}, partition {p} - '
                      'error: {e}'.format(tbl=table_tuple[0],
                                          p=table_tuple[2],
                                          e=traceback.format_exc(),
                                          proc_id=proc_id))
            err_count = err_count + 1
            if err_count > MAX_THREAD_ERROR:
                log.error('{}: Error count in thread > MAX_THREAD_ERROR. '
                          'Aborting :('.format(proc_id))
                return
def log_table_sizes(port):
    """ Determine and record the size of tables on a MySQL instance

    The sizes of the local instance on the given port are written, one row
    per db/table/partition, to the table size table on the master of the
    instance's replica set. The table is created first if it is missing.

    Args:
    port - int (a string holding the port number is accepted as well)
    """
    # str.join only accepts strings, so the documented int port has to be
    # converted before the host:port address can be assembled.
    instance = host_utils.HostAddr(':'.join((host_utils.HOSTNAME,
                                             str(port))))
    zk = host_utils.MysqlZookeeper()

    replica_set = instance.get_zk_replica_set()[0]
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    if not mysql_lib.does_table_exist(master, mysql_lib.METADATA_DB,
                                      TABLE_SIZE_TBL):
        create_table_size_table(master)

    sizes = get_all_table_sizes(instance)
    conn = mysql_lib.connect_mysql(master, 'scriptrw')

    # The statement text is the same for every row; only the bound
    # parameters change.
    sql = ('REPLACE INTO {metadata_db}.{tbl} '
           'SET '
           'hostname = %(hostname)s, '
           'port = %(port)s, '
           'db = %(db)s, '
           'table_name = %(table)s, '
           'partition_name = %(partition)s, '
           'reported_at = CURDATE(), '
           'size_mb = %(size)s ').format(metadata_db=mysql_lib.METADATA_DB,
                                         tbl=TABLE_SIZE_TBL)
    for db in sizes:
        for table in sizes[db]:
            for partition in sizes[db][table]:
                cursor = conn.cursor()
                cursor.execute(sql,
                               {'hostname': instance.hostname,
                                'port': instance.port,
                                'db': db,
                                'table': table,
                                'partition': partition,
                                'size': sizes[db][table][partition]})
                conn.commit()
                log.info(cursor._executed)
                cursor.close()
def verify_blackhole_dbs(destination, non_mig_databases):
    """ Make sure the given dbs contain nothing but blackhole tables

    Args:
    destination - A hostaddr object
    non_mig_databases - A set of dbs to check

    Raises an Exception for the first db found with at least one table
    whose engine is not BLACKHOLE.
    """
    count_sql = ("SELECT COUNT(*) AS 'tbls' "
                 "FROM information_schema.tables "
                 "WHERE ENGINE !='BLACKHOLE'"
                 " AND TABLE_SCHEMA=%(db)s")
    conn = mysql_lib.connect_mysql(destination)
    cursor = conn.cursor()

    for db in non_mig_databases:
        cursor.execute(count_sql, {'db': db})
        row = cursor.fetchone()
        if not row['tbls']:
            continue
        raise Exception('Blackhole db {db} has non blackhole table on '
                        'instance {i}'
                        ''.format(db=db, i=destination))
def confirm_replica_topology(master, replicas):
    """ Check that every replica reports the expected master

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instance

    Raises an Exception for the first replica whose slave status points
    at a different master host/port.
    """
    for replica in replicas:
        slave_status = mysql_lib.get_slave_status(
            mysql_lib.connect_mysql(replica))
        reported_addr = ':'.join((slave_status['Master_Host'],
                                  str(slave_status['Master_Port'])))
        repl_master = host_utils.HostAddr(reported_addr)

        if repl_master != master:
            raise Exception('Slave {replica} is not a replica of master '
                            '{master}, but is instead a replica of '
                            '{repl_master}'.format(replica=replica,
                                                   repl_master=repl_master,
                                                   master=master))

        log.info('Replica {replica} is replicating from expected master '
                 'server {master}'.format(replica=replica,
                                          master=master))
def release_db_backup_lock(self, lock_identifier):
    """ Release a backup lock created by take_backup_lock

    Clears lock_active and stamps the release time on the lock row, on the
    master of this instance's replica set.

    Args:
    lock_identifier - a uuid to identify a lock row
    """
    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = self.instance.get_zk_replica_set()
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
    cursor = master_conn.cursor()

    params = {'lock_identifier': lock_identifier}
    # The assignments must be separated by a comma. With AND, MySQL parses
    # the clause as the single assignment
    # lock_active = (NULL AND released = NOW()), which never writes the
    # released column.
    sql = ('UPDATE {db}.{tbl} '
           'SET lock_active = NULL, released = NOW() '
           'WHERE lock_identifier = %(lock_identifier)s'
           '').format(db=mysql_lib.METADATA_DB,
                      tbl=CSV_BACKUP_LOCK_TABLE_NAME)
    cursor.execute(sql, params)
    master_conn.commit()
    log.debug(cursor._executed)
def write_checksum_status(instance, data):
    """ Record the result of one checksum run in test.checksum_detail

    Failures are logged and swallowed, so a reporting problem does not
    abort the caller.

    Args:
    instance: Host info for the master that we'll connect to.
    data: A dictionary containing the row to insert. See the table
          definition at the top of the script for info.

    Returns:
    Nothing
    """
    sql = ("INSERT INTO test.checksum_detail SET "
           "reported_at=NOW(), "
           "instance=%(instance)s, "
           "master_instance=%(master_instance)s, "
           "db=%(db)s, tbl=%(tbl)s, "
           "elapsed_time_ms=%(elapsed_time_ms)s, "
           "chunk_count=%(chunk_count)s, "
           "chunk_errors=%(chunk_errors)s, "
           "chunk_diffs=%(chunk_diffs)s, "
           "chunk_skips=%(chunk_skips)s, "
           "row_count=%(row_count)s, "
           "row_diffs=%(row_diffs)s, "
           "rows_checked=%(rows_checked)s, "
           "checksum_status=%(checksum_status)s, "
           "checksum_cmd=%(checksum_cmd)s, "
           "checksum_stdout=%(checksum_stdout)s, "
           "checksum_stderr=%(checksum_stderr)s, "
           "checksum_rc=%(checksum_rc)s, "
           "sync_cmd=%(sync_cmd)s, "
           "sync_stdout=%(sync_stdout)s, "
           "sync_stderr=%(sync_stderr)s, "
           "sync_rc=%(sync_rc)s")

    # conn stays None when the connect itself fails, so the cleanup below
    # cannot trip over an unbound name and hide the logged error.
    conn = None
    try:
        conn = mysql_lib.connect_mysql(instance, 'scriptrw')
        cursor = conn.cursor()
        cursor.execute(sql, data)
        # commit only after a successful insert
        conn.commit()
    except Exception as e:
        log.error("Unable to write to the database: {e}".format(e=e))
    finally:
        if conn is not None:
            conn.close()
def confirm_replica_topology(master, replicas):
    """ Check that every replica reports the expected master

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instance

    Raises an Exception for the first replica whose slave status points
    at a different master host/port.
    """
    for replica in replicas:
        slave_status = mysql_lib.get_slave_status(
            mysql_lib.connect_mysql(replica))
        reported_addr = ':'.join((slave_status['Master_Host'],
                                  str(slave_status['Master_Port'])))
        repl_master = host_utils.HostAddr(reported_addr)

        if repl_master != master:
            raise Exception('Slave {replica} is not a replica of master '
                            '{master}, but is instead a replica of '
                            '{repl_master}'.format(replica=replica,
                                                   repl_master=repl_master,
                                                   master=master))

        log.info('Replica {replica} is replicating from expected master '
                 'server {master}'.format(replica=replica,
                                          master=master))
def release_db_backup_lock(self, lock_identifier):
    """ Release a backup lock created by take_backup_lock

    Clears lock_active and stamps the release time on the lock row, on the
    master of this instance's replica set.

    Args:
    lock_identifier - a uuid to identify a lock row
    """
    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(self.instance)
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
    cursor = master_conn.cursor()

    params = {'lock_identifier': lock_identifier}
    # The assignments must be separated by a comma. With AND, MySQL parses
    # the clause as the single assignment
    # lock_active = (NULL AND released = NOW()), which never writes the
    # released column.
    sql = ('UPDATE {db}.{tbl} '
           'SET lock_active = NULL, released = NOW() '
           'WHERE lock_identifier = %(lock_identifier)s'
           '').format(db=mysql_lib.METADATA_DB,
                      tbl=CSV_BACKUP_LOCK_TABLE_NAME)
    cursor.execute(sql, params)
    master_conn.commit()
    log.debug(cursor._executed)
def get_master_mysql_major_version(instance):
    """ Determine the mysql major version run by the master of a replica set

    Args:
    instance - a hostaddr object belonging to the replica set

    Returns - The first three characters of the master's 'version' global
              variable, a string similar to '5.5' or '5.6'

    Raises an Exception if the connection to the master fails with an
    OperationalError.
    """
    zk = host_utils.MysqlZookeeper()
    replica_set = instance.get_zk_replica_set()[0]
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, repl_type=host_utils.REPLICA_ROLE_MASTER)

    try:
        master_conn = mysql_lib.connect_mysql(master)
    except _mysql_exceptions.OperationalError:
        message = ('Could not connect to master server {instance} in '
                   'order to determine MySQL version to launch with. '
                   'Perhaps run this script from there? This is likely '
                   'due to firewall rules.'
                   ''.format(instance=instance.hostname))
        raise Exception(message)

    master_globals = mysql_lib.get_global_variables(master_conn)
    return master_globals['version'][:3]
def mysql_backup_csv_dbs(self):
    """ Worker for backing up a queue of dbs

    Opens a read only consistent snapshot on the instance and then drains
    dbs_to_backup. A db whose dump fails is put back on the queue. The
    worker gives up once more than MAX_THREAD_ERROR failures have been
    seen.
    """
    proc_id = multiprocessing.current_process().name
    conn = mysql_lib.connect_mysql(self.instance,
                                   backup.USER_ROLE_MYSQLDUMP)
    mysql_lib.start_consistent_snapshot(conn, read_only=True)
    pitr_data = mysql_lib.get_pitr_data(self.instance)
    err_count = 0
    while not self.dbs_to_backup.empty():
        db = self.dbs_to_backup.get()
        try:
            self.mysql_backup_csv_db(db, conn, pitr_data)
        except Exception:
            # Only real errors are retried. KeyboardInterrupt and
            # SystemExit must be able to stop the worker instead of being
            # swallowed and turned into another retry.
            self.dbs_to_backup.put(db)
            log.error('{proc_id}: Could not dump {db}, '
                      'error: {e}'.format(db=db,
                                          e=traceback.format_exc(),
                                          proc_id=proc_id))
            err_count = err_count + 1
            if err_count > MAX_THREAD_ERROR:
                log.error('{proc_id}: Error count in thread > MAX_THREAD_ERROR. '
                          'Aborting :('.format(proc_id=proc_id))
                return
def sanity_check_replica(instance):
    """ Verify that a replica is replicating and is reasonably caught up

    Args:
    instance - A hostaddr object

    Returns:
    A hostaddr object of master of the instance argument

    Raises an Exception if the instance does not report a master status,
    if either replication thread is not running, or if the heartbeat or
    IO lag exceeds the limits defined in mysql_lib.
    """
    # Without a master status the instance can not write replication logs
    conn = mysql_lib.connect_mysql(instance)
    try:
        mysql_lib.get_master_status(conn)
    except mysql_lib.ReplicationError:
        raise Exception('{instance} is not setup to write replicaiton '
                        'logs!'.format(instance=instance))

    replication = mysql_lib.calc_slave_lag(instance)

    sql_thread = replication['ss']['Slave_SQL_Running']
    if sql_thread != 'Yes':
        raise Exception('SQL thread is not running on {instance}'
                        ''.format(instance=instance))

    io_thread = replication['ss']['Slave_IO_Running']
    if io_thread != 'Yes':
        raise Exception('IO thread is not running on {instance}'
                        ''.format(instance=instance))

    heartbeat_lag = replication['sbm']
    if heartbeat_lag > mysql_lib.MAX_HEARTBEAT_LAG:
        raise Exception('Heartbeat lag {sbm} > {max_lag} seconds'
                        ''.format(sbm=heartbeat_lag,
                                  max_lag=mysql_lib.MAX_HEARTBEAT_LAG))

    io_lag = replication['io_bytes']
    if io_lag > mysql_lib.MAX_IO_LAG:
        raise Exception('IO lag {io_bytes} > {max_io} bytes'
                        ''.format(io_bytes=io_lag,
                                  max_io=mysql_lib.MAX_IO_LAG))

    master_addr = ':'.join((replication['ss']['Master_Host'],
                            str(replication['ss']['Master_Port'])))
    return host_utils.HostAddr(master_addr)
def get_logged_binlog_uploads(instance):
    """ Fetch the names of all binlogs recorded as uploaded for an instance

    Args:
    instance - a hostAddr object to run against and check

    Returns:
    A set of binlog file names
    """
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    cursor = conn.cursor()

    query = ("SELECT binlog "
             "FROM {metadata_db}.{tbl} "
             "WHERE hostname = %(hostname)s AND "
             " port = %(port)s "
             "".format(metadata_db=mysql_lib.METADATA_DB,
                       tbl=environment_specific.BINLOG_ARCHIVING_TABLE_NAME))
    query_params = {'hostname': instance.hostname,
                    'port': str(instance.port)}
    cursor.execute(query, query_params)

    return {row['binlog'] for row in cursor.fetchall()}
def take_backup_lock(self, db):
    """ Write a lock row on to the master

    Args:
    db - the db to be backed up

    Returns:
    a uuid lock identifier, or None if the insert was rejected with an
    IntegrityError (another backup already holds the lock for this db)
    """
    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = self.instance.get_zk_replica_set()
    master = zk.get_mysql_instance_from_replica_set(
        replica_set, host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master, role='scriptrw')
    cursor = master_conn.cursor()

    lock_identifier = str(uuid.uuid4())
    log.debug('Taking backup lock: {replica_set} {db} '
              ''.format(replica_set=replica_set,
                        db=db))
    params = {'lock': lock_identifier,
              'db': db,
              'hostname': self.instance.hostname,
              'port': self.instance.port,
              'active': ACTIVE}
    sql = ("INSERT INTO {db}.{tbl} "
           "SET "
           "lock_identifier = %(lock)s, "
           "lock_active = %(active)s, "
           "created_at = NOW(), "
           "expires = NOW() + INTERVAL {locks_held_time}, "
           "released = NULL, "
           "db = %(db)s,"
           "hostname = %(hostname)s,"
           "port = %(port)s"
           "").format(db=mysql_lib.METADATA_DB,
                      tbl=CSV_BACKUP_LOCK_TABLE_NAME,
                      locks_held_time=LOCKS_HELD_TIME)
    try:
        cursor.execute(sql, params)
        master_conn.commit()
    except _mysql_exceptions.IntegrityError:
        lock_identifier = None
        sql = ("SELECT hostname, port, expires "
               "FROM {db}.{tbl} "
               "WHERE "
               " lock_active = %(active)s AND "
               " db = %(db)s"
               "").format(db=mysql_lib.METADATA_DB,
                          tbl=CSV_BACKUP_LOCK_TABLE_NAME)
        cursor.execute(sql,
                       {'db': db, 'active': ACTIVE})
        ret = cursor.fetchone()
        if ret is None:
            # The competing lock may have been released between our
            # failed insert and this lookup; there is no holder to report.
            log.debug('DB {db} could not be locked, but no active lock '
                      'row was found.'.format(db=db))
        else:
            log.debug('DB {db} is already being backed up on {hostname}:{port}, '
                      'lock will expire at {expires}.'
                      ''.format(db=db,
                                hostname=ret['hostname'],
                                port=ret['port'],
                                expires=str(ret['expires'])))

    log.debug(cursor._executed)
    return lock_identifier
def main():
    """ Command line entry point for the MySQL checksum wrapper

    Parses the command line, confirms the target instance is a master in
    zk, makes sure the checksum detail table exists (creating it unless -C
    was given), confirms replication is running on every slave and then
    checksums the selected databases one table at a time. For every
    parsed checksum result one row per slave is recorded via
    write_checksum_status.

    Exits with status 0 when the replica set has no slaves.

    Raises:
    Exception - if the instance is not a master in zk, if the checksum
                table is missing and may not be created, or if replication
                is not running on one of the slaves
    """
    description = ("MySQL checksum wrapper\n\n"
                   "Wrapper of pt-table-checksum and pt-table-sync.\n"
                   "Defaults to checksumming 1/{k}th of databases on instance.\n"
                   "If diffs are found, use pt-table-sync to measure actual "
                   "divergence,\nbut only if the number of diffs is between "
                   "--min_diffs and --max_diffs.").format(k=DB_CHECK_FRACTION)

    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i',
                        '--instance',
                        help='Instance to act on if other than localhost:3306',
                        default=''.join((socket.getfqdn(), ':3306')))
    parser.add_argument('-a',
                        '--all',
                        help='Checksums all dbs rather than the default',
                        action='store_true',
                        default=False)
    parser.add_argument('-d',
                        '--dbs',
                        help=("Comma separated list of db's to check rather "
                              "than the default"),
                        default=False)
    parser.add_argument('-q',
                        '--quiet',
                        help=("Do not print output to stdout"),
                        action='store_true',
                        default=False)
    parser.add_argument('-m',
                        '--min_diffs',
                        help=("Do per-row check if chunk diff count is at "
                              "least this value"),
                        dest='min_diffs',
                        default=MIN_DIFFS)
    parser.add_argument('-M',
                        '--max_diffs',
                        help=("Do not do per-row check if chunk diff count "
                              "is greater than this value"),
                        dest='max_diffs',
                        default=MAX_DIFFS)
    parser.add_argument('-C',
                        '--no_create_table',
                        help=("If test.checksum_detail is missing, do "
                              "not try to create it."),
                        dest='create_table',
                        action='store_false',
                        default=True)
    parser.add_argument('-v',
                        '--verbose',
                        help=("Store raw output from PT tools in the DB?"),
                        action='store_true',
                        default=False)
    parser.add_argument('-c',
                        '--check_fraction',
                        help=('Check this fraction of databases.'),
                        default=DB_CHECK_FRACTION)

    args = parser.parse_args()
    instance = host_utils.HostAddr(args.instance)
    zk = host_utils.MysqlZookeeper()

    if instance not in \
            zk.get_all_mysql_instances_by_type(host_utils.REPLICA_ROLE_MASTER):
        raise Exception("Instance is not a master in ZK")

    # If enabled, try to create the table that holds the checksum info.
    # If not enabled, make sure that the table exists.
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    if not mysql_lib.does_table_exist(conn, mysql_lib.METADATA_DB,
                                      CHECKSUM_TBL):
        if args.create_table:
            create_checksum_detail_table(instance)
        else:
            raise Exception("Checksum table not found. Unable to continue."
                            "Consider not using the -C option or create it "
                            "yourself.")

    # Determine what replica set we belong to and get a list of slaves.
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    slaves = set()
    for rtype in host_utils.REPLICA_ROLE_SLAVE, host_utils.REPLICA_ROLE_DR_SLAVE:
        s = zk.get_mysql_instance_from_replica_set(replica_set, rtype)
        if s:
            slaves.add(s)

    if len(slaves) == 0:
        log.info("This server has no slaves. Nothing to do.")
        sys.exit(0)

    # before we even start this, make sure replication is OK.
    for slave in slaves:
        slave_conn = mysql_lib.connect_mysql(slave, 'scriptrw')
        ss = mysql_lib.get_slave_status(slave_conn)
        if ss['Slave_SQL_Running'] != "Yes" or ss['Slave_IO_Running'] != "Yes":
            # every placeholder has to be supplied, otherwise format()
            # raises a KeyError that hides the real problem
            raise Exception("Replication is NOT RUNNING on slave {s}: "
                            "SQL: {st} | IO: {it}".format(s=slave,
                                                          st=ss['Slave_SQL_Running'],
                                                          it=ss['Slave_IO_Running']))

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(conn)

        if args.all:
            db_to_check = dbs
        else:
            # default behaviour, check a given DB every N days based on
            # day of year. minimizes month-boundary issues.
            db_to_check = set()
            fraction = int(args.check_fraction)
            check_modulus = int(time.strftime("%j")) % fraction
            for counter, db in enumerate(dbs):
                if counter % fraction == check_modulus:
                    db_to_check.add(db)

    # The bootstrap connection is not used again; each db below gets its
    # own connection, so release this one instead of silently dropping it.
    conn.close()

    # Iterate through the list of DBs and check one table at a time.
    # We do it this way to ensure more coverage in case pt-table-checksum
    # loses its DB connection and errors out before completing a full scan
    # of a given database.
    for db in db_to_check:
        conn = mysql_lib.connect_mysql(instance, 'scriptro')
        tables_to_check = mysql_lib.get_tables(conn, db, skip_views=True)
        for tbl in tables_to_check:
            c_cmd, c_out, c_err, c_ret = checksum_tbl(instance, db, tbl)
            if not args.quiet:
                log.info("Checksum command executed was:\n{cmd}".format(cmd=c_cmd))
                log.info("Standard out:\n{out}".format(out=c_out))
                log.info("Standard error:\n{err}".format(err=c_err))
                log.info("Return code: {ret}".format(ret=c_ret))

            # parse each line of STDOUT (there should only be one with
            # actual data). We only care about errors, rows, chunks, and
            # skipped, since we'll need to figure out diffs separately for
            # each slave box.
            for line in c_out.split("\n"):
                results = parse_checksum_row(line)
                if results:
                    chunk_errors = int(results[1])
                    row_count = int(results[3])
                    chunk_count = int(results[4])
                    chunk_skips = int(results[5])

                    for slave in slaves:
                        rows_checked = 'NO'
                        sync_cmd = ""
                        sync_out = ""
                        sync_err = ""
                        sync_ret = -1
                        row_diffs = 0

                        elapsed_time_ms, \
                            chunk_diffs = check_one_replica(slave, db, tbl)

                        # if we skipped some chunks or there were errors,
                        # this means we can't have complete information
                        # about the state of the replica. in the case of a
                        # hard error, we'll just stop. in the case of a
                        # skipped chunk, we will treat it as a different
                        # chunk for purposes of deciding whether or not to
                        # do a more detailed analysis.
                        checkable_chunks = chunk_skips + chunk_diffs

                        if chunk_errors > 0:
                            checksum_status = 'ERRORS_IN_CHECKSUM_PROCESS'
                        elif checkable_chunks == 0:
                            checksum_status = 'GOOD'
                        else:
                            if checkable_chunks > int(args.max_diffs):
                                # too many chunk diffs, don't bother
                                # checking further. not good.
                                checksum_status = 'TOO_MANY_CHUNK_DIFFS'
                            elif checkable_chunks < int(args.min_diffs):
                                # some diffs, but not enough that we care.
                                checksum_status = 'CHUNK_DIFFS_FOUND_BUT_OK'
                            else:
                                start_time = int(time.time()*1000)
                                rows_checked = 'YES'

                                # set the proper status - did we do a
                                # sync-based check because of explicit
                                # diffs or because of skipped chunks?
                                if chunk_diffs > 0:
                                    checksum_status = 'ROW_DIFFS_FOUND'
                                else:
                                    checksum_status = 'CHUNKS_WERE_SKIPPED'

                                sync_cmd, sync_out, sync_err, sync_ret, \
                                    row_diffs = checksum_tbl_via_sync(slave,
                                                                      db,
                                                                      tbl)

                                # Add in the time it took to do the sync.
                                elapsed_time_ms += int(time.time()*1000) - start_time

                                if not args.quiet:
                                    log.info("Sync command executed was:\n{cmd} ".format(cmd=sync_cmd))
                                    log.info("Standard out:\n {out}".format(out=sync_out))
                                    log.info("Standard error:\n {err}".format(err=sync_err))
                                    log.info("Return code: {ret}".format(ret=sync_ret))
                                    log.info("Row diffs found: {cnt}".format(cnt=row_diffs))

                        # Checksum process is complete, store the results.
                        data = {'instance': slave,
                                'master_instance': instance,
                                'db': db,
                                'tbl': tbl,
                                'elapsed_time_ms': elapsed_time_ms,
                                'chunk_count': chunk_count,
                                'chunk_errors': chunk_errors,
                                'chunk_diffs': chunk_diffs,
                                'chunk_skips': chunk_skips,
                                'row_count': row_count,
                                'row_diffs': row_diffs,
                                'rows_checked': rows_checked,
                                'checksum_status': checksum_status,
                                'checksum_cmd': None,
                                'checksum_stdout': None,
                                'checksum_stderr': None,
                                'checksum_rc': c_ret,
                                'sync_cmd': None,
                                'sync_stdout': None,
                                'sync_stderr': None,
                                'sync_rc': sync_ret}

                        # raw tool output is only stored on request
                        if args.verbose:
                            data.update({'checksum_cmd': c_cmd,
                                         'checksum_stdout': c_out,
                                         'checksum_stderr': c_err,
                                         'sync_cmd': sync_cmd,
                                         'sync_stdout': sync_out,
                                         'sync_stderr': sync_err,
                                         'sync_rc': sync_ret})

                        write_checksum_status(instance, data)

        # done with this db, release its connection before the next one
        conn.close()
def mysql_failover(master, dry_run, skip_lock, ignore_dr_slave,
                   trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    The function runs in two phases. Everything inside the "giant try" is
    sanity checking and preparation; any exception raised there triggers
    a rollback (read_only and replication settings on the old master are
    reverted, the promotion lock is released) and is then re-raised.
    Everything after the try is the point of no return: replication is
    repointed on the dr_slave, zk is updated, and the new master is made
    writable.

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existence of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Send a mysqladmin kill command to the old master

    Returns:
    new_master - The new master server
    NOTE(review): no return statement is present in this function, so it
    actually returns None - confirm what callers expect.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master,
                                                        rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there are any problems we roll back from the except
    try:
        # Initialized before anything can raise so that the rollback in
        # the except clause can always test it.
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(replica_set=replica_set,
                                                       repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(replica_set,
                                                              host_utils.REPLICA_ROLE_DR_SLAVE)
            log.info('DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))

        # replicas is the set of servers that must agree on the topology
        # and be caught up before the promotion may proceed
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # let's make sure that what we think is the master, actually is
        confirm_replica_topology(master, replicas)

        # We use master_conn as a mysql connection to the master server, if
        # it is False, the master is dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)
        slave_conn = mysql_lib.connect_mysql(slave)

        # Test to see if the slave is setup for replication. If not, we are
        # hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        try:
            mysql_lib.get_master_status(slave_conn)
        except mysql_lib.ReplicationError:
            log.error('New master {slave} is not setup to write replicaiton '
                      'logs!'.format(slave=slave))
            raise
        log.info('Slave/new master is setup to write replication logs')

        if kill_old_master:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            # from here on the old master is treated as dead
            master_conn = None

        # The acceptable replica lag differs depending on whether the old
        # master is still alive.
        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get caught in the giant try
            os._exit(0)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master_conn, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master_conn)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas, 0,
                                    timeout=MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
            # NOTE(review): the two literals below are joined without a
            # space, so the message reads "...({master})to new master...".
            log.info('Setting up replication from old master ({master})'
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave_conn)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming relpica servers in sync')
                confirm_max_replica_lag(replicas,
                                        MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                        replicas_synced=True,
                                        dead_master=dead_master)
    except:
        # Rollback: undo whatever was changed on the old master, release
        # the promotion lock and let the original exception propagate.
        log.info('Starting rollback')
        if master_conn:
            log.info('Releasing read_only on old master')
            mysql_lib.set_global_variable(master_conn, 'read_only', False)

            log.info('Clearing replication settings on old master')
            mysql_lib.reset_slave(master_conn)
        if lock_identifier:
            log.info('Releasing promotion lock')
            release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # Past this point there is no rollback; failures are logged or raised
    # without reverting the steps above.
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            # a broken dr_slave must not stop the promotion
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    # retry the zk write until it succeeds or the attempt counter exceeds
    # MAX_ZK_WRITE_ATTEMPTS, in which case the last error is re-raised
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt+1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave_conn, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave_conn)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    # master_conn is falsy when the old master was dead, killed, or
    # declared dead by the caller
    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)