def verify_unsharded_csv_backup(shard_type, date, instance):
    """ Verify that a non-sharded db has been backed up to hive

    Args:
    shard_type - In this case, a hostname prefix
    date - The date to search for
    instance - The actual instance to inspect for backups being done

    Returns True for no problems found, False otherwise.
    """
    if (date == (datetime.datetime.utcnow().date() -
                 datetime.timedelta(days=1)).strftime("%Y-%m-%d")):
        if datetime.datetime.utcnow().time() < CSV_STARTUP:
            print 'Backup startup time has not yet passed'
            # For today's date, we give CSV_STARTUP minutes before checking
            # anything.
            return True

        if datetime.datetime.utcnow().time() < CSV_COMPLETION_TIME:
            # For today's date, until after CSV_COMPLETION_TIME it is good
            # enough to check if backups are running. If they are running,
            # everything is ok. If they are not running, we will do all the
            # normal checks.
            if csv_backups_running(instance):
                print 'Backup running on {i}'.format(i=instance)
                return True

    return_status = True
    for db in mysql_lib.get_dbs(instance):
        (success, _) = verify_csv_schema_upload(
            shard_type, date, instance, db,
            mysql_backup_csv.PATH_DAILY_BACKUP_NONSHARDED_SCHEMA)
        if not success:
            return_status = False

    if not return_status:
        print 'missing schema file'
        # problem with schema, don't bother verifying data
        return return_status

    boto_conn = boto.connect_s3()
    bucket = boto_conn.get_bucket(environment_specific.S3_CSV_BUCKET,
                                  validate=False)
    missing_uploads = set()
    for db in mysql_lib.get_dbs(instance):
        for table in mysql_lib.get_tables(instance, db, skip_views=True):
            key = mysql_backup_csv.PATH_DAILY_BACKUP.format(
                table=table, hostname_prefix=shard_type, date=date,
                db_name=db)
            if not bucket.get_key(key):
                missing_uploads.add(key)

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print 'Missing uploads: {uploads}'.format(uploads=missing_uploads)
        else:
            print 'Missing {num} uploads'.format(num=len(missing_uploads))
        return False
    else:
        return True
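# Usage sketch (illustrative, not part of the original source): check
# yesterday's non-sharded backup for a hypothetical 'exampledb' host
# prefix. The host address and prefix are assumptions; the date follows
# the "-1 day" s3 convention used by the function above.
def example_check_yesterday():
    yesterday = (datetime.datetime.utcnow().date() -
                 datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    instance = host_utils.HostAddr('exampledb-001:3306')  # hypothetical host
    if not verify_unsharded_csv_backup('exampledb', yesterday, instance):
        print 'Backup verification failed for {}'.format(instance)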
def __init__(self, instance, db=None, force_table=None, force_reupload=False):
    """ Init function for backup, takes all args

    Args:
    instance - A hostAddr object of the instance to be backed up
    db - (optional) backup only specified db
    force_table - (optional) backup only specified table
    force_reupload - (optional) force reupload of backup
    """
    self.instance = instance
    self.timestamp = datetime.datetime.utcnow()
    # datestamp is for s3 files which are by convention -1 day
    self.datestamp = (self.timestamp -
                      datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    self.dbs_to_backup = multiprocessing.Queue()
    if db:
        self.dbs_to_backup.put(db)
    else:
        for db in mysql_lib.get_dbs(self.instance):
            self.dbs_to_backup.put(db)

    self.force_table = force_table
    self.force_reupload = force_reupload
def collectTableStats(db):
    """ Collect table stats

    Args:
    db - a db object
    """
    # First we are going to pull table stats aggregated by table
    # and namespace, if applicable
    global collection_time, last_collection_time
    instance = host_utils.HostAddr(':'.join((socket.gethostname(),
                                             db.port)))
    namespace_dbs_map = dict()
    non_namespace_dbs = set()
    for schema in mysql_lib.get_dbs(instance):
        namespace = get_namespace_from_schema(schema)
        if namespace:
            if namespace not in namespace_dbs_map:
                namespace_dbs_map[namespace] = set()
            namespace_dbs_map[namespace].add(schema)
        else:
            non_namespace_dbs.add(schema)

    for namespace in namespace_dbs_map:
        for row in get_tablestats(db, namespace_dbs_map[namespace]):
            printmetrics_tablestat(db, row, namespace)
    if non_namespace_dbs:
        for row in get_tablestats(db, non_namespace_dbs):
            printmetrics_tablestat(db, row)

    # next we want schema stats aggregated by schema and namespace.
    for namespace in namespace_dbs_map:
        for row in get_schemastats(db, namespace_dbs_map[namespace]):
            printmetrics_schemastats(db, row, namespace)
    if non_namespace_dbs:
        for row in get_schemastats(db, non_namespace_dbs):
            printmetrics_schemastats(db, row)

    db.query("FLUSH NO_WRITE_TO_BINLOG TABLE_STATISTICS")
def __init__(self, instance, db=None, force_table=None, force_reupload=False,
             dev_bucket=False):
    """ Init function for backup, takes all args

    Args:
    instance - A hostAddr object of the instance to be backed up
    db - (optional) backup only specified db
    force_table - (optional) backup only specified table
    force_reupload - (optional) force reupload of backup
    dev_bucket - (optional) upload to the dev bucket rather than the
                 standard one
    """
    self.instance = instance
    self.timestamp = datetime.datetime.utcnow()
    # datestamp is for s3 files which are by convention -1 day
    self.datestamp = (self.timestamp -
                      datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    self.dbs_to_backup = multiprocessing.Queue()
    if db:
        self.dbs_to_backup.put(db)
    else:
        for db in mysql_lib.get_dbs(self.instance):
            self.dbs_to_backup.put(db)

    self.force_table = force_table
    self.force_reupload = force_reupload
    if dev_bucket:
        self.upload_bucket = environment_specific.S3_CSV_BUCKET_DEV
    else:
        self.upload_bucket = environment_specific.S3_CSV_BUCKET
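# Illustrative construction of the csv backup class above. The
# mysql_backup_csv.mysql_backup_csv name is taken from call sites
# elsewhere in this section; the host address and db name are made-up
# examples. Forcing a reupload of one db to the dev bucket might look like:
def example_backup_one_db():
    instance = host_utils.HostAddr('exampledb-001:3306')  # hypothetical host
    bk = mysql_backup_csv.mysql_backup_csv(instance, db='important_db',
                                           force_reupload=True,
                                           dev_bucket=True)
    print 'Uploading to bucket {}'.format(bk.upload_bucket)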
def check_instance_table(hostaddr, table, desired_hash):
    """ Check that a table on a MySQL instance has the expected schema

    Args:
    hostaddr - object describing which mysql instance to connect to
    table - the name of the table to verify
    desired_hash - the md5sum of the desired CREATE TABLE for the table

    Returns:
    A dictionary whose keys are the hashes of the CREATE TABLE
    statements and whose values are sets of 'hostname:port db' strings
    identifying where the incorrect schema was found.
    """
    ret = dict()
    conn = mysql_lib.connect_mysql(hostaddr)
    for db in mysql_lib.get_dbs(conn):
        definition = mysql_lib.show_create_table(conn, db, table)
        tbl_hash = hashlib.md5(definition).hexdigest()
        if tbl_hash != desired_hash:
            if tbl_hash not in ret:
                ret[tbl_hash] = set()
            ret[tbl_hash].add(''.join((hostaddr.__str__(), ' ', db)))
    return ret
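# A minimal sketch of driving check_instance_table: hash the desired
# CREATE TABLE text and report any divergent schemas. The CREATE TABLE
# string and host address below are placeholders, not real schemas.
def example_schema_check():
    desired_create = 'CREATE TABLE `t` (`id` int(11) NOT NULL)'  # placeholder
    desired_hash = hashlib.md5(desired_create).hexdigest()
    hostaddr = host_utils.HostAddr('exampledb-001:3306')  # hypothetical host
    for bad_hash, locations in \
            check_instance_table(hostaddr, 't', desired_hash).iteritems():
        print 'Unexpected schema {h} on: {l}'.format(h=bad_hash,
                                                     l=', '.join(locations))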
def get_problem_replicasets(localpath, raw_mapping):
    problem_replica_sets = dict()
    zk = host_utils.MysqlZookeeper()
    replica_set_dbs = get_db_on_replica_set(localpath, raw_mapping)
    for replica_set in replica_set_dbs:
        master = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set,
            repl_type=host_utils.REPLICA_ROLE_MASTER)
        dbs = mysql_lib.get_dbs(master)
        missing = replica_set_dbs[replica_set].difference(dbs)
        if missing:
            problem_replica_sets[replica_set] = missing

    return problem_replica_sets
def verify_unsharded_csv_backup(shard_type, date, instance):
    """ Verify that a non-sharded db has been backed up to hive

    Args:
    shard_type - In this case, a hostname prefix
    date - The date to search for
    instance - The actual instance to inspect for backups being done

    Returns True for no problems found, False otherwise.
    """
    return_status = True
    boto_conn = boto.connect_s3()
    bucket = boto_conn.get_bucket(environment_specific.S3_CSV_BUCKET,
                                  validate=False)
    missing_uploads = set()
    for db in mysql_lib.get_dbs(instance):
        tables = mysql_backup_csv.mysql_backup_csv(
            instance).get_tables_to_backup(db)
        for table in tables:
            if not verify_csv_schema_upload(shard_type, date, instance, db,
                                            set([table])):
                return_status = False
                print 'Missing schema for {db}.{table}'.format(db=db,
                                                               table=table)
                continue

            (_, data_path, success_path) = \
                environment_specific.get_csv_backup_paths(
                    date, db, table, instance.replica_type,
                    instance.get_zk_replica_set()[0])
            if not bucket.get_key(data_path):
                missing_uploads.add(data_path)
            else:
                # we still need to create a success file for the data
                # team for this table, even if something else is AWOL
                # later in the backup.
                if bucket.get_key(success_path):
                    print 'Key already exists {key}'.format(key=success_path)
                else:
                    print 'Creating success key {key}'.format(
                        key=success_path)
                    key = bucket.new_key(success_path)
                    key.set_contents_from_string('')

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print 'Missing uploads: {uploads}'.format(uploads=missing_uploads)
        else:
            print 'Missing {num} uploads'.format(num=len(missing_uploads))
        return_status = False

    return return_status
def find_shard_mismatches(instance=False):
    """ Find shards that are missing or unexpected in a sharded dataset

    Args:
    instance - If supplied, only check this instance.

    Returns:
    orphaned - A dict of unexpected and (according to table statistics)
               unused dbs. Key is master instance, value is a set.
    orphaned_but_used - A dict of unexpected but used dbs. Data structure
                        is the same as orphaned.
    missing - A dict of expected but missing dbs. Data structure is the
              same as orphaned.
    """
    orphaned = dict()
    orphaned_but_used = dict()
    missing_dbs = dict()

    zk = host_utils.MysqlZookeeper()
    rs_dbs_map = zk.get_sharded_dbs_by_replica_set()

    if instance:
        rs = zk.get_replica_set_from_instance(instance)
        rs_dbs_map = {rs: rs_dbs_map[rs]}

    for rs in rs_dbs_map:
        # non-sharded replica sets
        if not len(rs_dbs_map[rs]):
            continue

        expected_dbs = rs_dbs_map[rs]
        instance = zk.get_mysql_instance_from_replica_set(rs)

        activity = mysql_lib.get_dbs_activity(instance)
        actual_dbs = mysql_lib.get_dbs(instance)
        unexpected_dbs = actual_dbs.difference(expected_dbs)
        missing = expected_dbs.difference(actual_dbs)
        if missing:
            missing_dbs[instance] = missing

        for db in unexpected_dbs:
            if activity[db]['ROWS_CHANGED'] != 0:
                if instance not in orphaned_but_used:
                    orphaned_but_used[instance] = set()
                orphaned_but_used[instance].add(db)
            else:
                if instance not in orphaned:
                    orphaned[instance] = set()
                orphaned[instance].add(db)

    return orphaned, orphaned_but_used, missing_dbs
def find_shard_mismatches(instance=False):
    """ Find shards that are missing or unexpected in modsharddb and sharddb

    Args:
    instance - If supplied, only check this instance.

    Returns:
    orphaned - A dict of unexpected and (according to table statistics)
               unused shards. Key is master instance, value is a set.
    orphaned_but_used - A dict of unexpected but used shards. Data
                        structure is the same as orphaned.
    missing - A dict of expected but missing shards. Data structure is
              the same as orphaned.
    """
    orphaned = dict()
    orphaned_but_used = dict()
    missing_shards = dict()

    zk = host_utils.MysqlZookeeper()
    host_shard_map = zk.get_host_shard_map()

    if instance:
        new_host_shard_map = dict()
        new_host_shard_map[instance.__str__()] = \
            host_shard_map[instance.__str__()]
        host_shard_map = new_host_shard_map

    for master in host_shard_map:
        expected_shards = host_shard_map[master]
        instance = host_utils.HostAddr(master)
        conn = mysql_lib.connect_mysql(instance)
        activity = mysql_lib.get_dbs_activity(conn)
        actual_shards = mysql_lib.get_dbs(conn)
        unexpected_shards = actual_shards.difference(expected_shards)
        missing = expected_shards.difference(actual_shards)
        if missing:
            missing_shards[master] = missing

        for db in unexpected_shards:
            if activity[db]['ROWS_CHANGED'] != 0:
                if master not in orphaned_but_used:
                    orphaned_but_used[master] = set()
                orphaned_but_used[master].add(db)
            else:
                if master not in orphaned:
                    orphaned[master] = set()
                orphaned[master].add(db)

    return orphaned, orphaned_but_used, missing_shards
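# A sketch of consuming the three return values above; purely
# illustrative. Orphaned-but-used shards are the dangerous case, since
# table statistics show writes against dbs that zk does not expect.
def example_report_mismatches():
    orphaned, orphaned_but_used, missing = find_shard_mismatches()
    for master in missing:
        print 'Missing on {m}: {dbs}'.format(m=master,
                                             dbs=', '.join(missing[master]))
    for master in orphaned:
        print 'Orphaned (unused) on {m}: {dbs}'.format(
            m=master, dbs=', '.join(orphaned[master]))
    for master in orphaned_but_used:
        print 'Orphaned but WRITTEN TO on {m}: {dbs}'.format(
            m=master, dbs=', '.join(orphaned_but_used[master]))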
def get_all_table_sizes(instance):
    """ Get size of all innodb tables
        NOTE: At this point tables should always be innodb
        NOTE2: file per table should always be on.

    Args:
    instance - A hostAddr object

    Returns:
    A nested dict of the form ret[db][table][partition] = size in MB
    """
    datadir = host_utils.get_cnf_setting('datadir', instance.port)
    ret = dict()
    for db in mysql_lib.get_dbs(instance):
        ret[db] = dict()
        db_dir = os.path.join(datadir, db)
        for table_path in glob.glob(''.join([db_dir, '/*',
                                             INNODB_EXTENSION])):
            (table, partition) = parse_table_file_name(table_path)
            if table not in ret[db]:
                ret[db][table] = dict()
            ret[db][table][partition] = \
                os.stat(table_path).st_size / 1024 / 1024

    return ret
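# An illustrative consumer of the nested dict returned above,
# aggregating partition sizes into a per-db total in MB.
def example_db_sizes(instance):
    for db, tables in get_all_table_sizes(instance).iteritems():
        total_mb = 0
        for table, partitions in tables.iteritems():
            total_mb += sum(partitions.values())
        print '{db}: {mb}MB'.format(db=db, mb=total_mb)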
def drop_db_after_rename(instance, dbs=None, dry_run=False):
    """ Drop the original empty db and a non-empty rename db

    Args:
    instance - a hostaddr object
    dbs - a set of database names
    dry_run - bool, will make no changes to the servers
    """
    if not dbs:
        dbs = set()
        for db in mysql_lib.get_dbs(instance):
            if db.startswith(DB_PREPEND):
                dbs.add(db[len(DB_PREPEND):])

    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    instance_orphans = orphaned[instance]
    unexpected = dbs.difference(instance_orphans)
    if unexpected:
        raise Exception('Cowardly refusing to act on the following '
                        'dbs: {}'.format(unexpected))

    # make sure the original db is empty
    for db in dbs:
        if mysql_lib.get_tables(instance, db):
            raise Exception('Cowardly refusing to drop non-empty '
                            'db: {}'.format(db))

    for db in dbs:
        renamed_db = ''.join((DB_PREPEND, db))
        if dry_run:
            log.info('dry_run is enabled, not dropping '
                     'dbs: {db} {renamed}'.format(db=db,
                                                  renamed=renamed_db))
        else:
            mysql_lib.drop_db(instance, db)
            mysql_lib.drop_db(instance, renamed_db)
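# Given the safety checks above, a dry run is the natural first step.
# A tiny sketch (hypothetical host); with dbs=None the function derives
# the candidate set from databases carrying the DB_PREPEND prefix and
# only logs what would be dropped.
def example_dry_run_drop():
    instance = host_utils.HostAddr('exampledb-001:3306')  # hypothetical host
    drop_db_after_rename(instance, dry_run=True)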
def main():
    description = ("MySQL checksum wrapper\n\n"
                   "Wrapper of pt-table-checksum and pt-table-sync.\n"
                   "Defaults to checksumming 1/{k}th of databases on "
                   "instance.\n"
                   "If diffs are found, use pt-table-sync to measure actual "
                   "divergence,\nbut only if the number of diffs is between "
                   "--min_diffs and --max_diffs.").format(k=DB_CHECK_FRACTION)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i', '--instance',
                        help='Instance to act on if other than localhost:3306',
                        default=''.join((socket.getfqdn(), ':3306')))
    parser.add_argument('-a', '--all',
                        help='Checksums all dbs rather than the default',
                        action='store_true',
                        default=False)
    parser.add_argument('-d', '--dbs',
                        help=("Comma separated list of db's to check rather "
                              "than the default"),
                        default=False)
    parser.add_argument('-q', '--quiet',
                        help="Do not print output to stdout",
                        action='store_true',
                        default=False)
    parser.add_argument('-m', '--min_diffs',
                        help=("Do per-row check if chunk diff count is at "
                              "least this value"),
                        dest='min_diffs',
                        default=MIN_DIFFS)
    parser.add_argument('-M', '--max_diffs',
                        help=("Do not do per-row check if chunk diff count "
                              "is greater than this value"),
                        dest='max_diffs',
                        default=MAX_DIFFS)
    parser.add_argument('-C', '--no_create_table',
                        help=("If test.checksum_detail is missing, do "
                              "not try to create it."),
                        dest='create_table',
                        action='store_false',
                        default=True)
    parser.add_argument('-v', '--verbose',
                        help="Store raw output from PT tools in the DB?",
                        action='store_true',
                        default=False)
    parser.add_argument('-c', '--check_fraction',
                        help='Check this fraction of databases.',
                        default=DB_CHECK_FRACTION)
    args = parser.parse_args()

    instance = host_utils.HostAddr(args.instance)
    zk = host_utils.MysqlZookeeper()

    if instance not in \
            zk.get_all_mysql_instances_by_type(host_utils.REPLICA_ROLE_MASTER):
        raise Exception("Instance is not a master in ZK")

    # If enabled, try to create the table that holds the checksum info.
    # If not enabled, make sure that the table exists.
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    if not mysql_lib.does_table_exist(conn, mysql_lib.METADATA_DB,
                                      CHECKSUM_TBL):
        if args.create_table:
            create_checksum_detail_table(instance)
        else:
            raise Exception("Checksum table not found. Unable to continue. "
                            "Consider not using the -C option or create it "
                            "yourself.")

    # Determine what replica set we belong to and get a list of slaves.
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    slaves = set()
    for rtype in (host_utils.REPLICA_ROLE_SLAVE,
                  host_utils.REPLICA_ROLE_DR_SLAVE):
        s = zk.get_mysql_instance_from_replica_set(replica_set, rtype)
        if s:
            slaves.add(s)

    if len(slaves) == 0:
        log.info("This server has no slaves. Nothing to do.")
        sys.exit(0)

    # before we even start this, make sure replication is OK.
    for slave in slaves:
        slave_conn = mysql_lib.connect_mysql(slave, 'scriptrw')
        ss = mysql_lib.get_slave_status(slave_conn)
        if ss['Slave_SQL_Running'] != "Yes" or ss['Slave_IO_Running'] != "Yes":
            raise Exception("Replication is NOT RUNNING on slave {s}: "
                            "SQL: {st} | IO: {it}"
                            "".format(s=slave,
                                      st=ss['Slave_SQL_Running'],
                                      it=ss['Slave_IO_Running']))

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(conn)

        if args.all:
            db_to_check = dbs
        else:
            # default behaviour, check a given DB every N days based on
            # day of year. minimizes month-boundary issues.
            db_to_check = set()
            check_modulus = int(time.strftime("%j")) % \
                int(args.check_fraction)
            counter = 0
            for db in dbs:
                modulus = counter % int(args.check_fraction)
                if modulus == check_modulus:
                    db_to_check.add(db)
                counter = counter + 1

    # Iterate through the list of DBs and check one table at a time.
    # We do it this way to ensure more coverage in case pt-table-checksum
    # loses its DB connection and errors out before completing a full scan
    # of a given database.
    #
    for db in db_to_check:
        conn = mysql_lib.connect_mysql(instance, 'scriptro')
        tables_to_check = mysql_lib.get_tables(conn, db, skip_views=True)
        for tbl in tables_to_check:
            c_cmd, c_out, c_err, c_ret = checksum_tbl(instance, db, tbl)
            if not args.quiet:
                log.info("Checksum command executed was:"
                         "\n{cmd}".format(cmd=c_cmd))
                log.info("Standard out:\n{out}".format(out=c_out))
                log.info("Standard error:\n{err}".format(err=c_err))
                log.info("Return code: {ret}".format(ret=c_ret))

            # parse each line of STDOUT (there should only be one with
            # actual data). We only care about errors, rows, chunks, and
            # skipped, since we'll need to figure out diffs separately for
            # each slave box.
            for line in c_out.split("\n"):
                results = parse_checksum_row(line)
                if results:
                    chunk_errors = int(results[1])
                    row_count = int(results[3])
                    chunk_count = int(results[4])
                    chunk_skips = int(results[5])

                    for slave in slaves:
                        rows_checked = 'NO'
                        sync_cmd = ""
                        sync_out = ""
                        sync_err = ""
                        sync_ret = -1
                        row_diffs = 0

                        elapsed_time_ms, chunk_diffs = \
                            check_one_replica(slave, db, tbl)

                        # if we skipped some chunks or there were errors,
                        # this means we can't have complete information
                        # about the state of the replica. in the case of a
                        # hard error, we'll just stop. in the case of a
                        # skipped chunk, we will treat it as a different
                        # chunk for purposes of deciding whether or not to
                        # do a more detailed analysis.
                        #
                        checkable_chunks = chunk_skips + chunk_diffs

                        if chunk_errors > 0:
                            checksum_status = 'ERRORS_IN_CHECKSUM_PROCESS'
                        elif checkable_chunks == 0:
                            checksum_status = 'GOOD'
                        else:
                            if checkable_chunks > int(args.max_diffs):
                                # too many chunk diffs, don't bother
                                # checking further. not good.
                                checksum_status = 'TOO_MANY_CHUNK_DIFFS'
                            elif checkable_chunks < int(args.min_diffs):
                                # some diffs, but not enough that we care.
                                checksum_status = 'CHUNK_DIFFS_FOUND_BUT_OK'
                            else:
                                start_time = int(time.time()*1000)
                                rows_checked = 'YES'

                                # set the proper status - did we do a
                                # sync-based check because of explicit
                                # diffs or because of skipped chunks?
                                if chunk_diffs > 0:
                                    checksum_status = 'ROW_DIFFS_FOUND'
                                else:
                                    checksum_status = 'CHUNKS_WERE_SKIPPED'

                                sync_cmd, sync_out, sync_err, sync_ret, \
                                    row_diffs = checksum_tbl_via_sync(
                                        slave, db, tbl)

                                # Add in the time it took to do the sync.
                                elapsed_time_ms += \
                                    int(time.time()*1000) - start_time

                                if not args.quiet:
                                    log.info("Sync command executed was:"
                                             "\n{cmd} ".format(cmd=sync_cmd))
                                    log.info("Standard out:\n {out}"
                                             "".format(out=sync_out))
                                    log.info("Standard error:\n {err}"
                                             "".format(err=sync_err))
                                    log.info("Return code: {ret}"
                                             "".format(ret=sync_ret))
                                    log.info("Row diffs found: {cnt}"
                                             "".format(cnt=row_diffs))

                        # Checksum process is complete, store the results.
                        #
                        data = {'instance': slave,
                                'master_instance': instance,
                                'db': db,
                                'tbl': tbl,
                                'elapsed_time_ms': elapsed_time_ms,
                                'chunk_count': chunk_count,
                                'chunk_errors': chunk_errors,
                                'chunk_diffs': chunk_diffs,
                                'chunk_skips': chunk_skips,
                                'row_count': row_count,
                                'row_diffs': row_diffs,
                                'rows_checked': rows_checked,
                                'checksum_status': checksum_status,
                                'checksum_cmd': None,
                                'checksum_stdout': None,
                                'checksum_stderr': None,
                                'checksum_rc': c_ret,
                                'sync_cmd': None,
                                'sync_stdout': None,
                                'sync_stderr': None,
                                'sync_rc': sync_ret}

                        if args.verbose:
                            data.update({'checksum_cmd': c_cmd,
                                         'checksum_stdout': c_out,
                                         'checksum_stderr': c_err,
                                         'sync_cmd': sync_cmd,
                                         'sync_stdout': sync_out,
                                         'sync_stderr': sync_err,
                                         'sync_rc': sync_ret})

                        write_checksum_status(instance, data)
        conn.close()
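# The checksum scheduling above spreads dbs across the year: db number
# `counter` is checked on days where (day_of_year % fraction) equals
# (counter % fraction). A standalone sketch of that selection logic
# (sorted() added only to make the ordering deterministic, since the
# original iterates a set):
def example_dbs_for_today(dbs, check_fraction):
    check_modulus = int(time.strftime("%j")) % int(check_fraction)
    return set(db for counter, db in enumerate(sorted(dbs))
               if counter % int(check_fraction) == check_modulus)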
def verify_flexsharded_csv_backup(shard_type, date, instance=None):
    """ Verify that a flexsharded data set has been backed up to hive

    Args:
    shard_type - i.e. 'commercefeeddb', etc
    date - The date to search for
    instance - Restrict the search to problems on a single instance

    Returns True for no problems found, False otherwise.
    """
    success = True
    replica_sets = set()
    zk = host_utils.MysqlZookeeper()
    if instance:
        replica_sets.add(zk.get_replica_set_from_instance(instance)[0])
    else:
        for replica_set in zk.get_all_mysql_replica_sets():
            if replica_set.startswith(
                    environment_specific.FLEXSHARD_DBS[shard_type]
                    ['zk_prefix']):
                replica_sets.add(replica_set)

    schema_host = zk.get_mysql_instance_from_replica_set(
        environment_specific.FLEXSHARD_DBS[shard_type]
        ['example_shard_replica_set'],
        repl_type=host_utils.REPLICA_ROLE_SLAVE)

    boto_conn = boto.connect_s3()
    bucket = boto_conn.get_bucket(environment_specific.S3_CSV_BUCKET,
                                  validate=False)
    missing_uploads = set()
    for db in mysql_lib.get_dbs(schema_host):
        for table in mysql_backup_csv.mysql_backup_csv(
                schema_host).get_tables_to_backup(db):
            if not verify_csv_schema_upload(shard_type, date, schema_host,
                                            db, [table]):
                success = False
                continue

            table_missing_uploads = set()
            for replica_set in replica_sets:
                chk_instance = zk.get_mysql_instance_from_replica_set(
                    replica_set)
                (_, data_path, success_path) = \
                    environment_specific.get_csv_backup_paths(
                        date, db, table, chk_instance.replica_type,
                        chk_instance.get_zk_replica_set()[0])
                if not bucket.get_key(data_path):
                    table_missing_uploads.add(data_path)
                    success = False

            if not table_missing_uploads and not instance:
                if not bucket.get_key(success_path):
                    print 'Creating success key {key}'.format(
                        key=success_path)
                    key = bucket.new_key(success_path)
                    key.set_contents_from_string('')

            missing_uploads.update(table_missing_uploads)

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print('Shard type {shard_type} is missing uploads:'
                  ''.format(shard_type=shard_type))
            pprint.pprint(missing_uploads)
        else:
            print('Shard type {shard_type} is missing {num} uploads'
                  ''.format(num=len(missing_uploads),
                            shard_type=shard_type))

    if not missing_uploads and not instance and success:
        print 'Shard type {shard_type} is backed up'.format(
            shard_type=shard_type)

    return success
def start_shard_migration(source_replica_set, destination_replica_set,
                          mig_dbs):
    """ Move shards from one replica set to another

    Args:
    source_replica_set - Which replica set to take the shards from
    destination_replica_set - Which replica set to put the shards on
    mig_dbs - A set of databases to be migrated
    """
    # In 2017Q1 sharddb and modsharddb will learn how to deal with shard
    # migrations. We will block them for now.
    if source_replica_set.startswith('db') or \
            source_replica_set.startswith('moddb'):
        raise Exception('Sharddb and modsharddb migrations are not yet '
                        'supported')

    if source_replica_set == destination_replica_set:
        raise Exception('Source and destination can not be the same!')

    # Dealing with failures, potentially due to failovers, seems scary
    # here. We are intentionally not catching exceptions as this seems racy
    # and it would be far better for the entire process to fail than to
    # mess with replication during a failover.
    log.info('Requested to migrate from {s} to {d} databases: {db}'
             ''.format(s=source_replica_set,
                       d=destination_replica_set,
                       db=', '.join(mig_dbs)))

    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(
        source_replica_set)
    source_slave = zk.get_mysql_instance_from_replica_set(
        source_replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
    if not source_slave:
        source_slave = zk.get_mysql_instance_from_replica_set(
            source_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    log.info('Source host for dumping data {}'.format(source_slave))

    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)
    log.info('Destination host for restoring data {}'
             ''.format(destination_master))

    expected_dbs_on_source = \
        zk.get_sharded_dbs_by_replica_set()[source_replica_set]
    non_mig_dbs = mysql_lib.get_dbs(source_slave).difference(mig_dbs)
    unexpected_dbs = mig_dbs.difference(expected_dbs_on_source)
    if unexpected_dbs:
        raise Exception('Unexpected database supplied for migration: {}'
                        ''.format(unexpected_dbs))

    # Make sure there are no missing or extra shards
    precheck_schema(source_master)
    precheck_schema(destination_master)

    # Check disk space
    required_disk_space = get_required_disk_space(mig_dbs, source_master)
    available_disk_space = \
        disk_space_available_for_migration(destination_master)
    if available_disk_space < required_disk_space:
        raise Exception('Insufficient disk space to migrate, '
                        'available {a}MB, '
                        'required {r}MB'
                        ''.format(a=available_disk_space,
                                  r=required_disk_space))
    else:
        log.info('Disk space looks ok: '
                 'available {a}MB, '
                 'required {r}MB'
                 ''.format(a=available_disk_space,
                           r=required_disk_space))

    # Let's take out a lock to make sure we don't have multiple migrations
    # running on the same replica sets (either source or destination).
    lock_id = take_migration_lock(source_replica_set,
                                  destination_replica_set,
                                  mig_dbs, non_mig_dbs)
    try:
        if non_mig_dbs:
            # First we will dump the schema for the shards that are not
            # moving
            log.info('Backing up non-migrating schema: '
                     '{}'.format(non_mig_dbs))
            no_mig_backup = backup.logical_backup_instance(
                source_slave, time.localtime(), blackhole=True,
                databases=non_mig_dbs)
            time.sleep(1)

        # And next the metadata db
        log.info('Backing up metadata db: {}'.format(mysql_lib.METADATA_DB))
        metadata_backup = backup.logical_backup_instance(
            source_slave, time.localtime(),
            databases=[mysql_lib.METADATA_DB])
        time.sleep(1)

        # Next we will backup the data for the shards that are moving
        log.info('Backing up migrating schema data: {}'.format(mig_dbs))
        mig_backup = backup.logical_backup_instance(
            source_slave, time.localtime(), databases=mig_dbs)
    except:
        finish_migration_log(lock_id, STATUS_EXPORT_FAILED)
        raise

    if non_mig_dbs:
        # Finally import the backups
        log.info('Importing all the blackhole tables')
        mysql_restore.logical_restore(no_mig_backup, destination_master)

    log.info('Import metadata')
    mysql_restore.logical_restore(metadata_backup, destination_master)

    log.info('Setting up replication')
    mysql_lib.change_master(destination_master, source_master, 'BOGUS', 0,
                            no_start=True, skip_set_readonly=True,
                            gtid_auto_pos=False)
    mysql_restore.logical_restore(mig_backup, destination_master)

    # add start slave, catchup
    mysql_lib.start_replication(destination_master)
    mysql_lib.wait_for_catch_up(destination_master, migration=True)

    # And update the log/locks
    update_migration_status(lock_id, STATUS_FAILOVER_READY)
    log.info('The migration is ready to be finished by running:')
    log.info('/usr/local/bin/mysql_utils/finish_shard_migration.py {src}'
             ''.format(src=source_replica_set))
def verify_flexsharded_csv_backup(shard_type, date, dev_bucket=False):
    """ Verify that a flexsharded data set has been backed up to hive

    Args:
    shard_type - i.e. 'commercefeeddb', etc
    date - The date to search for
    dev_bucket - Look in the dev bucket?

    Returns:
    True for no problems found, False otherwise.
    """
    success = True
    replica_sets = set()
    zk = host_utils.MysqlZookeeper()

    # Figure out what replica sets to check based on a prefix
    for replica_set in zk.get_all_mysql_replica_sets():
        if replica_set.startswith(
                environment_specific.FLEXSHARD_DBS[shard_type]['zk_prefix']):
            replica_sets.add(replica_set)

    # Example schema host
    schema_host = zk.get_mysql_instance_from_replica_set(
        environment_specific.FLEXSHARD_DBS[shard_type]
        ['example_shard_replica_set'],
        repl_type=host_utils.REPLICA_ROLE_SLAVE)

    boto_conn = boto.connect_s3()
    bucket_name = environment_specific.S3_CSV_BUCKET_DEV if dev_bucket \
        else environment_specific.S3_CSV_BUCKET
    bucket = boto_conn.get_bucket(bucket_name, validate=False)
    missing_uploads = set()
    for db in mysql_lib.get_dbs(schema_host):
        table_list = [
            '{}.{}'.format(db, x)
            for x in mysql_lib.get_tables(schema_host, db, True)
        ]
        table_tuples = backup.filter_tables_to_csv_backup(
            schema_host, table_list)
        for t in table_tuples:
            try:
                verify_csv_schema_upload(schema_host, db,
                                         [t[0].split('.')[1]],
                                         date=date,
                                         dev_bucket=dev_bucket)
            except:
                continue

            table_missing_uploads = set()
            for replica_set in replica_sets:
                chk_instance = zk.get_mysql_instance_from_replica_set(
                    replica_set)
                (_, data_path, success_path) = backup.get_csv_backup_paths(
                    chk_instance, db, t[0].split('.')[1],
                    date=date, partition_number=t[2])

                k = bucket.get_key(data_path)
                if k is None:
                    table_missing_uploads.add(data_path)
                    success = False
                elif k.size == 0:
                    # we should not have zero-length files, because even if
                    # we send zero bytes to lzop, there's a 55-byte header.
                    # so, if this actually happened, it probably means that
                    # something is wrong. delete the key and add it to the
                    # missing_uploads list so that we'll try again.
                    k.delete()
                    table_missing_uploads.add(data_path)
                    success = False

            if not table_missing_uploads and \
                    not bucket.get_key(success_path):
                print 'Creating success key {b}/{k}'.format(b=bucket_name,
                                                            k=success_path)
                key = bucket.new_key(success_path)
                key.set_contents_from_string(' ')

            missing_uploads.update(table_missing_uploads)

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print('Shard type {} is missing uploads:'.format(shard_type))
            pprint.pprint(missing_uploads)
        else:
            print('Shard type {shard_type} is missing {num} uploads'
                  ''.format(num=len(missing_uploads),
                            shard_type=shard_type))

    if not missing_uploads and success:
        print 'Shard type {} is backed up'.format(shard_type)

    return success
def verify_unsharded_csv_backups(instance, date, dev_bucket=False):
    """ Verify csv backups for an instance which is not part of a sharded
        system

    Args:
    instance - The instance to inspect for backups being done
    date - The date to search for
    dev_bucket - Use the dev bucket?

    Returns:
    True for no problems found, False otherwise.
    """
    return_status = True
    boto_conn = boto.connect_s3()
    bucket_name = environment_specific.S3_CSV_BUCKET_DEV if dev_bucket \
        else environment_specific.S3_CSV_BUCKET
    bucket = boto_conn.get_bucket(bucket_name, validate=False)
    missing_uploads = set()
    for db in mysql_lib.get_dbs(instance):
        table_list = [
            '{}.{}'.format(db, x)
            for x in mysql_lib.get_tables(instance, db, True)
        ]
        table_tuples = backup.filter_tables_to_csv_backup(instance,
                                                          table_list)
        try:
            verify_csv_schema_upload(
                instance, db, [x[0].split('.')[1] for x in table_tuples],
                date, dev_bucket)
        except Exception as e:
            print e
            return_status = False
            continue

        table_names = [x[0] for x in table_tuples]
        expected_partitions = dict(
            (x, table_names.count(x)) for x in table_names)
        found_partitions = dict()

        for t in table_tuples:
            (_, data_path, success_path) = \
                backup.get_csv_backup_paths(instance, *t[0].split('.'),
                                            date=date,
                                            partition_number=t[2])
            k = bucket.get_key(data_path)
            if k is None:
                missing_uploads.add(data_path)
            elif k.size == 0:
                # we should not have zero-length files, because even if
                # we send zero bytes to lzop, there's a 55-byte header.
                # so, if this actually happened, it probably means that
                # something is wrong. delete the key and add it to the
                # missing_uploads list so that we'll try again.
                k.delete()
                missing_uploads.add(data_path)
            else:
                found_partitions[t[0]] = 1 + found_partitions.get(t[0], 0)

            # We still need to create a success file for the data
            # team for this table, even if something else is AWOL
            # later in the backup.
            s_key = bucket.get_key(success_path)
            if s_key:
                if found_partitions.get(t[0], 0) < expected_partitions[t[0]]:
                    print ('Success key {b}/{k} exists but it should '
                           'not - deleting it!'.format(b=bucket_name,
                                                       k=success_path))
                    s_key.delete()
                elif found_partitions.get(t[0], 0) == \
                        expected_partitions[t[0]]:
                    print 'Success key {b}/{k} exists!'.format(
                        b=bucket_name, k=success_path)
            elif found_partitions.get(t[0], 0) == expected_partitions[t[0]]:
                print 'Creating success key {b}/{k}'.format(b=bucket_name,
                                                            k=success_path)
                key = bucket.new_key(success_path)
                key.set_contents_from_string(' ')

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print 'Missing uploads: {}'.format(missing_uploads)
        else:
            print 'Missing {} uploads'.format(len(missing_uploads))
        return_status = False

    if return_status:
        log_csv_backup_success(instance, date, dev_bucket)
    return return_status
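# The partition accounting above counts duplicate table names in
# table_tuples to learn how many csv partitions each table should have,
# then compares that to the keys actually found. A tiny illustration of
# the counting idiom with made-up names:
def example_expected_partitions():
    table_names = ['db1.t1', 'db1.t1', 'db1.t2']
    expected = dict((x, table_names.count(x)) for x in table_names)
    # expected == {'db1.t1': 2, 'db1.t2': 1}
    return expected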
    except socket.error, (code, msg):
        log.error("Unable to bind socket for checksum on {rs} "
                  "(msg: {m}, code:{c})".format(rs=replica_set,
                                                m=msg, c=code))
        sys.exit(code)

    log.info("Locked replica set {} for checksum on this "
             "server".format(replica_set))

    # before we even start this, make sure replication is OK.
    for slave in slaves:
        mysql_lib.assert_replication_sanity(slave)

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(instance)

        if args.all:
            db_to_check = dbs
        else:
            # default behaviour, check a given DB every N days based on
            # day of year. minimizes month-boundary issues.
            db_to_check = set()
            check_modulus = int(time.strftime("%j")) % \
                int(args.check_fraction)
            counter = 0
            for db in dbs:
                modulus = counter % int(args.check_fraction)
                if modulus == check_modulus:
                    db_to_check.add(db)
                counter = counter + 1