def get_tables_to_backup(self, db):
    """ Work out which tables of a db should be backed up

    Args:
    db - Name of the db whose tables are being considered

    Returns:
    The tables reported by mysql_lib.get_tables for the db (views are
    skipped), or a set holding only self.force_table when one was requested

    Raises:
    Exception if self.force_table is set but is not a table in the db
    """
    available = mysql_lib.get_tables(self.instance, db, skip_views=True)

    # Nothing was forced, so everything found is a candidate.
    if not self.force_table:
        return available

    if self.force_table in available:
        return set([self.force_table])

    raise Exception('Requested table {t} does not exist in db {d}'
                    ''.format(t=self.force_table, d=db))
def get_db_size_from_log(instance, db):
    """ Get yesterdays db size for an instance

    Args:
    instance - A hostaddr object
    db - A database that exists on the instance

    Returns:
    size in MB

    Raises:
    Exception if no size data was logged for the db yesterday, or if the
    number of logged tables differs from the number of tables currently
    present in the db
    """
    conn = mysql_lib.connect_mysql(instance, 'dbascript')
    cursor = conn.cursor()
    sql = ("SELECT SUM(size_mb) as 'mb', "
           " COUNT(1) as 'table_count' "
           "FROM {metadata_db}.{tbl} "
           "WHERE db = %(db)s "
           " AND reported_at=CURDATE() - INTERVAL 1 DAY "
           " AND hostname=%(hostname)s and port=%(port)s "
           "GROUP BY db;")
    params = {'hostname': instance.hostname,
              'port': instance.port,
              'db': db}
    cursor.execute(sql.format(metadata_db=mysql_lib.METADATA_DB,
                              tbl=TABLE_SIZE_TBL),
                   params)
    ret = cursor.fetchone()

    expected_tables = mysql_lib.get_tables(instance, db, skip_views=True)
    # Because of the GROUP BY, a db with no logged rows yields no result row
    # at all; fetchone() then returns None. Treat that the same way as a
    # partial set of rows rather than failing on the subscript below.
    if ret is None or ret['table_count'] != len(expected_tables):
        raise Exception('Size data appears to be missing for {db} on {inst}'
                        ''.format(db=db, inst=instance))
    return ret['mb']
def verify_csv_schema_upload(shard_type, date, schema_host, schema_db,
                             schema_upload_path_raw):
    """ Confirm that schema files are uploaded

    Args:
    shard_type - In this case, a hostname or shard type (generally
                 one and the same)
    date - The date to search for
    schema_host - A host to examine to find which tables should exist
    schema_db - Which db to inspect on schema_host
    schema_upload_path_raw - A string that can be format'ed in order to
                             create a S3 key path

    Returns:
    A tuple of (status, tables). status is True when a key was found for
    every table and False otherwise; tables is the collection of tables
    that was checked.
    """
    s3 = boto.connect_s3()
    bucket = s3.get_bucket(environment_specific.S3_CSV_BUCKET,
                           validate=False)
    db_name = environment_specific.convert_shard_to_db(schema_db)
    tables = mysql_lib.get_tables(schema_host, db_name, skip_views=True)

    all_present = True
    for table in tables:
        key_path = schema_upload_path_raw.format(table=table,
                                                 hostname_prefix=shard_type,
                                                 date=date,
                                                 db_name=schema_db)
        if bucket.get_key(key_path):
            continue
        # Keep going after a miss so that every missing key gets reported.
        print('Expected key {key} is missing'.format(key=key_path))
        all_present = False

    return all_present, tables
def verify_csv_schema_upload(shard_type, date, schema_host, schema_db,
                             schema_upload_path_raw):
    """ Check S3 for the schema file of every table in a db

    Args:
    shard_type - In this case, a hostname or shard type (generally
                 one and the same)
    date - The date to search for
    schema_host - A host to examine to find which tables should exist
    schema_db - Which db to inspect on schema_host
    schema_upload_path_raw - A string that can be format'ed in order to
                             create a S3 key path

    Returns:
    A tuple of (status, tables). status is False if any expected key is
    absent from the bucket, True otherwise; tables is the collection of
    tables that was inspected.
    """
    bucket = boto.connect_s3().get_bucket(environment_specific.S3_CSV_BUCKET,
                                          validate=False)
    tables = mysql_lib.get_tables(
        schema_host,
        environment_specific.convert_shard_to_db(schema_db),
        skip_views=True)

    found_everything = True
    for tbl in tables:
        expected_key = schema_upload_path_raw.format(
            table=tbl,
            hostname_prefix=shard_type,
            date=date,
            db_name=schema_db)
        if bucket.get_key(expected_key):
            continue
        # Report each absent key; do not stop at the first one.
        print('Expected key {key} is missing'.format(key=expected_key))
        found_everything = False

    return found_everything, tables
def verify_unsharded_csv_backup(shard_type, date, instance):
    """ Verify that a non-sharded db has been backed up to hive

    Args:
    shard_type - In this case, a hostname prefix
    date - The date to search for
    instance - The actual instance to inspect for backups being done

    Returns:
    True for no problems found, False otherwise.
    """
    # Sample the clock once so that all of the time checks below agree.
    now = datetime.datetime.utcnow()
    newest_backup_date = (now.date() -
                          datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    if date == newest_backup_date:
        if now.time() < CSV_STARTUP:
            # Before CSV_STARTUP we do not check anything for the newest
            # backup date.
            print('Backup startup time has not yet passed')
            return True

        if now.time() < CSV_COMPLETION_TIME:
            # Until after CSV_COMPLETION_TIME it is good enough
            # to check if backups are running. If they are running,
            # everything is ok. If they are not running, we will do all the
            # normal checks.
            if csv_backups_running(instance):
                print('Backup running on {i}'.format(i=instance))
                return True

    return_status = True
    for db in mysql_lib.get_dbs(instance):
        (success, _) = verify_csv_schema_upload(
            shard_type, date, instance, db,
            mysql_backup_csv.PATH_DAILY_BACKUP_NONSHARDED_SCHEMA)
        if not success:
            return_status = False

    if not return_status:
        print('missing schema file')
        # problem with schema, don't bother verifying data
        return return_status

    boto_conn = boto.connect_s3()
    bucket = boto_conn.get_bucket(environment_specific.S3_CSV_BUCKET,
                                  validate=False)
    missing_uploads = set()
    for db in mysql_lib.get_dbs(instance):
        for table in mysql_lib.get_tables(instance, db, skip_views=True):
            key = mysql_backup_csv.PATH_DAILY_BACKUP.format(
                table=table,
                hostname_prefix=shard_type,
                date=date,
                db_name=db)
            if not bucket.get_key(key):
                missing_uploads.add(key)

    if not missing_uploads:
        return True

    if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
        print('Missing uploads: {uploads}'.format(uploads=missing_uploads))
    else:
        print('Missing {num} uploads'.format(num=len(missing_uploads)))
    # Return an explicit False; falling off the end here would hand the
    # caller None instead of the documented boolean.
    return False
def verify_unsharded_csv_backup(shard_type, date, instance):
    """ Verify that a non-sharded db has been backed up to hive

    Args:
    shard_type - In this case, a hostname prefix
    date - The date to search for
    instance - The actual instance to inspect for backups being done

    Returns:
    True for no problems found, False otherwise.
    """
    # Take a single clock reading so the date and time-of-day checks are
    # consistent with each other.
    utc_now = datetime.datetime.utcnow()
    yesterday = (utc_now.date() -
                 datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    if date == yesterday:
        if utc_now.time() < CSV_STARTUP:
            # Nothing is checked for this date until CSV_STARTUP has passed.
            print('Backup startup time has not yet passed')
            return True

        # Until after CSV_COMPLETION_TIME it is good enough to check if
        # backups are running. If they are running, everything is ok. If
        # they are not running, we will do all the normal checks.
        if (utc_now.time() < CSV_COMPLETION_TIME and
                csv_backups_running(instance)):
            print('Backup running on {i}'.format(i=instance))
            return True

    schemas_ok = True
    for db in mysql_lib.get_dbs(instance):
        (success, _) = verify_csv_schema_upload(
            shard_type, date, instance, db,
            mysql_backup_csv.PATH_DAILY_BACKUP_NONSHARDED_SCHEMA)
        if not success:
            schemas_ok = False

    if not schemas_ok:
        print('missing schema file')
        # problem with schema, don't bother verifying data
        return schemas_ok

    boto_conn = boto.connect_s3()
    bucket = boto_conn.get_bucket(environment_specific.S3_CSV_BUCKET,
                                  validate=False)
    missing_uploads = set()
    for db in mysql_lib.get_dbs(instance):
        for table in mysql_lib.get_tables(instance, db, skip_views=True):
            key = mysql_backup_csv.PATH_DAILY_BACKUP.format(
                table=table,
                hostname_prefix=shard_type,
                date=date,
                db_name=db)
            if not bucket.get_key(key):
                missing_uploads.add(key)

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print('Missing uploads: {uploads}'.format(
                uploads=missing_uploads))
        else:
            print('Missing {num} uploads'.format(num=len(missing_uploads)))
        # Be explicit about failure rather than implicitly returning None.
        return False

    return True
def drop_db_after_rename(instance, dbs, verbose, dry_run):
    """ Drop the original empty db and a non-empty rename db

    Exits the process with status 1 if any requested db is not listed as
    orphaned for the instance, or if any original db still has tables.

    Args:
    instance - a hostaddr object
    dbs - a set of database names
    verbose - bool, will direct sql to stdout
    dry_run - bool, when set no sql is executed
    """
    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    not_orphaned = dbs.difference(orphaned[str(instance)])
    if not_orphaned:
        print("Cowardly refusing to act on the following dbs: " +
              ','.join(not_orphaned))
        sys.exit(1)

    # make sure the original db is empty
    for db in dbs:
        if mysql_lib.get_tables(instance, db):
            print("Cowardly refusing to drop non-empty db:" + db)
            sys.exit(1)

    conn = mysql_lib.connect_mysql(instance)
    cursor = conn.cursor()
    drop_template = 'DROP DATABASE IF EXISTS `{db}`;'
    for db in dbs:
        # First the old empty db, then its non-empty DB_PREPEND'ed
        # counterpart.
        for target in (db, ''.join((DB_PREPEND, db))):
            sql = drop_template.format(db=target)
            if verbose:
                print(sql)
            if not dry_run:
                cursor.execute(sql)
def drop_db_after_rename(instance, dbs, verbose, dry_run):
    """ Drop the original empty db and a non-empty rename db

    The process exits with status 1 when a requested db is not among the
    orphaned dbs reported for the instance, or when an original db is not
    empty.

    Args:
    instance - a hostaddr object
    dbs - a set of database names
    verbose - bool, will direct sql to stdout
    dry_run - bool, when set no sql is executed
    """
    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    instance_key = str(instance)
    unexpected = dbs.difference(orphaned[instance_key])
    if unexpected:
        message = ("Cowardly refusing to act on the following dbs: " +
                   ','.join(unexpected))
        print(message)
        sys.exit(1)

    conn = mysql_lib.connect_mysql(instance)
    cursor = conn.cursor()

    # make sure the original db is empty
    for db in dbs:
        if not mysql_lib.get_tables(conn, db):
            continue
        print("Cowardly refusing to drop non-empty db:" + db)
        sys.exit(1)

    raw_sql = 'DROP DATABASE IF EXISTS `{db}`;'

    def run(statement):
        # Echo and/or execute a single statement according to the flags.
        if verbose:
            print(statement)
        if not dry_run:
            cursor.execute(statement)

    for db in dbs:
        # we should be good to drop the old empty dbs
        run(raw_sql.format(db=db))
        # and we should be ok to drop the non-empty DB_PREPEND'ed db
        run(raw_sql.format(db=''.join((DB_PREPEND, db))))
def get_tables_to_backup(self, db):
    """ Work out which tables of a db should be backed up

    Args:
    db - The db for which we need a list of tables eligible for backup

    Returns:
    The result of filtering the db's tables (views skipped) through
    environment_specific.filter_tables_to_csv_backup, or a set holding only
    self.force_table when one was requested

    Raises:
    Exception if self.force_table is set but did not survive the filtering
    """
    all_tables = mysql_lib.get_tables(self.instance, db, skip_views=True)
    eligible = environment_specific.filter_tables_to_csv_backup(
        self.instance, db, all_tables)

    if self.force_table:
        if self.force_table in eligible:
            return set([self.force_table])
        raise Exception('Requested table {t} is not available to backup'
                        ''.format(t=self.force_table))

    return eligible
def __init__(self, instance, db=None, force_table=None,
             force_reupload=False, dev_bucket=False):
    """ Init function for backup, takes all args

    Args:
    instance - A hostAddr object of the instance to be backed up
    db - (optional) backup only specified db
    force_table - (optional) backup only specified table
    force_reupload - (optional) force reupload of backup
    dev_bucket - (optional) upload to the dev bucket instead of the
                 regular one
    """
    self.instance = instance
    self.session_id = None
    self.timestamp = datetime.datetime.utcnow()
    # datestamp is for s3 files which are by convention -1 day
    one_day = datetime.timedelta(days=1)
    self.datestamp = (self.timestamp - one_day).strftime("%Y-%m-%d")
    self.tables_to_backup = multiprocessing.Queue()
    self.tables_to_retry = multiprocessing.Queue()

    if not db:
        table_list = mysql_lib.get_all_tables_by_instance(instance)
    else:
        # Qualify each table name with the db it lives in.
        table_list = []
        for tbl in mysql_lib.get_tables(instance, db, True):
            table_list.append('{}.{}'.format(db, tbl))

    # Queue up whatever survives the backup filter.
    for entry in backup.filter_tables_to_csv_backup(instance, table_list):
        self.tables_to_backup.put(entry)

    self.dev_bucket = dev_bucket
    self.force_table = force_table
    self.force_reupload = force_reupload
    self.table_count = 0
    if dev_bucket:
        self.upload_bucket = environment_specific.S3_CSV_BUCKET_DEV
    else:
        self.upload_bucket = environment_specific.S3_CSV_BUCKET
def drop_db_after_rename(instance, dbs=None, dry_run=False):
    """ Drop the original empty db and a non-empty rename db

    Args:
    instance - a hostaddr object
    dbs - a set of database names; when empty or not supplied, it is
          derived from the dbs on the instance whose name starts with
          DB_PREPEND (with that prefix removed)
    dry_run - bool, will make no changes to the servers

    Raises:
    Exception if a db is not reported as orphaned for the instance, or if
    an original db still contains tables
    """
    if not dbs:
        prefix_len = len(DB_PREPEND)
        dbs = set(name[prefix_len:]
                  for name in mysql_lib.get_dbs(instance)
                  if name.startswith(DB_PREPEND))

    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    unexpected = dbs.difference(orphaned[instance])
    if unexpected:
        raise Exception('Cowardly refusing to act on the following '
                        'dbs: {}'.format(unexpected))

    # make sure the original db is empty
    for db in dbs:
        if mysql_lib.get_tables(instance, db):
            raise Exception('Cowardly refusing to drop non-empty '
                            'db: {}'.format(db))

    for db in dbs:
        renamed_db = ''.join((DB_PREPEND, db))
        if not dry_run:
            mysql_lib.drop_db(instance, db)
            mysql_lib.drop_db(instance, renamed_db)
            continue
        log.info('dry_run is enabled, not dropping '
                 'dbs: {db} {renamed}'.format(db=db, renamed=renamed_db))
def main():
    """ Command line entry point for the checksum wrapper

    Parses the command line, confirms that the instance is listed as a
    master in zookeeper, makes sure the checksum table exists (creating it
    unless told not to), confirms replication is running on every slave and
    then checksums the selected dbs one table at a time. For every table and
    slave, one result row is stored through write_checksum_status.

    Raises:
    Exception if the instance is not a master, if the checksum table is
    missing and may not be created, or if replication is not running on a
    slave
    """
    description = ("MySQL checksum wrapper\n\n"
                   "Wrapper of pt-table-checksum and pt-table-sync.\n"
                   "Defaults to checksumming 1/{k}th of databases on instance.\n"
                   "If diffs are found, use pt-table-sync to measure actual "
                   "divergence,\nbut only if the number of diffs is between "
                   "--min_diffs and --max_diffs.").format(k=DB_CHECK_FRACTION)

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i',
                        '--instance',
                        help='Instance to act on if other than localhost:3306',
                        default=''.join((socket.getfqdn(), ':3306')))
    parser.add_argument('-a',
                        '--all',
                        help='Checksums all dbs rather than the default',
                        action='store_true',
                        default=False)
    parser.add_argument('-d',
                        '--dbs',
                        help=("Comma separated list of db's to check rather "
                              "than the default"),
                        default=False)
    parser.add_argument('-q',
                        '--quiet',
                        help=("Do not print output to stdout"),
                        action='store_true',
                        default=False)
    parser.add_argument('-m',
                        '--min_diffs',
                        help=("Do per-row check if chunk diff count is at "
                              "least this value"),
                        dest='min_diffs',
                        default=MIN_DIFFS)
    parser.add_argument('-M',
                        '--max_diffs',
                        help=("Do not do per-row check if chunk diff count "
                              "is greater than this value"),
                        dest='max_diffs',
                        default=MAX_DIFFS)
    parser.add_argument('-C',
                        '--no_create_table',
                        help=("If test.checksum_detail is missing, do "
                              "not try to create it."),
                        dest='create_table',
                        action='store_false',
                        default=True)
    parser.add_argument('-v',
                        '--verbose',
                        help=("Store raw output from PT tools in the DB?"),
                        action='store_true',
                        default=False)
    parser.add_argument('-c',
                        '--check_fraction',
                        help=('Check this fraction of databases.'),
                        default=DB_CHECK_FRACTION)

    args = parser.parse_args()
    instance = host_utils.HostAddr(args.instance)
    zk = host_utils.MysqlZookeeper()

    if instance not in \
            zk.get_all_mysql_instances_by_type(host_utils.REPLICA_ROLE_MASTER):
        raise Exception("Instance is not a master in ZK")

    # If enabled, try to create the table that holds the checksum info.
    # If not enabled, make sure that the table exists.
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    if not mysql_lib.does_table_exist(conn, mysql_lib.METADATA_DB,
                                      CHECKSUM_TBL):
        if args.create_table:
            create_checksum_detail_table(instance)
        else:
            raise Exception("Checksum table not found. Unable to continue."
                            "Consider not using the -C option or create it "
                            "yourself.")

    # Determine what replica set we belong to and get a list of slaves.
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    slaves = set()
    for rtype in (host_utils.REPLICA_ROLE_SLAVE,
                  host_utils.REPLICA_ROLE_DR_SLAVE):
        s = zk.get_mysql_instance_from_replica_set(replica_set, rtype)
        if s:
            slaves.add(s)

    if not slaves:
        log.info("This server has no slaves. Nothing to do.")
        sys.exit(0)

    # before we even start this, make sure replication is OK.
    for slave in slaves:
        slave_conn = mysql_lib.connect_mysql(slave, 'scriptrw')
        ss = mysql_lib.get_slave_status(slave_conn)
        if ss['Slave_SQL_Running'] != "Yes" or ss['Slave_IO_Running'] != "Yes":
            # Every placeholder of the message must be supplied; leaving out
            # the slave would turn this into a KeyError from format().
            raise Exception("Replication is NOT RUNNING on slave {s}: "
                            "SQL: {st} | IO: {it}".format(
                                s=slave,
                                st=ss['Slave_SQL_Running'],
                                it=ss['Slave_IO_Running']))

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(conn)

        if args.all:
            db_to_check = dbs
        else:
            # default behaviour, check a given DB every N days based on
            # day of year. minimizes month-boundary issues.
            db_to_check = set()
            check_fraction = int(args.check_fraction)
            check_modulus = int(time.strftime("%j")) % check_fraction
            for counter, db in enumerate(dbs):
                if counter % check_fraction == check_modulus:
                    db_to_check.add(db)

    # Iterate through the list of DBs and check one table at a time.
    # We do it this way to ensure more coverage in case pt-table-checksum
    # loses its DB connection and errors out before completing a full scan
    # of a given database.
    #
    for db in db_to_check:
        conn = mysql_lib.connect_mysql(instance, 'scriptro')
        tables_to_check = mysql_lib.get_tables(conn, db, skip_views=True)
        for tbl in tables_to_check:
            c_cmd, c_out, c_err, c_ret = checksum_tbl(instance, db, tbl)
            if not args.quiet:
                log.info("Checksum command executed was:\n{cmd}".format(cmd=c_cmd))
                log.info("Standard out:\n{out}".format(out=c_out))
                log.info("Standard error:\n{err}".format(err=c_err))
                log.info("Return code: {ret}".format(ret=c_ret))

            # parse each line of STDOUT (there should only be one with
            # actual data). We only care about errors, rows, chunks, and
            # skipped, since we'll need to figure out diffs separately for
            # each slave box.
            for line in c_out.split("\n"):
                results = parse_checksum_row(line)
                if not results:
                    continue

                chunk_errors = int(results[1])
                row_count = int(results[3])
                chunk_count = int(results[4])
                chunk_skips = int(results[5])

                for slave in slaves:
                    rows_checked = 'NO'
                    sync_cmd = ""
                    sync_out = ""
                    sync_err = ""
                    sync_ret = -1
                    row_diffs = 0

                    elapsed_time_ms,\
                        chunk_diffs = check_one_replica(slave, db, tbl)

                    # if we skipped some chunks or there were errors,
                    # this means we can't have complete information about
                    # the state of the replica. in the case of a hard
                    # error, we'll just stop. in the case of a skipped
                    # chunk, we will treat it as a different chunk for
                    # purposes of deciding whether or not to do a more
                    # detailed analysis.
                    #
                    checkable_chunks = chunk_skips + chunk_diffs

                    if chunk_errors > 0:
                        checksum_status = 'ERRORS_IN_CHECKSUM_PROCESS'
                    elif checkable_chunks == 0:
                        checksum_status = 'GOOD'
                    elif checkable_chunks > int(args.max_diffs):
                        # too many chunk diffs, don't bother checking
                        # further. not good.
                        checksum_status = 'TOO_MANY_CHUNK_DIFFS'
                    elif checkable_chunks < int(args.min_diffs):
                        # some diffs, but not enough that we care.
                        checksum_status = 'CHUNK_DIFFS_FOUND_BUT_OK'
                    else:
                        start_time = int(time.time()*1000)
                        rows_checked = 'YES'

                        # set the proper status - did we do a sync-based
                        # check because of explicit diffs or because of
                        # skipped chunks?
                        if chunk_diffs > 0:
                            checksum_status = 'ROW_DIFFS_FOUND'
                        else:
                            checksum_status = 'CHUNKS_WERE_SKIPPED'

                        sync_cmd, sync_out, sync_err, sync_ret, \
                            row_diffs = checksum_tbl_via_sync(slave, db, tbl)

                        # Add in the time it took to do the sync.
                        elapsed_time_ms += int(time.time()*1000) - start_time

                        if not args.quiet:
                            log.info("Sync command executed was:\n{cmd} ".format(cmd=sync_cmd))
                            log.info("Standard out:\n {out}".format(out=sync_out))
                            log.info("Standard error:\n {err}".format(err=sync_err))
                            log.info("Return code: {ret}".format(ret=sync_ret))
                            log.info("Row diffs found: {cnt}".format(cnt=row_diffs))

                    # Checksum process is complete, store the results.
                    #
                    data = {'instance': slave,
                            'master_instance': instance,
                            'db': db,
                            'tbl': tbl,
                            'elapsed_time_ms': elapsed_time_ms,
                            'chunk_count': chunk_count,
                            'chunk_errors': chunk_errors,
                            'chunk_diffs': chunk_diffs,
                            'chunk_skips': chunk_skips,
                            'row_count': row_count,
                            'row_diffs': row_diffs,
                            'rows_checked': rows_checked,
                            'checksum_status': checksum_status,
                            'checksum_cmd': None,
                            'checksum_stdout': None,
                            'checksum_stderr': None,
                            'checksum_rc': c_ret,
                            'sync_cmd': None,
                            'sync_stdout': None,
                            'sync_stderr': None,
                            'sync_rc': sync_ret}

                    # The raw tool output is only stored on request.
                    if args.verbose:
                        data.update({'checksum_cmd': c_cmd,
                                     'checksum_stdout': c_out,
                                     'checksum_stderr': c_err,
                                     'sync_cmd': sync_cmd,
                                     'sync_stdout': sync_out,
                                     'sync_stderr': sync_err,
                                     'sync_rc': sync_ret})

                    write_checksum_status(instance, data)

        # Release the connection that was opened for this db.
        conn.close()
db_to_check = set() check_modulus = int(time.strftime("%j")) % int(args.check_fraction) counter = 0 for db in dbs: modulus = counter % int(args.check_fraction) if modulus == check_modulus: db_to_check.add(db) counter = counter + 1 # Iterate through the list of DBs and check one table at a time. # We do it this way to ensure more coverage in case pt-table-checksum # loses its DB connection and errors out before completing a full scan # of a given database. # for db in db_to_check: tables_to_check = mysql_lib.get_tables(instance, db, skip_views=True) for tbl in tables_to_check: c_cmd, c_out, c_err, c_ret = checksum_tbl(instance, db, tbl) if not args.quiet: log.info( "Checksum command executed was:\n{cmd}".format(cmd=c_cmd)) log.info("Standard out:\n{out}".format(out=c_out)) log.info("Standard error:\n{err}".format(err=c_err)) log.info("Return code: {ret}".format(ret=c_ret)) # parse each line of STDOUT (there should only be one with # actual data). We only care about errors, rows, chunks, and # skipped, since we'll need to figure out diffs separately for # each slave box. for line in c_out.split("\n"): results = parse_checksum_row(line)
def verify_flexsharded_csv_backup(shard_type, date, dev_bucket=False):
    """ Verify that a flexsharded data set has been backed up to hive

    Args:
    shard_type - i.e. 'commercefeeddb', etc
    date - The date to search for
    dev_bucket - Look in the dev bucket?

    Returns:
    True for no problems found, False otherwise.
    """
    success = True
    replica_sets = set()
    zk = host_utils.MysqlZookeeper()
    flexshard_config = environment_specific.FLEXSHARD_DBS[shard_type]

    # Figure out what replica sets to check based on a prefix
    for replica_set in zk.get_all_mysql_replica_sets():
        if replica_set.startswith(flexshard_config['zk_prefix']):
            replica_sets.add(replica_set)

    # Example schema host
    schema_host = zk.get_mysql_instance_from_replica_set(
        flexshard_config['example_shard_replica_set'],
        repl_type=host_utils.REPLICA_ROLE_SLAVE)

    boto_conn = boto.connect_s3()
    bucket_name = environment_specific.S3_CSV_BUCKET_DEV if dev_bucket \
        else environment_specific.S3_CSV_BUCKET
    bucket = boto_conn.get_bucket(bucket_name, validate=False)
    missing_uploads = set()

    for db in mysql_lib.get_dbs(schema_host):
        table_list = ['{}.{}'.format(db, x)
                      for x in mysql_lib.get_tables(schema_host, db, True)]
        table_tuples = backup.filter_tables_to_csv_backup(schema_host,
                                                          table_list)

        for t in table_tuples:
            try:
                verify_csv_schema_upload(schema_host, db,
                                         [t[0].split('.')[1]],
                                         date=date, dev_bucket=dev_bucket)
            except Exception as e:
                # Tables whose schema upload can not be verified are skipped,
                # but say why, and let KeyboardInterrupt/SystemExit through
                # instead of swallowing them.
                # NOTE(review): the skip does not mark the run as failed;
                # confirm whether it should set success = False.
                print(e)
                continue

            table_missing_uploads = set()
            for replica_set in replica_sets:
                chk_instance = zk.get_mysql_instance_from_replica_set(
                    replica_set)
                (_, data_path, success_path) = backup.get_csv_backup_paths(
                    chk_instance, db, t[0].split('.')[1],
                    date=date, partition_number=t[2])

                k = bucket.get_key(data_path)
                if k is None:
                    table_missing_uploads.add(data_path)
                    success = False
                elif k.size == 0:
                    # we should not have zero-length files, because even if
                    # we send zero bytes to lzop, there's a 55-byte header.
                    # so, if this actually happened, it probably means that
                    # something is wrong. delete the key and add it to the
                    # missing_uploads list so that we'll try again.
                    k.delete()
                    table_missing_uploads.add(data_path)
                    success = False

            # The success key is only written once every replica set has
            # data for this table.
            if not table_missing_uploads and \
                    not bucket.get_key(success_path):
                print('Creating success key {b}/{k}'.format(b=bucket_name,
                                                            k=success_path))
                key = bucket.new_key(success_path)
                key.set_contents_from_string(' ')

            missing_uploads.update(table_missing_uploads)

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print('Shard type {} is missing uploads:'.format(shard_type))
            pprint.pprint(missing_uploads)
        else:
            print('Shard type {shard_type} is missing {num} uploads'
                  ''.format(num=len(missing_uploads),
                            shard_type=shard_type))

    if not missing_uploads and success:
        print('Shard type {} is backed up'.format(shard_type))

    return success
def verify_unsharded_csv_backups(instance, date, dev_bucket=False):
    """ Verify csv backups for an instance which is not part of a sharded
        system

    Besides checking, this maintains the per-table success keys in the
    bucket (creating or deleting them as needed) and logs the success of a
    fully verified backup through log_csv_backup_success.

    Args:
    instance - The instance to inspect for backups being done
    date - The date to search for
    dev_bucket - Use the dev bucket?

    Returns:
    True for no problems found, False otherwise.
    """
    return_status = True
    boto_conn = boto.connect_s3()
    bucket_name = environment_specific.S3_CSV_BUCKET_DEV if dev_bucket \
        else environment_specific.S3_CSV_BUCKET
    bucket = boto_conn.get_bucket(bucket_name, validate=False)
    missing_uploads = set()

    for db in mysql_lib.get_dbs(instance):
        table_list = ['{}.{}'.format(db, x)
                      for x in mysql_lib.get_tables(instance, db, True)]
        table_tuples = backup.filter_tables_to_csv_backup(instance,
                                                          table_list)
        try:
            verify_csv_schema_upload(
                instance, db,
                [x[0].split('.')[1] for x in table_tuples],
                date, dev_bucket)
        except Exception as e:
            print(e)
            return_status = False
            continue

        # A table appears once per partition in table_tuples, so tallying
        # the names gives the number of partitions expected for each table.
        expected_partitions = dict()
        for x in table_tuples:
            expected_partitions[x[0]] = 1 + expected_partitions.get(x[0], 0)
        found_partitions = dict()

        for t in table_tuples:
            (_, data_path, success_path) = \
                backup.get_csv_backup_paths(instance, *t[0].split('.'),
                                            date=date,
                                            partition_number=t[2])
            k = bucket.get_key(data_path)
            if k is None:
                missing_uploads.add(data_path)
            elif k.size == 0:
                # we should not have zero-length files, because even if
                # we send zero bytes to lzop, there's a 55-byte header.
                # so, if this actually happened, it probably means that
                # something is wrong. delete the key and add it to the
                # missing_uploads list so that we'll try again.
                k.delete()
                missing_uploads.add(data_path)
            else:
                found_partitions[t[0]] = 1 + found_partitions.get(t[0], 0)

            # We still need to create a success file for the data
            # team for this table, even if something else is AWOL
            # later in the backup.
            found = found_partitions.get(t[0], 0)
            expected = expected_partitions[t[0]]
            s_key = bucket.get_key(success_path)
            if s_key:
                if found < expected:
                    # Format the message before printing it; chaining
                    # .format() onto print(...) only works when print is a
                    # statement.
                    print('Success key {b}/{k} exists but it should '
                          'not - deleting it!'.format(b=bucket_name,
                                                      k=success_path))
                    s_key.delete()
                elif found == expected:
                    print('Success key {b}/{k} exists!'.format(
                        b=bucket_name, k=success_path))
            elif found == expected:
                print('Creating success key {b}/{k}'.format(b=bucket_name,
                                                            k=success_path))
                key = bucket.new_key(success_path)
                key.set_contents_from_string(' ')

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print('Missing uploads: {}'.format(missing_uploads))
        else:
            print('Missing {} uploads'.format(len(missing_uploads)))
        return_status = False

    if return_status:
        log_csv_backup_success(instance, date, dev_bucket)

    return return_status
def verify_csv_instance_backup(instance, date, dev_bucket=False):
    """ Verify that an instance has been backed up to hive

    Args:
    instance - The instance to inspect for backups being done
    date - The date to search for
    dev_bucket - Check the dev bucket?

    Returns:
    True for no problems found, False otherwise.
    """
    if csv_backup_success_logged(instance, date, dev_bucket):
        print('Per csv backup success log, backup has already been '
              'verified')
        return True

    if early_verification(date, instance):
        return True

    # We might be looking at an instance that is part of a sharded system; if
    # so we will only look at what DBs are supposed to exist on the instance
    # otherwise, we will check all DBs. Note, we only set the success flag
    # for unsharded systems.
    zk = host_utils.MysqlZookeeper()
    replica_set = zk.get_replica_set_from_instance(instance)
    shards = zk.get_shards_by_replica_set()[replica_set]

    if not shards or is_sharded_but_not_sharded(replica_set):
        return_status = verify_unsharded_csv_backups(instance, date,
                                                     dev_bucket)
    else:
        return_status = True

        # Group the shards of this instance by their shard type.
        shards_by_type = dict()
        for shard in shards:
            (s, ns, _) = environment_specific.deconstruct_shard_name(shard)
            shards_by_type.setdefault(''.join([s, ns]), set()).add(shard)

        missing_uploads = set()
        for shard_type in shards_by_type:
            type_shards = shards_by_type[shard_type]
            # Any shard of the type will do to find the db to inspect.
            example_shard = next(iter(type_shards))
            (_, db) = zk.map_shard_to_replica_and_db(example_shard)
            table_list = []
            for tbl in mysql_lib.get_tables(instance, db, True):
                table_list.append('{}.{}'.format(db, tbl))
            table_tuples = backup.filter_tables_to_csv_backup(instance,
                                                              table_list)
            check_result = verify_sharded_csv_backup_by_shards(
                type_shards, table_tuples, date, dev_bucket)
            missing_uploads.update(check_result[1])

        if missing_uploads:
            if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
                print('Instance {} is missing uploads:'.format(instance))
                pprint.pprint(missing_uploads)
            else:
                print('Instance {instance} is missing {num} uploads'
                      ''.format(num=len(missing_uploads),
                                instance=instance))
            return_status = False

    if return_status:
        print('Instance {} is backed up'.format(instance))

    return return_status
def verify_sharded_csv_backup_by_shard_type(shard_type, date,
                                            dev_bucket=False):
    """ Verify that a sharded data set has been backed up to hive

    Success keys are created in the bucket for tables whose uploads are
    finished and that do not have one yet.

    Args:
    shard_type - i.e. 'sharddb', etc
    date - The date to search for
    dev_bucket - Look in the dev bucket

    Returns:
    True for no problems found, False otherwise.

    Raises:
    Exception if there are no tables or no shards to check
    """
    def open_bucket():
        # Connect to S3 and hand back the bucket name along with the bucket.
        s3 = boto.connect_s3()
        if dev_bucket:
            name = environment_specific.S3_CSV_BUCKET_DEV
        else:
            name = environment_specific.S3_CSV_BUCKET
        return name, s3.get_bucket(name, validate=False)

    def ensure_success_key(bucket, bucket_name, success_path):
        # Write the success marker unless it is already there.
        if bucket.get_key(success_path):
            return
        print('Creating success key {b}/{k}'.format(b=bucket_name,
                                                    k=success_path))
        key = bucket.new_key(success_path)
        key.set_contents_from_string(' ')

    zk = host_utils.MysqlZookeeper()
    (replica_set, db) = \
        zk.get_example_db_and_replica_set_for_shard_type(shard_type)
    schema_host = zk.get_mysql_instance_from_replica_set(
        replica_set, repl_type=host_utils.REPLICA_ROLE_SLAVE)

    table_list = []
    for tbl_name in mysql_lib.get_tables(schema_host, db, True):
        table_list.append('{}.{}'.format(db, tbl_name))
    table_tuples = backup.filter_tables_to_csv_backup(schema_host,
                                                      table_list)
    if not table_tuples:
        raise Exception('No tables will be checked for backups')

    bare_table_names = [x[0].split('.')[1] for x in table_tuples]
    verify_csv_schema_upload(schema_host, db, bare_table_names,
                             date, dev_bucket)

    shards = zk.get_shards_by_shard_type(shard_type)
    if not shards:
        raise Exception('No shards will be checked for backups')

    (finished_uploads, missing_uploads) = \
        verify_sharded_csv_backup_by_shards(shards, table_tuples,
                                            date, dev_bucket)

    if finished_uploads:
        bucket_name, bucket = open_bucket()
        for tbl in finished_uploads:
            (_, _, success_path) = backup.get_csv_backup_paths(
                schema_host, db, tbl, date)
            ensure_success_key(bucket, bucket_name, success_path)

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print('Shard type {} is missing uploads:'.format(shard_type))
            pprint.pprint(missing_uploads)
        else:
            print('Shard type {shard_type} is missing {num} uploads'
                  ''.format(num=len(missing_uploads),
                            shard_type=shard_type))
        return False

    # we have checked all shards, all are good, create success files
    # that might not already be present. theoretically, everything here
    # will get picked up by the finished_uploads stanza earlier, but
    # we have this here as a failsafe.
    bucket_name, bucket = open_bucket()
    for t in table_tuples:
        (_, _, success_path) = backup.get_csv_backup_paths(
            schema_host, *t[0].split('.'),
            date=date, partition_number=t[2])
        ensure_success_key(bucket, bucket_name, success_path)

    print('Shard type {} is backed up'.format(shard_type))
    return True