def verify_unsharded_csv_backup(shard_type, date, instance):
    """ Verify that a non-sharded db has been backed up to hive

    Args:
    shard_type - In this case, a hostname prefix
    date - The date to search for
    instance - The actual instance to inspect for backups being done

    Returns True for no problems found, False otherwise.
    """
    if (date == (datetime.datetime.utcnow().date() -
                 datetime.timedelta(days=1)).strftime("%Y-%m-%d")):
        if datetime.datetime.utcnow().time() < CSV_STARTUP:
            print 'Backup startup time has not yet passed'
            # For today's date, don't check anything until after CSV_STARTUP.
            return True

        if datetime.datetime.utcnow().time() < CSV_COMPLETION_TIME:
            # For today's date, until after CSV_COMPLETION_TIME it is good enough
            # to check if backups are running. If they are running, everything
            # is ok. If they are not running, we will do all the normal checks.
            if csv_backups_running(instance):
                print 'Backup running on {i}'.format(i=instance)
                return True

    return_status = True
    for db in mysql_lib.get_dbs(instance):
        (success, _) = \
            verify_csv_schema_upload(shard_type, date, instance, db,
                                     mysql_backup_csv.PATH_DAILY_BACKUP_NONSHARDED_SCHEMA)
        if not success:
            return_status = False

    if not return_status:
        print 'missing schema file'
        # problem with schema, don't bother verifying data
        return return_status

    boto_conn = boto.connect_s3()
    bucket = boto_conn.get_bucket(environment_specific.S3_CSV_BUCKET, validate=False)
    missing_uploads = set()
    for db in mysql_lib.get_dbs(instance):
        for table in mysql_lib.get_tables(instance, db, skip_views=True):
            key = mysql_backup_csv.PATH_DAILY_BACKUP.format(table=table,
                                                            hostname_prefix=shard_type,
                                                            date=date,
                                                            db_name=db)
            if not bucket.get_key(key):
                missing_uploads.add(key)

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print 'Missing uploads: {uploads}'.format(uploads=missing_uploads)
        else:
            print 'Missing {num} uploads'.format(num=len(missing_uploads))
        return False

    return True
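# A hedged usage sketch (not part of the original source): run the verifier for
# yesterday's backups and exit non-zero on failure. The hostname prefix and
# host:port below are hypothetical.
import datetime
import sys

if __name__ == '__main__':
    yesterday = (datetime.datetime.utcnow().date() -
                 datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    example_instance = host_utils.HostAddr('exampledb-001:3306')
    if not verify_unsharded_csv_backup('exampledb', yesterday, example_instance):
        sys.exit(1)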
    def __init__(self,
                 instance,
                 db=None,
                 force_table=None,
                 force_reupload=False):
        """ Init function for backup, takes all args

        Args:
        instance - A hostAddr object of the instance to be backed up
        db - (optional) backup only the specified db
        force_table - (optional) backup only the specified table
        force_reupload - (optional) force reupload of backup
        """
        self.instance = instance
        self.timestamp = datetime.datetime.utcnow()
        # datestamp is for s3 files which are by convention -1 day
        self.datestamp = (self.timestamp -
                          datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        self.dbs_to_backup = multiprocessing.Queue()
        if db:
            self.dbs_to_backup.put(db)
        else:
            for db in mysql_lib.get_dbs(self.instance):
                self.dbs_to_backup.put(db)

        self.force_table = force_table
        self.force_reupload = force_reupload
Example #4
def collectTableStats(db):
    """ Collect table stats

    Args:
    db - a db object
    """
    # First we are going to pull stats aggregated by schema
    # and namespace, if applicable
    global collection_time, last_collection_time
    instance = host_utils.HostAddr(':'.join((socket.gethostname(),
                                             db.port)))
    namespace_dbs_map = dict()
    non_namespace_dbs = set()
    for schema in mysql_lib.get_dbs(instance):
        namespace = get_namespace_from_schema(schema)
        if namespace:
            if namespace not in namespace_dbs_map:
                namespace_dbs_map[namespace] = set()
            namespace_dbs_map[namespace].add(schema)
        else:
            non_namespace_dbs.add(schema)
    for namespace in namespace_dbs_map:
        for row in get_tablestats(db, namespace_dbs_map[namespace]):
            printmetrics_tablestat(db, row, namespace)
    if non_namespace_dbs:
        for row in get_tablestats(db, non_namespace_dbs):
            printmetrics_tablestat(db, row)
    # next we want table stats aggregated by table and namespace.
    for namespace in namespace_dbs_map:
        for row in get_schemastats(db, namespace_dbs_map[namespace]):
            printmetrics_schemastats(db, row, namespace)
    if non_namespace_dbs:
        for row in get_schemastats(db, non_namespace_dbs):
            printmetrics_schemastats(db, row)
    db.query("FLUSH NO_WRITE_TO_BINLOG TABLE_STATISTICS")
    def __init__(self, instance,
                 db=None, force_table=None,
                 force_reupload=False, dev_bucket=False):
        """ Init function for backup, takes all args

        Args:
        instance - A hostAddr object of the instance to be backed up
        db - (optional) backup only the specified db
        force_table - (optional) backup only the specified table
        force_reupload - (optional) force reupload of backup
        dev_bucket - (optional) upload to the development S3 bucket
        """
        self.instance = instance
        self.timestamp = datetime.datetime.utcnow()
        # datestamp is for s3 files which are by convention -1 day
        self.datestamp = (self.timestamp - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        self.dbs_to_backup = multiprocessing.Queue()
        if db:
            self.dbs_to_backup.put(db)
        else:
            for db in mysql_lib.get_dbs(self.instance):
                self.dbs_to_backup.put(db)

        self.force_table = force_table
        self.force_reupload = force_reupload
        if dev_bucket:
            self.upload_bucket = environment_specific.S3_CSV_BUCKET_DEV
        else:
            self.upload_bucket = environment_specific.S3_CSV_BUCKET
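# These __init__ variants appear to belong to the mysql_backup_csv class used
# elsewhere in these examples (mysql_backup_csv.mysql_backup_csv(instance));
# a hedged instantiation sketch, with a hypothetical host and db name:
csv_backup = mysql_backup_csv.mysql_backup_csv(
    host_utils.HostAddr('exampledb-001:3306'),
    db='example_db',
    force_reupload=True,
    dev_bucket=True)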
Example #7
def check_instance_table(hostaddr, table, desired_hash):
    """ Check that a table on a MySQL instance has the expected schema

    Args:
    hostaddr - object describing which mysql instance to connect to
    table - the name of the table to verify
    desired_hash - the md5sum of the desired CREATE TABLE for the table

    Returns:
    A dictionary whose keys are the md5 hashes of the unexpected CREATE TABLE
    statements and whose values are sets of 'hostname:port db' strings
    identifying where the incorrect schema was found.
    """
    ret = dict()
    conn = mysql_lib.connect_mysql(hostaddr)
    for db in mysql_lib.get_dbs(conn):
        definition = mysql_lib.show_create_table(conn, db, table)
        tbl_hash = hashlib.md5(definition).hexdigest()
        if tbl_hash != desired_hash:
            if tbl_hash not in ret:
                ret[tbl_hash] = set()
            ret[tbl_hash].add(''.join((hostaddr.__str__(),
                                       ' ',
                                       db)))
    return ret
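# Hedged usage sketch: derive desired_hash from a reference host's CREATE TABLE
# and report any divergent definitions. The reference host, db and table names
# are hypothetical.
reference_host = host_utils.HostAddr('schemahost-001:3306')
reference_conn = mysql_lib.connect_mysql(reference_host)
reference_def = mysql_lib.show_create_table(reference_conn, 'example_db',
                                            'example_table')
desired_hash = hashlib.md5(reference_def).hexdigest()
mismatches = check_instance_table(reference_host, 'example_table', desired_hash)
for bad_hash in mismatches:
    print 'Unexpected hash {h} on: {loc}'.format(
        h=bad_hash, loc=', '.join(mismatches[bad_hash]))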
Example #8
def get_problem_replicasets(localpath, raw_mapping):
    problem_replica_sets = dict()
    zk = host_utils.MysqlZookeeper()
    replica_set_dbs = get_db_on_replica_set(localpath, raw_mapping)
    for replica_set in replica_set_dbs:
        master = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set, repl_type=host_utils.REPLICA_ROLE_MASTER)
        dbs = mysql_lib.get_dbs(master)
        missing = replica_set_dbs[replica_set].difference(dbs)
        if missing:
            problem_replica_sets[replica_set] = missing
    return problem_replica_sets
def verify_unsharded_csv_backup(shard_type, date, instance):
    """ Verify that a non-sharded db has been backed up to hive

    Args:
    shard_type - In this case, a hostname prefix
    date - The date to search for
    instance - The actual instance to inspect for backups being done

    Returns True for no problems found, False otherwise.
    """
    return_status = True
    boto_conn = boto.connect_s3()
    bucket = boto_conn.get_bucket(environment_specific.S3_CSV_BUCKET,
                                  validate=False)
    missing_uploads = set()
    for db in mysql_lib.get_dbs(instance):
        tables = mysql_backup_csv.mysql_backup_csv(
            instance).get_tables_to_backup(db)
        for table in tables:
            if not verify_csv_schema_upload(shard_type, date, instance, db,
                                            set([table])):
                return_status = False
                print 'Missing schema for {db}.{table}'.format(db=db,
                                                               table=table)
                continue

            (_, data_path, success_path) = \
                environment_specific.get_csv_backup_paths(date, db, table,
                                                          instance.replica_type,
                                                          instance.get_zk_replica_set()[0])
            if not bucket.get_key(data_path):
                missing_uploads.add(data_path)
            else:
                # we still need to create a success file for the data
                # team for this table, even if something else is AWOL
                # later in the backup.
                if bucket.get_key(success_path):
                    print 'Key already exists {key}'.format(key=success_path)
                else:
                    print 'Creating success key {key}'.format(key=success_path)
                    key = bucket.new_key(success_path)
                    key.set_contents_from_string('')

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print 'Missing uploads: {uploads}'.format(uploads=missing_uploads)
        else:
            print 'Missing {num} uploads'.format(num=len(missing_uploads))
        return_status = False

    return return_status
Example #10
def find_shard_mismatches(instance=False):
    """ Find shards that are missing or unexpected in a sharded dataset

    Args:
        instance - If supplied, only check this instance.

    Returns:
        orphaned - A dict of unexpected and (according to table statistics)
                   unused dbs. Key is master instance, value is a set.
        orphaned_but_used - A dict of unexpected but used dbs.
                            Data structure is the same as orphaned.
        missing - A dict of expected but missing dbs.
                  Data structure is the same as orphaned.
    """
    orphaned = dict()
    orphaned_but_used = dict()
    missing_dbs = dict()

    zk = host_utils.MysqlZookeeper()
    rs_dbs_map = zk.get_sharded_dbs_by_replica_set()

    if instance:
        rs = zk.get_replica_set_from_instance(instance)
        rs_dbs_map = {rs: rs_dbs_map[rs]}

    for rs in rs_dbs_map:
        # skip non-sharded replica sets
        if not len(rs_dbs_map[rs]):
            continue

        expected_dbs = rs_dbs_map[rs]
        instance = zk.get_mysql_instance_from_replica_set(rs)

        activity = mysql_lib.get_dbs_activity(instance)
        actual_dbs = mysql_lib.get_dbs(instance)
        unexpected_dbs = actual_dbs.difference(expected_dbs)
        missing = expected_dbs.difference(actual_dbs)
        if missing:
            missing_dbs[instance] = expected_dbs.difference(actual_dbs)

        for db in unexpected_dbs:
            if activity[db]['ROWS_CHANGED'] != 0:
                if instance not in orphaned_but_used:
                    orphaned_but_used[instance] = set()
                orphaned_but_used[instance].add(db)
            else:
                if instance not in orphaned:
                    orphaned[instance] = set()
                orphaned[instance].add(db)

    return orphaned, orphaned_but_used, missing_dbs
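# Illustrative sketch only: run the mismatch check across all replica sets and
# print anything unexpected; the exit code reflects whether in-use orphans
# were found.
import sys

orphaned, orphaned_but_used, missing = find_shard_mismatches()
for inst in missing:
    print 'Missing dbs on {i}: {dbs}'.format(i=inst,
                                             dbs=', '.join(missing[inst]))
for inst in orphaned_but_used:
    print 'Orphaned but in-use dbs on {i}: {dbs}'.format(
        i=inst, dbs=', '.join(orphaned_but_used[inst]))
if orphaned_but_used:
    sys.exit(1)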
def find_shard_mismatches(instance=False):
    """ Find shards that are missing or unexpected in modhsarddb and sharddb

    Args:
    instance - If supplied, only check this instance.

    Returns:
    orphaned - A dict of unexpected and (according to table statistics)
               unused shards. Key is master instance, value is a set.
    orphaned_but_used - A dict of unexpected but used shards.
                        Data structure is the same as orphaned.
    missing - A dict of expected but missing shards.
              Data structure is the same as orphaned.

    """
    orphaned = dict()
    orphaned_but_used = dict()
    missing_shards = dict()

    zk = host_utils.MysqlZookeeper()
    host_shard_map = zk.get_host_shard_map()

    if instance:
        new_host_shard_map = dict()
        new_host_shard_map[instance.__str__()] = host_shard_map[
            instance.__str__()]
        host_shard_map = new_host_shard_map

    for master in host_shard_map:
        expected_shards = host_shard_map[master]
        instance = host_utils.HostAddr(master)
        conn = mysql_lib.connect_mysql(instance)
        activity = mysql_lib.get_dbs_activity(conn)
        actual_shards = mysql_lib.get_dbs(conn)
        unexpected_shards = actual_shards.difference(expected_shards)
        missing = expected_shards.difference(actual_shards)
        if missing:
            missing_shards[master] = expected_shards.difference(actual_shards)

        for db in unexpected_shards:
            if activity[db]['ROWS_CHANGED'] != 0:
                if master not in orphaned_but_used:
                    orphaned_but_used[master] = set()
                orphaned_but_used[master].add(db)
            else:
                if master not in orphaned:
                    orphaned[master] = set()
                orphaned[master].add(db)

    return orphaned, orphaned_but_used, missing_shards
def get_all_table_sizes(instance):
    """ Get size of all innodb tables
        NOTE: At this point tables should always be innodb
        NOTE2: file per table should always be on.

    Args:
    instance - A hostAddr object

    Returns a nested dict: db -> table -> partition -> size in MB.
    """
    datadir = host_utils.get_cnf_setting('datadir', instance.port)
    ret = dict()
    for db in mysql_lib.get_dbs(instance):
        ret[db] = dict()
        db_dir = os.path.join(datadir, db)
        for table_path in glob.glob(''.join([db_dir, '/*', INNODB_EXTENSION])):
            (table, partition) = parse_table_file_name(table_path)
            if table not in ret[db]:
                ret[db][table] = dict()
            ret[db][table][partition] = os.stat(table_path).st_size/1024/1024

    return ret
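# Hedged sketch: roll the nested db -> table -> partition sizes returned above
# up to a per-database total in MB. The host below is hypothetical, and since
# the function reads the local datadir, this presumably runs on the database
# host itself.
sizes = get_all_table_sizes(host_utils.HostAddr('exampledb-001:3306'))
for db_name in sizes:
    total_mb = sum(sum(partitions.values())
                   for partitions in sizes[db_name].values())
    print '{db}: {mb} MB'.format(db=db_name, mb=total_mb)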
Example #15
def drop_db_after_rename(instance, dbs=None, dry_run=False):
    """ Drop the original empty db and a non-empty rename db

    Args:
        instance - a hostaddr object
        dbs -  a set of database names
        dry_run - bool, will make no changes to the servers
    """
    if not dbs:
        dbs = set()
        for db in mysql_lib.get_dbs(instance):
            if db.startswith(DB_PREPEND):
                dbs.add(db[len(DB_PREPEND):])

    # confirm db is not in zk and not in use
    orphaned, _, _ = find_shard_mismatches.find_shard_mismatches(instance)
    instance_orphans = orphaned[instance]
    unexpected = dbs.difference(instance_orphans)
    if unexpected:
        raise Exception('Cowardly refusing to act on the following '
                        'dbs: {}'.format(unexpected))

    # make sure the original db is empty
    for db in dbs:
        if mysql_lib.get_tables(instance, db):
            raise Exception('Cowardly refusing to drop non-empty '
                            'db: {}'.format(db))

    for db in dbs:
        renamed_db = ''.join((DB_PREPEND, db))
        if dry_run:
            log.info('dry_run is enabled, not dropping '
                     'dbs: {db} {renamed}'.format(db=db, renamed=renamed_db))
        else:
            mysql_lib.drop_db(instance, db)
            mysql_lib.drop_db(instance, renamed_db)
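# Hedged usage sketch: run the drop with dry_run first so the candidate dbs are
# only logged, then re-run without dry_run once the output has been reviewed.
# The host below is hypothetical.
instance = host_utils.HostAddr('sharddb-0001:3306')
drop_db_after_rename(instance, dry_run=True)
# After reviewing the log output:
# drop_db_after_rename(instance, dry_run=False)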
Example #16
def main():
    description = ("MySQL checksum wrapper\n\n"
                   "Wrapper of pt-table-checksum and pt-table-sync.\n"
                   "Defaults to checksumming 1/{k}th of databases on instance.\n"
                   "If diffs are found, use pt-table-sync to measure actual "
                   "divergence,\nbut only if the number of diffs is between "
                   "--min_diffs and --max_diffs.").format(k=DB_CHECK_FRACTION)

    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i',
                        '--instance',
                        help='Instance to act on if other than localhost:3306',
                        default=''.join((socket.getfqdn(),
                                         ':3306')))
    parser.add_argument('-a',
                        '--all',
                        help='Checksums all dbs rather than the default',
                        action='store_true',
                        default=False)
    parser.add_argument('-d',
                        '--dbs',
                        help=("Comma separated list of db's to check rather "
                              "than the default"),
                        default=False)
    parser.add_argument('-q',
                        '--quiet',
                        help=("Do not print output to stdout"),
                        action='store_true',
                        default=False)
    parser.add_argument('-m',
                        '--min_diffs',
                        help=("Do per-row check if chunk diff count is at "
                              "least this value"),
                        dest='min_diffs',
                        default=MIN_DIFFS)
    parser.add_argument('-M',
                        '--max_diffs',
                        help=("Do not do per-row check if chunk diff count "
                              "is greater than this value"),
                        dest='max_diffs',
                        default=MAX_DIFFS)
    parser.add_argument('-C',
                        '--no_create_table',
                        help=("If test.checksum_detail is missing, do "
                              "not try to create it."),
                        dest='create_table',
                        action='store_false',
                        default=True)
    parser.add_argument('-v',
                        '--verbose',
                        help=("Store raw output from PT tools in the DB?"),
                        action='store_true',
                        default=False)
    parser.add_argument('-c',
                        '--check_fraction',
                        help=('Check this fraction of databases.'),
                        default=DB_CHECK_FRACTION)

    args = parser.parse_args()
    instance = host_utils.HostAddr(args.instance)
    zk = host_utils.MysqlZookeeper()

    if instance not in \
            zk.get_all_mysql_instances_by_type(host_utils.REPLICA_ROLE_MASTER):
        raise Exception("Instance is not a master in ZK")

    # If enabled, try to create the table that holds the checksum info.
    # If not enabled, make sure that the table exists.
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    if not mysql_lib.does_table_exist(conn, mysql_lib.METADATA_DB, CHECKSUM_TBL):
        if args.create_table:
            create_checksum_detail_table(instance)
        else:
            raise Exception("Checksum table not found.  Unable to continue."
                            "Consider not using the -C option or create it "
                            "yourself.")

    # Determine what replica set we belong to and get a list of slaves.
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    slaves = set()
    for rtype in host_utils.REPLICA_ROLE_SLAVE, host_utils.REPLICA_ROLE_DR_SLAVE:
        s = zk.get_mysql_instance_from_replica_set(replica_set, rtype)
        if s:
            slaves.add(s)

    if len(slaves) == 0:
        log.info("This server has no slaves.  Nothing to do.")
        sys.exit(0)

    # before we even start this, make sure replication is OK.
    for slave in slaves:
        slave_conn = mysql_lib.connect_mysql(slave, 'scriptrw')
        ss = mysql_lib.get_slave_status(slave_conn)
        if ss['Slave_SQL_Running'] != "Yes" or ss['Slave_IO_Running'] != "Yes":
            raise Exception("Replication is NOT RUNNING on slave {s}: "
                            "SQL: {st} | IO: {it}".format(st=ss['Slave_SQL_Running'],
                                                          it=ss['Slave_IO_Running']))

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(conn)

        if args.all:
            db_to_check = dbs
        else:
            # default behaviour, check a given DB every N days based on
            # day of year.  minimizes month-boundary issues.
            db_to_check = set()
            check_modulus = int(time.strftime("%j")) % int(args.check_fraction)
            counter = 0
            for db in dbs:
                modulus = counter % int(args.check_fraction)
                if modulus == check_modulus:
                    db_to_check.add(db)
                counter = counter + 1

    # Iterate through the list of DBs and check one table at a time.
    # We do it this way to ensure more coverage in case pt-table-checksum
    # loses its DB connection and errors out before completing a full scan
    # of a given database.
    #
    for db in db_to_check:
        conn = mysql_lib.connect_mysql(instance, 'scriptro')
        tables_to_check = mysql_lib.get_tables(conn, db, skip_views=True)
        for tbl in tables_to_check:
            c_cmd, c_out, c_err, c_ret = checksum_tbl(instance, db, tbl)
            if not args.quiet:
                log.info("Checksum command executed was:\n{cmd}".format(cmd=c_cmd))
                log.info("Standard out:\n{out}".format(out=c_out))
                log.info("Standard error:\n{err}".format(err=c_err))
                log.info("Return code: {ret}".format(ret=c_ret))

            # parse each line of STDOUT (there should only be one with
            # actual data).  We only care about errors, rows, chunks, and
            # skipped, since we'll need to figure out diffs separately for
            # each slave box.
            for line in c_out.split("\n"):
                results = parse_checksum_row(line)
                if results:
                    chunk_errors = int(results[1])
                    row_count = int(results[3])
                    chunk_count = int(results[4])
                    chunk_skips = int(results[5])

                    for slave in slaves:
                        rows_checked = 'NO'
                        sync_cmd = ""
                        sync_out = ""
                        sync_err = ""
                        sync_ret = -1
                        row_diffs = 0

                        elapsed_time_ms,\
                            chunk_diffs = check_one_replica(slave,
                                                            db, tbl)

                        # if we skipped some chunks or there were errors,
                        # this means we can't have complete information about the
                        # state of the replica. in the case of a hard error,
                        # we'll just stop.  in the case of a skipped chunk, we will
                        # treat it as a different chunk for purposes of deciding
                        # whether or not to do a more detailed analysis.
                        #
                        checkable_chunks = chunk_skips + chunk_diffs

                        if chunk_errors > 0:
                            checksum_status = 'ERRORS_IN_CHECKSUM_PROCESS'
                        elif checkable_chunks == 0:
                            checksum_status = 'GOOD'
                        else:
                            if checkable_chunks > int(args.max_diffs):
                                # too many chunk diffs, don't bother checking
                                # further.  not good.
                                checksum_status = 'TOO_MANY_CHUNK_DIFFS'
                            elif checkable_chunks < int(args.min_diffs):
                                # some diffs, but not enough that we care.
                                checksum_status = 'CHUNK_DIFFS_FOUND_BUT_OK'
                            else:
                                start_time = int(time.time()*1000)
                                rows_checked = 'YES'

                                # set the proper status - did we do a sync-based check
                                # because of explicit diffs or because of skipped chunks?
                                if chunk_diffs > 0:
                                    checksum_status = 'ROW_DIFFS_FOUND'
                                else:
                                    checksum_status = 'CHUNKS_WERE_SKIPPED'

                                sync_cmd, sync_out, sync_err, sync_ret, \
                                    row_diffs = checksum_tbl_via_sync(slave,
                                                                      db,
                                                                      tbl)

                                # Add in the time it took to do the sync.
                                elapsed_time_ms += int(time.time()*1000) - start_time

                                if not args.quiet:
                                    log.info("Sync command executed was:\n{cmd} ".format(cmd=sync_cmd))
                                    log.info("Standard out:\n {out}".format(out=sync_out))
                                    log.info("Standard error:\n {err}".format(err=sync_err))
                                    log.info("Return code: {ret}".format(ret=sync_ret))
                                    log.info("Row diffs found: {cnt}".format(cnt=row_diffs))

                        # Checksum process is complete, store the results.
                        #
                        data = {'instance': slave,
                                'master_instance': instance,
                                'db': db,
                                'tbl': tbl,
                                'elapsed_time_ms': elapsed_time_ms,
                                'chunk_count': chunk_count,
                                'chunk_errors': chunk_errors,
                                'chunk_diffs': chunk_diffs,
                                'chunk_skips': chunk_skips,
                                'row_count': row_count,
                                'row_diffs': row_diffs,
                                'rows_checked': rows_checked,
                                'checksum_status': checksum_status,
                                'checksum_cmd': None,
                                'checksum_stdout': None,
                                'checksum_stderr': None,
                                'checksum_rc': c_ret,
                                'sync_cmd': None,
                                'sync_stdout': None,
                                'sync_stderr': None,
                                'sync_rc': sync_ret}

                        if args.verbose:
                            data.update({'checksum_cmd': c_cmd,
                                         'checksum_stdout': c_out,
                                         'checksum_stderr': c_err,
                                         'sync_cmd': sync_cmd,
                                         'sync_stdout': sync_out,
                                         'sync_stderr': sync_err,
                                         'sync_rc': sync_ret})

                        write_checksum_status(instance, data)

        conn.close()
def verify_flexsharded_csv_backup(shard_type, date, instance=None):
    """ Verify that a flexsharded data set has been backed up to hive

    Args:
    shard_type - e.g. 'commercefeeddb'
    date - The date to search for
    instance - Restrict the search to problems on a single instance

    Returns True for no problems found, False otherwise.
    """
    success = True
    replica_sets = set()
    zk = host_utils.MysqlZookeeper()
    if instance:
        replica_sets.add(zk.get_replica_set_from_instance(instance)[0])
    else:
        for replica_set in zk.get_all_mysql_replica_sets():
            if replica_set.startswith(
                    environment_specific.FLEXSHARD_DBS[shard_type]
                ['zk_prefix']):
                replica_sets.add(replica_set)

    schema_host = zk.get_mysql_instance_from_replica_set(
        environment_specific.FLEXSHARD_DBS[shard_type]
        ['example_shard_replica_set'],
        repl_type=host_utils.REPLICA_ROLE_SLAVE)

    boto_conn = boto.connect_s3()
    bucket = boto_conn.get_bucket(environment_specific.S3_CSV_BUCKET,
                                  validate=False)
    missing_uploads = set()

    for db in mysql_lib.get_dbs(schema_host):
        for table in mysql_backup_csv.mysql_backup_csv(
                schema_host).get_tables_to_backup(db):
            if not verify_csv_schema_upload(shard_type, date, schema_host, db,
                                            [table]):
                success = False
                continue

            table_missing_uploads = set()
            for replica_set in replica_sets:
                chk_instance = zk.get_mysql_instance_from_replica_set(
                    replica_set)
                (_, data_path,
                 success_path) = environment_specific.get_csv_backup_paths(
                     date, db, table, chk_instance.replica_type,
                     chk_instance.get_zk_replica_set()[0])
                if not bucket.get_key(data_path):
                    table_missing_uploads.add(data_path)
                    success = False

            if not table_missing_uploads and not instance:
                if not bucket.get_key(success_path):
                    print 'Creating success key {key}'.format(key=success_path)
                    key = bucket.new_key(success_path)
                    key.set_contents_from_string('')

            missing_uploads.update(table_missing_uploads)

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print('Shard type {shard_type} is missing uploads:'
                  ''.format(shard_type=shard_type))
            pprint.pprint(missing_uploads)
        else:
            print('Shard type {shard_type} is missing {num} uploads'
                  ''.format(num=len(missing_uploads), shard_type=shard_type))

    if not missing_uploads and not instance and success:
        print 'Shard type {shard_type} is backed up'.format(
            shard_type=shard_type)

    return success
def start_shard_migration(source_replica_set, destination_replica_set,
                          mig_dbs):
    """ Move shards from one replica set to another

    Args:
    source_replica_set - Which replica set to take the shards from
    destination_replica_set - Which replica set to put the shards on
    mig_dbs - A set of databases to be migrated
    """
    # In 2017Q1 sharddb and modsharddb will learn how to deal with shard
    # migrations. We will block them for now.
    if source_replica_set.startswith('db') or \
            source_replica_set.startswith('moddb'):
        raise Exception('Sharddb and modsharddb migrations are not yet '
                        'supported')

    if source_replica_set == destination_replica_set:
        raise Exception('Source and destination can not be the same!')
    # Dealing with failures, potentially due to failovers, seems scary
    # here. We are intentionally not catching exceptions, as this seems racy
    # and it would be far better for the entire process to fail than to mess
    # with replication during a failover.
    log.info('Requested to migrate from {s} to {d} databases: {db}'
             ''.format(s=source_replica_set,
                       d=destination_replica_set,
                       db=', '.join(mig_dbs)))

    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(source_replica_set)
    source_slave = zk.get_mysql_instance_from_replica_set(
        source_replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)

    if not source_slave:
        source_slave = zk.get_mysql_instance_from_replica_set(
            source_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    log.info('Source host for dumping data {}'.format(source_slave))
    destination_master = zk.get_mysql_instance_from_replica_set(
            destination_replica_set)
    log.info('Destination host for restoring data {}'
             ''.format(destination_master))

    expected_dbs_on_source = zk.get_sharded_dbs_by_replica_set()[source_replica_set]
    non_mig_dbs = mysql_lib.get_dbs(source_slave).difference(mig_dbs)
    unexpected_dbs = mig_dbs.difference(expected_dbs_on_source)
    if unexpected_dbs:
        raise Exception('Unexpected database supplied for migration: {}'
                        ''.format(unexpected_dbs))

    # Make sure there are no missing or extra shards
    precheck_schema(source_master)
    precheck_schema(destination_master)

    # Check disk space
    required_disk_space = get_required_disk_space(mig_dbs, source_master)
    available_disk_space = disk_space_available_for_migration(destination_master)
    if available_disk_space < required_disk_space:
        raise Exception('Insufficient disk space to migrate, '
                        'available {a}MB, '
                        'required {r}MB'
                        ''.format(a=available_disk_space,
                                  r=required_disk_space))
    else:
        log.info('Disk space looks ok: '
                 'available {a}MB, '
                 'required {r}MB'
                 ''.format(a=available_disk_space,
                           r=required_disk_space))

    # Let's take out a lock to make sure we don't have multiple migrations
    # running on the same replica sets (either source or destination).
    lock_id = take_migration_lock(source_replica_set, destination_replica_set,
                                  mig_dbs, non_mig_dbs)
    try:
        if non_mig_dbs:
            # First we will dump the schema for the shards that are not moving
            log.info('Backing up non-migrating schema: {}'.format(non_mig_dbs))
            no_mig_backup = backup.logical_backup_instance(
                                            source_slave, time.localtime(),
                                            blackhole=True, databases=non_mig_dbs)

        time.sleep(1)
        # And next the metadata db
        log.info('Backing up metadata db: {}'.format(mysql_lib.METADATA_DB))
        metadata_backup = backup.logical_backup_instance(
                                        source_slave, time.localtime(),
                                        databases=[mysql_lib.METADATA_DB])

        time.sleep(1)
        # Next we will backup the data for the shards that are moving
        log.info('Backing up migrating schema data: {}'.format(mig_dbs))
        mig_backup = backup.logical_backup_instance(
                                       source_slave, time.localtime(),
                                       databases=mig_dbs)
    except:
        finish_migration_log(lock_id, STATUS_EXPORT_FAILED)
        raise

    if non_mig_dbs:
        # Finally import the backups
        log.info('Importing all the blackhole tables')
        mysql_restore.logical_restore(no_mig_backup, destination_master)

    log.info('Import metadata')
    mysql_restore.logical_restore(metadata_backup, destination_master)

    log.info('Setting up replication')
    mysql_lib.change_master(destination_master, source_master,
                            'BOGUS', 0, no_start=True, skip_set_readonly=True,
                            gtid_auto_pos=False)
    mysql_restore.logical_restore(mig_backup, destination_master)

    # add start slave, catchup
    mysql_lib.start_replication(destination_master)
    mysql_lib.wait_for_catch_up(destination_master, migration=True)

    # And update the log/locks
    update_migration_status(lock_id, STATUS_FAILOVER_READY)
    log.info('The migration is ready to be finished by running:')
    log.info('/usr/local/bin/mysql_utils/finish_shard_migration.py {src}'
             ''.format(src=source_replica_set))
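# Hedged usage sketch: request a migration of two shard dbs between replica
# sets; every name below is hypothetical. The log output above points at
# finish_shard_migration.py for the follow-up step.
start_shard_migration('exampleshard-0012', 'exampleshard-0013',
                      set(['example_shard_0120', 'example_shard_0121']))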
Example #19
def verify_flexsharded_csv_backup(shard_type, date, dev_bucket=False):
    """ Verify that a flexsharded data set has been backed up to hive

    Args:
        shard_type - e.g. 'commercefeeddb'
        date - The date to search for
        dev_bucket - Look in the dev bucket?

    Returns:
        True for no problems found, False otherwise.
    """
    success = True
    replica_sets = set()
    zk = host_utils.MysqlZookeeper()

    # Figure out what replica sets to check based on a prefix
    for replica_set in zk.get_all_mysql_replica_sets():
        if replica_set.startswith(
                environment_specific.FLEXSHARD_DBS[shard_type]['zk_prefix']):
            replica_sets.add(replica_set)

    # Example schema host
    schema_host = zk.get_mysql_instance_from_replica_set(
        environment_specific.FLEXSHARD_DBS[shard_type]
        ['example_shard_replica_set'],
        repl_type=host_utils.REPLICA_ROLE_SLAVE)

    boto_conn = boto.connect_s3()
    bucket_name = environment_specific.S3_CSV_BUCKET_DEV if dev_bucket \
                    else environment_specific.S3_CSV_BUCKET
    bucket = boto_conn.get_bucket(bucket_name, validate=False)
    missing_uploads = set()

    for db in mysql_lib.get_dbs(schema_host):
        table_list = [
            '{}.{}'.format(db, x)
            for x in mysql_lib.get_tables(schema_host, db, True)
        ]
        table_tuples = backup.filter_tables_to_csv_backup(
            schema_host, table_list)

        for t in table_tuples:
            try:
                verify_csv_schema_upload(schema_host,
                                         db, [t[0].split('.')[1]],
                                         date=date,
                                         dev_bucket=dev_bucket)
            except:
                continue

            table_missing_uploads = set()
            for replica_set in replica_sets:
                chk_instance = zk.get_mysql_instance_from_replica_set(
                    replica_set)
                (_, data_path, success_path) = backup.get_csv_backup_paths(
                    chk_instance,
                    db,
                    t[0].split('.')[1],
                    date=date,
                    partition_number=t[2])

                k = bucket.get_key(data_path)
                if k is None:
                    table_missing_uploads.add(data_path)
                    success = False
                elif k.size == 0:
                    # we should not have zero-length files, because even if
                    # we send zero bytes to lzop, there's a 55-byte header.
                    # so, if this actually happened, it probably means that
                    # something is wrong.  delete the key and add it to the
                    # missing_uploads list so that we'll try again.
                    k.delete()
                    table_missing_uploads.add(data_path)
                    success = False

            if not table_missing_uploads and not bucket.get_key(success_path):
                print 'Creating success key {b}/{k}'.format(b=bucket_name,
                                                            k=success_path)
                key = bucket.new_key(success_path)
                key.set_contents_from_string(' ')

            missing_uploads.update(table_missing_uploads)

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print('Shard type {} is missing uploads:'.format(shard_type))
            pprint.pprint(missing_uploads)
        else:
            print('Shard type {shard_type} is missing {num} uploads'
                  ''.format(num=len(missing_uploads), shard_type=shard_type))

    if not missing_uploads and success:
        print 'Shard type {} is backed up'.format(shard_type)

    return success
Example #20
def verify_unsharded_csv_backups(instance, date, dev_bucket=False):
    """ Verify csv backups for an instance which is not part of a sharded
        system

    Args:
        instance - The instance to inspect for backups being done
        date - The date to search for
        dev_bucket - Use the dev bucket?

    Returns:
        True for no problems found, False otherwise.
    """
    return_status = True
    boto_conn = boto.connect_s3()
    bucket_name = environment_specific.S3_CSV_BUCKET_DEV if dev_bucket \
                    else environment_specific.S3_CSV_BUCKET
    bucket = boto_conn.get_bucket(bucket_name, validate=False)
    missing_uploads = set()
    for db in mysql_lib.get_dbs(instance):
        table_list = [
            '{}.{}'.format(db, x)
            for x in mysql_lib.get_tables(instance, db, True)
        ]
        table_tuples = backup.filter_tables_to_csv_backup(instance, table_list)
        try:
            verify_csv_schema_upload(
                instance, db, [x[0].split('.')[1] for x in table_tuples], date,
                dev_bucket)
        except Exception as e:
            print e
            return_status = False
            continue

        table_names = [x[0] for x in table_tuples]
        expected_partitions = dict(
            (x, table_names.count(x)) for x in table_names)
        found_partitions = dict()

        for t in table_tuples:
            (_, data_path, success_path) = \
                backup.get_csv_backup_paths(instance, *t[0].split('.'), date=date,
                                            partition_number=t[2])
            k = bucket.get_key(data_path)
            if k is None:
                missing_uploads.add(data_path)
            elif k.size == 0:
                # we should not have zero-length files, because even if
                # we send zero bytes to lzop, there's a 55-byte header.
                # so, if this actually happened, it probably means that
                # something is wrong.  delete the key and add it to the
                # missing_uploads list so that we'll try again.
                k.delete()
                missing_uploads.add(data_path)
            else:
                found_partitions[t[0]] = 1 + found_partitions.get(t[0], 0)

            # We still need to create a success file for the data
            # team for this table, even if something else is AWOL
            # later in the backup.
            s_key = bucket.get_key(success_path)
            if s_key:
                if found_partitions.get(t[0], 0) < expected_partitions[t[0]]:
                    print('Success key {b}/{k} exists but it should '
                          'not - deleting it!'.format(b=bucket_name,
                                                      k=success_path))
                    s_key.delete()
                elif found_partitions.get(t[0],
                                          0) == expected_partitions[t[0]]:
                    print 'Success key {b}/{k} exists!'.format(b=bucket_name,
                                                               k=success_path)
            elif found_partitions.get(t[0], 0) == expected_partitions[t[0]]:
                print 'Creating success key {b}/{k}'.format(b=bucket_name,
                                                            k=success_path)
                key = bucket.new_key(success_path)
                key.set_contents_from_string(' ')

    if missing_uploads:
        if len(missing_uploads) < MISSING_BACKUP_VERBOSE_LIMIT:
            print 'Missing uploads: {}'.format(missing_uploads)
        else:
            print 'Missing {} uploads'.format(len(missing_uploads))
        return_status = False

    if return_status:
        log_csv_backup_success(instance, date, dev_bucket)
    return return_status
Example #21
    except socket.error, (code, msg):
        log.error("Unable to bind socket for checksum on {rs} "
                  "(msg: {m}, code:{c})".format(rs=replica_set, m=msg, c=code))
        sys.exit(code)

    log.info("Locked replica set {} for checksum on this "
             "server".format(replica_set))

    # before we even start this, make sure replication is OK.
    for slave in slaves:
        mysql_lib.assert_replication_sanity(slave)

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(instance)

        if args.all:
            db_to_check = dbs
        else:
            # default behaviour, check a given DB every N days based on
            # day of year.  minimizes month-boundary issues.
            db_to_check = set()
            check_modulus = int(time.strftime("%j")) % int(args.check_fraction)
            counter = 0
            for db in dbs:
                modulus = counter % int(args.check_fraction)
                if modulus == check_modulus:
                    db_to_check.add(db)
                counter = counter + 1