Example #1
    def backup_instance(self):
        """ Back up a replica instance to s3 in csv """
        host_lock_handle = None
        try:
            log.info('Backup for instance {i} started at {t}'
                     ''.format(t=str(self.timestamp), i=self.instance))
            log.info('Checking heartbeat to make sure replication is not too '
                     'lagged.')
            self.check_replication_for_backup()

            log.info('Taking host backup lock')
            host_lock_handle = host_utils.take_flock_lock(
                backup.BACKUP_LOCK_FILE)

            log.info('Setting up export directory structure')
            self.setup_and_get_tmp_path()
            log.info('Will temporarily dump inside of {path}'
                     ''.format(path=self.dump_base_path))

            log.info('Releasing any invalid shard backup locks')
            self.ensure_backup_locks_sanity()

            log.info('Deleting old expired locks')
            self.purge_old_expired_locks()

            log.info('Stopping replication SQL thread to get a snapshot')
            mysql_lib.stop_replication(self.instance,
                                       mysql_lib.REPLICATION_THREAD_SQL)

            workers = []
            for _ in range(multiprocessing.cpu_count() // 2):
                proc = multiprocessing.Process(
                    target=self.mysql_backup_csv_dbs)
                proc.daemon = True
                proc.start()
                workers.append(proc)
            # brief sleep to make sure all worker processes have started their dumps
            time.sleep(2)
            log.info('Restarting replication')
            mysql_lib.start_replication(self.instance,
                                        mysql_lib.REPLICATION_THREAD_SQL)

            for worker in workers:
                worker.join()

            if not self.dbs_to_backup.empty():
                raise Exception('All worker processes have completed, but '
                                'work remains in the queue')

            log.info('CSV backup is complete, will run a check')
            mysql_backup_status.verify_csv_backup(self.instance.replica_type,
                                                  self.datestamp,
                                                  self.instance)
        finally:
            if host_lock_handle:
                log.info('Releasing general host backup lock')
                host_utils.release_flock_lock(host_lock_handle)
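
host_utils.take_flock_lock and release_flock_lock are not shown on this page; here is a minimal sketch of what such helpers typically look like, assuming they wrap fcntl.flock on a well-known lock file. The implementation below is illustrative, not the actual host_utils code:

import fcntl

def take_flock_lock(lock_file):
    # Open the lock file and take an exclusive, non-blocking flock on it.
    # The returned handle must stay open for the lock to be held; this
    # raises an error if another process already holds the lock.
    handle = open(lock_file, 'w')
    fcntl.flock(handle.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    return handle

def release_flock_lock(handle):
    # Drop the flock and close the file, letting the next backup proceed.
    fcntl.flock(handle.fileno(), fcntl.LOCK_UN)
    handle.close()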
Example #2
def start_shard_migration(source_replica_set, destination_replica_set,
                          mig_dbs):
    """ Move shards from one replica set to another

    Args:
    source_replica_set - Which replica set to take the shards from
    destination_replica_set - Which replica set to put the shards on
    mig_dbs - A set of databases to be migrated
    """
    # In 2017Q1 sharddb and modsharddb will learn how to deal with shard
    # migrations. We will block them for now.
    if source_replica_set.startswith('db') or \
            source_replica_set.startswith('moddb'):
        raise Exception('Sharddb and modsharddb migrations are not yet '
                        'supported')

    if source_replica_set == destination_replica_set:
        raise Exception('Source and destination cannot be the same!')
    # Dealing with failures, potentially due to failovers seems scary
    # here. We are intentionally not catching exception as this seems racy
    # and it would be far better for the entire process to fail than to mess
    # with replication during a failover.
    log.info('Requested to migrate from {s} to {d} databases: {db}'
             ''.format(s=source_replica_set,
                       d=destination_replica_set,
                       db=', '.join(mig_dbs)))

    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(source_replica_set)
    source_slave = zk.get_mysql_instance_from_replica_set(
        source_replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)

    if not source_slave:
        source_slave = zk.get_mysql_instance_from_replica_set(
            source_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    log.info('Source host for dumping data {}'.format(source_slave))
    destination_master = zk.get_mysql_instance_from_replica_set(
            destination_replica_set)
    log.info('Destination host for restoring data {}'
             ''.format(destination_master))

    expected_dbs_on_source = zk.get_sharded_dbs_by_replica_set()[source_replica_set]
    non_mig_dbs = mysql_lib.get_dbs(source_slave).difference(mig_dbs)
    unexpected_dbs = mig_dbs.difference(expected_dbs_on_source)
    if unexpected_dbs:
        raise Exception('Unexpected database supplied for migration: {}'
                        ''.format(unexpected_dbs))

    # Make sure there are no missing or extra shards
    precheck_schema(source_master)
    precheck_schema(destination_master)

    # Check disk space
    required_disk_space = get_required_disk_space(mig_dbs, source_master)
    available_disk_space = disk_space_available_for_migration(destination_master)
    if available_disk_space < required_disk_space:
        raise Exception('Insufficient disk space to migrate, '
                        'available {a}MB, '
                        'required {r}MB'
                        ''.format(a=available_disk_space,
                                  r=required_disk_space))
    else:
        log.info('Disk space looks ok: '
                 'available {a}MB, '
                 'required {r}MB'
                 ''.format(a=available_disk_space,
                           r=required_disk_space))

    # Let's take out a lock to make sure we don't have multiple migrations
    # running on the same replica sets (either source or destination).
    lock_id = take_migration_lock(source_replica_set, destination_replica_set,
                                  mig_dbs, non_mig_dbs)
    try:
        if non_mig_dbs:
            # First we will dump the schema for the shards that are not moving
            log.info('Backing up non-migrating schema: {}'.format(non_mig_dbs))
            no_mig_backup = backup.logical_backup_instance(
                                            source_slave, time.localtime(),
                                            blackhole=True, databases=non_mig_dbs)

        time.sleep(1)
        # And next the metadata db
        log.info('Backing up metadata db: {}'.format(mysql_lib.METADATA_DB))
        metadata_backup = backup.logical_backup_instance(
                                        source_slave, time.localtime(),
                                        databases=[mysql_lib.METADATA_DB])

        time.sleep(1)
        # Next we will backup the data for the shards that are moving
        log.info('Backing up migrating schema data: {}'.format(mig_dbs))
        mig_backup = backup.logical_backup_instance(
                                       source_slave, time.localtime(),
                                       databases=mig_dbs)
    except:
        finish_migration_log(lock_id, STATUS_EXPORT_FAILED)
        raise

    if non_mig_dbs:
        # Finally import the backups
        log.info('Importing all the blackhole tables')
        mysql_restore.logical_restore(no_mig_backup, destination_master)

    log.info('Import metadata')
    mysql_restore.logical_restore(metadata_backup, destination_master)

    log.info('Setting up replication')
    mysql_lib.change_master(destination_master, source_master,
                            'BOGUS', 0, no_start=True, skip_set_readonly=True,
                            gtid_auto_pos=False)
    mysql_restore.logical_restore(mig_backup, destination_master)

    # add start slave, catchup
    mysql_lib.start_replication(destination_master)
    mysql_lib.wait_for_catch_up(destination_master, migration=True)

    # And update the log/locks
    update_migration_status(lock_id, STATUS_FAILOVER_READY)
    log.info('The migration is ready to be finished by running:')
    log.info('/usr/local/bin/mysql_utils/finish_shard_migration.py {src}'
             ''.format(src=source_replica_set))
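
A hedged usage sketch: the function takes the source and destination replica set names plus a set of shard databases to move. All names below are invented for illustration:

# hypothetical invocation; replica set and database names are made up
start_shard_migration('flockdb001', 'flockdb002',
                      mig_dbs=set(['shard_0001', 'shard_0042']))

On success it logs the finish_shard_migration.py command that completes the failover; if the export step fails, the migration log is marked STATUS_EXPORT_FAILED and the exception is re-raised.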
Example #3

    def backup_instance(self):
        """ Back up a replica instance to s3 in csv """

        log.info('Backup for instance {i} started at {t}'
                 ''.format(t=str(self.timestamp), i=self.instance))
        log.info('Checking heartbeat to make sure replication is not too '
                 'lagged.')
        self.check_replication_for_backup()

        log.info('Taking host backup lock')
        host_lock = host_utils.bind_lock_socket(backup.CSV_BACKUP_LOCK_SOCKET)

        log.info('Setting up export directory structure')
        self.setup_and_get_tmp_path()
        log.info('Will temporarily dump inside of {path}'
                 ''.format(path=self.dump_base_path))

        log.info('Releasing any invalid shard backup locks')
        self.ensure_backup_locks_sanity()

        log.info('Deleting old expired locks')
        self.purge_old_expired_locks()

        log.info('Stopping replication SQL thread to get a snapshot')
        mysql_lib.stop_replication(self.instance,
                                   mysql_lib.REPLICATION_THREAD_SQL)

        # starting a consistent snapshot here and retrieving the thread ID
        conn = mysql_lib.connect_mysql(self.instance,
                                       backup.USER_ROLE_MYSQLDUMP)
        mysql_lib.start_consistent_snapshot(conn, read_only=True)
        cursor = conn.cursor()
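        # wait_timeout is in seconds; 28800 = 8 hours, presumably so the
        # server does not drop this idle snapshot connection while the
        # worker processes dump tables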
        cursor.execute('SET SESSION wait_timeout=28800')
        cursor.execute("SELECT VARIABLE_VALUE AS conn_id FROM "
                       "INFORMATION_SCHEMA.SESSION_VARIABLES "
                       "WHERE VARIABLE_NAME='pseudo_thread_id'")
        self.session_id = cursor.fetchone()['conn_id']

        workers = []
        for _ in range(multiprocessing.cpu_count() // 2):
            proc = multiprocessing.Process(target=self.mysql_backup_csv_tables)
            proc.daemon = True
            proc.start()
            workers.append(proc)

        # brief sleep to make sure all worker processes have started their dumps
        time.sleep(2)
        log.info('Restarting replication')
        mysql_lib.start_replication(self.instance,
                                    mysql_lib.REPLICATION_THREAD_SQL)

        for worker in workers:
            worker.join()

        if not (self.tables_to_backup.empty()
                and self.tables_to_retry.empty()):
            raise Exception('All worker processes have completed, but '
                            'work remains in the queue')

        log.info('CSV backup is complete, will run a check')
        self.release_expired_locks()
        mysql_backup_status.verify_csv_instance_backup(self.instance,
                                                       self.datestamp,
                                                       self.dev_bucket)
        host_utils.release_lock_socket(host_lock)
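
The snapshot trick above is plain MySQL: stop the replication SQL thread, open a transaction WITH CONSISTENT SNAPSHOT so the connection is pinned to a point-in-time view, then restart replication while the dump proceeds against the frozen view. A minimal standalone sketch with MySQLdb; the connection parameters are placeholders, not values from this codebase:

import MySQLdb
import MySQLdb.cursors

# host/user/password are placeholders for this sketch
conn = MySQLdb.connect(host='replica-host', user='backup_user',
                       passwd='backup_pass', db='information_schema',
                       cursorclass=MySQLdb.cursors.DictCursor)
cursor = conn.cursor()
# a consistent snapshot is only meaningful at REPEATABLE READ
cursor.execute('SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ')
cursor.execute('START TRANSACTION WITH CONSISTENT SNAPSHOT')
# from here on, every read on this connection sees the same
# point-in-time data, even as the replica applies new writes
cursor.execute('SELECT COUNT(*) AS c FROM information_schema.tables')
print(cursor.fetchone()['c'])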