def backup_instance(self):
    """ Back up a replica instance to s3 in csv

    The host backup lock is taken after the replication check and is
    released when the method exits, whether normally or through an
    exception.  The replication SQL thread is stopped only while the
    worker processes are being launched; it is restarted even if
    launching them fails so the replica is not left with replication
    stopped.

    Raises:
    Exception - if all worker processes have exited but
                self.dbs_to_backup is not empty.  Errors raised by the
                helpers called here are not caught and propagate.
    """
    host_lock_handle = None
    try:
        log.info('Backup for instance {i} started at {t}'
                 ''.format(t=str(self.timestamp),
                           i=self.instance))
        log.info('Checking heartbeat to make sure replication is not too '
                 'lagged.')
        self.check_replication_for_backup()

        log.info('Taking host backup lock')
        host_lock_handle = host_utils.take_flock_lock(
            backup.BACKUP_LOCK_FILE)

        log.info('Setting up export directory structure')
        self.setup_and_get_tmp_path()
        log.info('Will temporarily dump inside of {path}'
                 ''.format(path=self.dump_base_path))

        log.info('Releasing any invalid shard backup locks')
        self.ensure_backup_locks_sanity()

        log.info('Deleting old expired locks')
        self.purge_old_expired_locks()

        log.info('Stopping replication SQL thread to get a snapshot')
        mysql_lib.stop_replication(self.instance,
                                   mysql_lib.REPLICATION_THREAD_SQL)

        workers = []
        try:
            # Floor division keeps the count an int on Python 3 as well,
            # and max() guarantees one worker on a single core host
            # (otherwise nothing would ever drain the queue).
            worker_count = max(1, multiprocessing.cpu_count() // 2)
            for _ in range(worker_count):
                proc = multiprocessing.Process(
                    target=self.mysql_backup_csv_dbs)
                proc.daemon = True
                proc.start()
                workers.append(proc)

            # throw in a sleep to make sure all threads have started dumps
            time.sleep(2)
        finally:
            # Always undo the stop above, even when a worker failed to
            # start.
            log.info('Restarting replication')
            mysql_lib.start_replication(self.instance,
                                        mysql_lib.REPLICATION_THREAD_SQL)

        for worker in workers:
            worker.join()

        if not self.dbs_to_backup.empty():
            raise Exception('All worker processes have completed, but '
                            'work remains in the queue')

        log.info('CSV backup is complete, will run a check')
        mysql_backup_status.verify_csv_backup(self.instance.replica_type,
                                              self.datestamp,
                                              self.instance)
    finally:
        if host_lock_handle:
            log.info('Releasing general host backup lock')
            host_utils.release_flock_lock(host_lock_handle)
def backup_instance(self):
    """ Back up a replica instance to s3 in csv

    Order of operations: check replication, take the host backup lock,
    prepare the temporary dump path, clean up shard backup locks, stop
    the replication SQL thread, launch the worker processes, restart
    replication, wait for the workers and finally run the backup check.

    The host backup lock is released on every exit path.  Replication is
    restarted on every exit path once it has been stopped, including
    when starting a worker process raises.

    Raises:
    Exception - if the workers have all finished while
                self.dbs_to_backup still reports pending work.  Errors
                from the helpers called here propagate unchanged.
    """
    host_lock_handle = None
    try:
        log.info('Backup for instance {i} started at {t}'
                 ''.format(t=str(self.timestamp),
                           i=self.instance))
        log.info('Checking heartbeat to make sure replication is not too '
                 'lagged.')
        self.check_replication_for_backup()

        log.info('Taking host backup lock')
        host_lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        log.info('Setting up export directory structure')
        self.setup_and_get_tmp_path()
        log.info('Will temporarily dump inside of {path}'
                 ''.format(path=self.dump_base_path))

        log.info('Releasing any invalid shard backup locks')
        self.ensure_backup_locks_sanity()

        log.info('Deleting old expired locks')
        self.purge_old_expired_locks()

        log.info('Stopping replication SQL thread to get a snapshot')
        mysql_lib.stop_replication(self.instance,
                                   mysql_lib.REPLICATION_THREAD_SQL)

        workers = []
        try:
            # "//" yields an int under both Python 2 and Python 3; the
            # floor of one worker keeps single core hosts from starting
            # no workers at all and then failing the queue check below.
            num_workers = max(1, multiprocessing.cpu_count() // 2)
            for _ in range(num_workers):
                proc = multiprocessing.Process(target=self.mysql_backup_csv_dbs)
                proc.daemon = True
                proc.start()
                workers.append(proc)

            # throw in a sleep to make sure all threads have started dumps
            time.sleep(2)
        finally:
            # Replication must not stay stopped if worker start up failed.
            log.info('Restarting replication')
            mysql_lib.start_replication(self.instance,
                                        mysql_lib.REPLICATION_THREAD_SQL)

        for worker in workers:
            worker.join()

        if not self.dbs_to_backup.empty():
            raise Exception('All worker processes have completed, but '
                            'work remains in the queue')

        log.info('CSV backup is complete, will run a check')
        mysql_backup_status.verify_csv_backup(self.instance.replica_type,
                                              self.datestamp,
                                              self.instance)
    finally:
        if host_lock_handle:
            log.info('Releasing general host backup lock')
            host_utils.release_flock_lock(host_lock_handle)
def backup_instance(self):
    """ Back up a replica instance to s3 in csv

    After the replication check the host lock socket is bound and held
    until the method exits; it is released on failure as well as on
    success.  While the replication SQL thread is stopped, a consistent
    snapshot is started on a dedicated connection and that connection's
    id is stored in self.session_id before the worker processes are
    launched.  Replication is restarted even if the snapshot setup or
    the worker launch fails.

    The snapshot connection is intentionally not closed here.
    NOTE(review): presumably the workers rely on self.session_id
    referring to a live session - confirm before adding a close().

    Raises:
    Exception - if all worker processes have exited but
                self.tables_to_backup or self.tables_to_retry is not
                empty.  Errors from the helpers called here propagate.
    """
    log.info('Backup for instance {i} started at {t}'
             ''.format(t=str(self.timestamp),
                       i=self.instance))
    log.info('Checking heartbeat to make sure replication is not too '
             'lagged.')
    self.check_replication_for_backup()

    log.info('Taking host backup lock')
    host_lock = host_utils.bind_lock_socket(backup.CSV_BACKUP_LOCK_SOCKET)
    try:
        log.info('Setting up export directory structure')
        self.setup_and_get_tmp_path()
        log.info('Will temporarily dump inside of {path}'
                 ''.format(path=self.dump_base_path))

        log.info('Releasing any invalid shard backup locks')
        self.ensure_backup_locks_sanity()

        log.info('Deleting old expired locks')
        self.purge_old_expired_locks()

        log.info('Stopping replication SQL thread to get a snapshot')
        mysql_lib.stop_replication(self.instance,
                                   mysql_lib.REPLICATION_THREAD_SQL)

        workers = []
        try:
            # starting a consistent snapshot here and retrieving the
            # thread ID
            conn = mysql_lib.connect_mysql(self.instance,
                                           backup.USER_ROLE_MYSQLDUMP)
            mysql_lib.start_consistent_snapshot(conn, read_only=True)
            cursor = conn.cursor()
            cursor.execute('SET SESSION wait_timeout=28800')
            cursor.execute("SELECT VARIABLE_VALUE AS conn_id FROM "
                           "INFORMATION_SCHEMA.SESSION_VARIABLES "
                           "WHERE VARIABLE_NAME='pseudo_thread_id'")
            self.session_id = cursor.fetchone()['conn_id']

            # Floor division keeps the count an int on Python 3 as well,
            # and max() guarantees one worker on a single core host.
            worker_count = max(1, multiprocessing.cpu_count() // 2)
            for _ in range(worker_count):
                proc = multiprocessing.Process(
                    target=self.mysql_backup_csv_tables)
                proc.daemon = True
                proc.start()
                workers.append(proc)

            # throw in a sleep to make sure all threads have started dumps
            time.sleep(2)
        finally:
            # Always undo the stop above so a failure does not leave the
            # replica with replication stopped.
            log.info('Restarting replication')
            mysql_lib.start_replication(self.instance,
                                        mysql_lib.REPLICATION_THREAD_SQL)

        for worker in workers:
            worker.join()

        if not (self.tables_to_backup.empty() and
                self.tables_to_retry.empty()):
            raise Exception('All worker processes have completed, but '
                            'work remains in the queue')

        log.info('CSV backup is complete, will run a check')
        self.release_expired_locks()
        mysql_backup_status.verify_csv_instance_backup(self.instance,
                                                       self.datestamp,
                                                       self.dev_bucket)
    finally:
        # Release the host lock on every exit path, not only on success.
        host_utils.release_lock_socket(host_lock)
def add_replica_to_zk(instance, replica_type, dry_run):
    """ Add a replica to zk

    Args:
    instance - A hostaddr object of the replica to add to zk
    replica_type - Either 'slave' or 'dr_slave'.
    dry_run - If set, do not modify zk (and do not stop replication on
              the instance being replaced)

    Raises:
    Exception - if replica_type is invalid, no zk connection could be
                obtained, the master of the instance is not a master in
                zk, or the new config would be identical to the existing
                one.  Any exception is logged and then re-raised.
    """
    try:
        if replica_type not in [host_utils.REPLICA_ROLE_DR_SLAVE,
                                host_utils.REPLICA_ROLE_SLAVE]:
            # format() has to be applied to the message string; calling
            # it on the Exception object raises AttributeError instead of
            # the intended error.
            raise Exception('Invalid value "{}" for argument '
                            'replica_type'.format(replica_type))

        log.info('Instance is {}'.format(instance))
        mysql_lib.assert_replication_sanity(instance)
        mysql_lib.assert_replication_unlagged(
            instance,
            mysql_lib.REPLICATION_TOLERANCE_NORMAL)
        master = mysql_lib.get_master_from_instance(instance)

        zk_local = host_utils.MysqlZookeeper()
        kazoo_client = environment_specific.get_kazoo_client()
        if not kazoo_client:
            raise Exception('Could not get a zk connection')

        if master not in zk_local.get_all_mysql_instances_by_type(
                host_utils.REPLICA_ROLE_MASTER):
            raise Exception('Instance {} is not a master in zk'
                            ''.format(master))

        log.info('Detected master of {instance} '
                 'as {master}'.format(instance=instance,
                                      master=master))

        replica_set = zk_local.get_replica_set_from_instance(master)
        log.info('Detected replica_set as {}'.format(replica_set))
        # Looked up before zk is modified so the instance being replaced
        # can be shut down afterwards.
        old_instance = zk_local.get_mysql_instance_from_replica_set(
            replica_set,
            repl_type=replica_type)

        if replica_type == host_utils.REPLICA_ROLE_SLAVE:
            (zk_node, parsed_data,
             version) = get_zk_node_for_replica_set(kazoo_client,
                                                    replica_set)
            log.info('Replica set {replica_set} is held in zk_node '
                     '{zk_node}'.format(zk_node=zk_node,
                                        replica_set=replica_set))
            log.info('Existing config:')
            log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
            new_data = copy.deepcopy(parsed_data)
            new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]['host'] = \
                instance.hostname
            new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]['port'] = \
                instance.port
            log.info('New config:')
            log.info(pprint.pformat(remove_auth(new_data[replica_set])))

            if new_data == parsed_data:
                raise Exception('No change would be made to zk, '
                                'will not write new config')
            elif dry_run:
                log.info('dry_run is set, therefore not modifying zk')
            else:
                log.info('Pushing new configuration for '
                         '{replica_set}:'.format(replica_set=replica_set))
                # Passing the version read above makes the write
                # conditional on that version.
                kazoo_client.set(zk_node, simplejson.dumps(new_data),
                                 version)
        else:
            # replica_type was validated above, so this is the dr slave
            # case.
            znode_data, dr_meta = kazoo_client.get(environment_specific.DR_ZK)
            parsed_data = simplejson.loads(znode_data)
            new_data = copy.deepcopy(parsed_data)
            if replica_set in parsed_data:
                log.info('Existing dr config:')
                log.info(pprint.pformat(
                    remove_auth(parsed_data[replica_set])))
            else:
                log.info('Replica set did not previously have a dr slave')

            new_data[replica_set] = \
                {host_utils.REPLICA_ROLE_DR_SLAVE: {'host': instance.hostname,
                                                    'port': instance.port}}
            log.info('New dr config:')
            log.info(pprint.pformat(remove_auth(new_data[replica_set])))

            if new_data == parsed_data:
                raise Exception('No change would be made to zk, '
                                'will not write new config')
            elif dry_run:
                log.info('dry_run is set, therefore not modifying zk')
            else:
                log.info('Pushing new dr configuration for '
                         '{replica_set}:'.format(replica_set=replica_set))
                kazoo_client.set(environment_specific.DR_ZK,
                                 simplejson.dumps(new_data),
                                 dr_meta.version)

        if not dry_run:
            log.info('Stopping replication and event scheduler on {} '
                     'being taken out of use'.format(old_instance))
            try:
                mysql_lib.stop_replication(old_instance)
                mysql_lib.stop_event_scheduler(old_instance)
            except Exception:
                # Best effort only: the zk change has already been made.
                # Catching Exception (not a bare except) lets
                # KeyboardInterrupt and SystemExit through.
                log.info('Could not stop replication on {}'
                         ''.format(old_instance))

    except Exception as e:
        log.exception(e)
        raise