def start_zookeeper(zk_ips, keyname):
  """ Creates a monit configuration file and prompts Monit to start ZooKeeper.

  Args:
    zk_ips: A list of zookeeper node IPs to start ZooKeeper on.
    keyname: A string containing the deployment's keyname.
  """
  logging.info("Starting ZooKeeper...")
  # The start command is the same on every node; build it once.
  start_cmd = START_SERVICE_SCRIPT + ZK_WATCH_NAME
  for node_ip in zk_ips:
    try:
      utils.ssh(node_ip, keyname, start_cmd)
    except subprocess.CalledProcessError:
      error_msg = 'Unable to start ZooKeeper on {}'.format(node_ip)
      logging.exception(error_msg)
      raise ZKInternalException(error_msg)

  logging.info('Waiting for ZooKeeper to be ready')
  # Locate zkServer.sh among the known install locations.
  zk_server_cmd = next(
    (script for script in ZK_SERVER_CMD_LOCATIONS if os.path.isfile(script)),
    None)
  if zk_server_cmd is None:
    raise ZKInternalException('Unable to find zkServer.sh')

  # Poll the first node until zkServer.sh reports a running server.
  status_cmd = '{} status'.format(zk_server_cmd)
  while utils.ssh(zk_ips[0], keyname, status_cmd,
                  method=subprocess.call) != 0:
    time.sleep(5)

  logging.info("Successfully started ZooKeeper.")
def start_cassandra(db_ips, db_master, keyname):
  """ Creates a monit configuration file and prompts Monit to start Cassandra.

  Args:
    db_ips: A list of database node IPs to start Cassandra on.
    db_master: The IP address of the DB master.
    keyname: A string containing the deployment's keyname.
  Raises:
    AppScaleDBError if unable to start Cassandra.
  """
  logging.info("Starting Cassandra...")
  # Same monit watch on every node; compute the command once.
  start_cmd = START_SERVICE_SCRIPT + CASSANDRA_WATCH_NAME
  for node_ip in db_ips:
    setup_cmd = '{script} --local-ip {ip} --master-ip {db_master}'.format(
      script=SETUP_CASSANDRA_SCRIPT, ip=node_ip, db_master=db_master)
    try:
      utils.ssh(node_ip, keyname, setup_cmd)
    except subprocess.CalledProcessError:
      error_msg = 'Unable to configure Cassandra on {}'.format(node_ip)
      logging.exception(error_msg)
      raise dbconstants.AppScaleDBError(error_msg)

    try:
      utils.ssh(node_ip, keyname, start_cmd)
    except subprocess.CalledProcessError:
      error_msg = 'Unable to start Cassandra on {}'.format(node_ip)
      logging.exception(error_msg)
      raise dbconstants.AppScaleDBError(error_msg)

  logging.info("Successfully started Cassandra.")
def start_cassandra(db_ips, db_master, keyname):
  """ Creates a monit configuration file and prompts Monit to start Cassandra.

  Args:
    db_ips: A list of database node IPs to start Cassandra on.
    db_master: The IP address of the DB master.
    keyname: A string containing the deployment's keyname.
  Raises:
    AppScaleDBError if unable to start Cassandra.
  """
  logging.info("Starting Cassandra...")
  # Same monit watch on every node; compute the command once.
  start_cmd = START_SERVICE_SCRIPT + CASSANDRA_WATCH_NAME
  for node_ip in db_ips:
    setup_cmd = '{script} --local-ip {ip} --master-ip {db_master}'.format(
      script=SETUP_CASSANDRA_SCRIPT, ip=node_ip, db_master=db_master)
    try:
      utils.ssh(node_ip, keyname, setup_cmd)
    except subprocess.CalledProcessError:
      error_msg = 'Unable to configure Cassandra on {}'.format(node_ip)
      logging.exception(error_msg)
      raise dbconstants.AppScaleDBError(error_msg)

    try:
      utils.ssh(node_ip, keyname, start_cmd)
    except subprocess.CalledProcessError:
      error_msg = 'Unable to start Cassandra on {}'.format(node_ip)
      logging.exception(error_msg)
      raise dbconstants.AppScaleDBError(error_msg)

  logging.info('Waiting for Cassandra to be ready')
  # Poll nodetool on the master until it reports a healthy status.
  status_cmd = '{} status'.format(cassandra_interface.NODE_TOOL)
  while utils.ssh(db_master, keyname, status_cmd,
                  method=subprocess.call) != 0:
    time.sleep(5)

  logging.info("Successfully started Cassandra.")
def stop_cassandra(db_ips, keyname):
  """ Stops Cassandra.

  Args:
    db_ips: A list of database node IPs to stop Cassandra on.
    keyname: A string containing the deployment's keyname.
  """
  logging.info("Stopping Cassandra...")
  # Best effort: a failure on one node is logged but does not abort the rest.
  stop_cmd = STOP_SERVICE_SCRIPT + CASSANDRA_WATCH_NAME
  for node_ip in db_ips:
    try:
      utils.ssh(node_ip, keyname, stop_cmd)
    except subprocess.CalledProcessError:
      logging.error('Unable to stop Cassandra on {}'.format(node_ip))
def stop_zookeeper(zk_ips, keyname):
  """ Stops ZooKeeper.

  Args:
    zk_ips: A list of zookeeper node IPs to stop ZooKeeper on.
    keyname: A string containing the deployment's keyname.
  """
  logging.info("Stopping ZooKeeper...")
  # Best effort: a failure on one node is logged but does not abort the rest.
  stop_cmd = STOP_SERVICE_SCRIPT + ZK_WATCH_NAME
  for node_ip in zk_ips:
    try:
      utils.ssh(node_ip, keyname, stop_cmd)
    except subprocess.CalledProcessError:
      logging.error('Unable to stop ZooKeeper on {}'.format(node_ip))
def backup_data(path, keyname):
  """ Backup Zookeeper data to path.

  Args:
    path: A str, the name of the backup file to be created.
    keyname: A string containing the deployment's keyname.
  Raises:
    BRException if unable to find any ZooKeeper machines.
  """
  logging.info("Starting new zk backup.")

  controller_running = subprocess.call(
    ['service', CONTROLLER_SERVICE, 'status']) == 0
  if not controller_running:
    logging.error('Please start AppScale before backing up ZooKeeper.')
    sys.exit(1)

  zk_ips = appscale_info.get_zk_node_ips()
  if not zk_ips:
    raise BRException('Unable to find any ZooKeeper machines.')

  # Stop ZooKeeper and back up data on only the first ZooKeeper machine.
  # This avoids downtime on deployments with multiple ZooKeeper machines.
  target_ip = zk_ips[0]
  backup_file = '{}/zk_backup_{}.tar.gz'.format(
    BACKUP_DIR_LOCATION, int(time.time()))
  try:
    utils.ssh(target_ip, keyname, 'monit stop -g zookeeper')
    utils.ssh(target_ip, keyname,
              'tar czf {} -C {} .'.format(backup_file, ZK_DATA_DIR))
    utils.scp_from(target_ip, keyname, backup_file, path)
  finally:
    # Always remove the remote archive and bring ZooKeeper back up.
    utils.ssh(target_ip, keyname, 'rm -f {}'.format(backup_file))
    utils.ssh(target_ip, keyname, 'monit start -g zookeeper')
def estimate_total_entities(session, db_master, keyname):
  """ Estimate the total number of entities.

  Args:
    session: A cassandra-driver session.
    db_master: A string containing the IP address of the primary DB node.
    keyname: A string containing the deployment keyname.
  Returns:
    A string containing an entity count.
  Raises:
    AppScaleDBError if unable to get a count.
  """
  query = SimpleStatement(
    'SELECT COUNT(*) FROM "{}"'.format(dbconstants.APP_ENTITY_TABLE),
    consistency_level=ConsistencyLevel.ONE)
  try:
    row_count = session.execute(query)[0].count
    # Each entity occupies one row per schema column.
    return str(row_count / len(dbconstants.APP_ENTITY_SCHEMA))
  except dbconstants.TRANSIENT_CASSANDRA_ERRORS:
    # The count query can time out on large tables; fall back to nodetool's
    # key estimate for the entity table.
    stats_cmd = '{nodetool} cfstats {keyspace}.{table}'.format(
      nodetool=cassandra_interface.NODE_TOOL,
      keyspace=cassandra_interface.KEYSPACE,
      table=dbconstants.APP_ENTITY_TABLE)
    stats = utils.ssh(db_master, keyname, stats_cmd,
                      method=subprocess.check_output)
    matches = [line for line in stats.splitlines()
               if 'Number of keys (estimate)' in line]
    if matches:
      return '{} (estimate)'.format(matches[0].split()[-1])

  raise dbconstants.AppScaleDBError('Unable to estimate total entities.')
def start_zookeeper(zk_ips, keyname):
  """ Creates a monit configuration file and prompts Monit to start ZooKeeper.

  Args:
    zk_ips: A list of zookeeper node IPs to start ZooKeeper on.
    keyname: A string containing the deployment's keyname.
  """
  logging.info("Starting ZooKeeper...")
  # The start command is identical on every node.
  command = START_SERVICE_SCRIPT + ZK_WATCH_NAME
  for zk_ip in zk_ips:
    try:
      utils.ssh(zk_ip, keyname, command)
    except subprocess.CalledProcessError:
      msg = 'Unable to start ZooKeeper on {}'.format(zk_ip)
      logging.exception(msg)
      raise ZKInternalException(msg)
  logging.info("Successfully started ZooKeeper.")
def backup_data(path, keyname):
  """ Backup Cassandra snapshot data directories/files.

  Args:
    path: A string containing the location to store the backup on each of the
      DB machines.
    keyname: A string containing the deployment's keyname.
  Raises:
    BRException if unable to find any Cassandra machines or if DB machine has
      insufficient space.
  """
  logging.info("Starting new db backup.")

  db_ips = appscale_info.get_db_ips()
  if not db_ips:
    raise BRException('Unable to find any Cassandra machines.')

  for db_ip in db_ips:
    # Drop any stale snapshot before taking a fresh one.
    utils.ssh(db_ip, keyname, '{} clearsnapshot'.format(NODE_TOOL))
    utils.ssh(db_ip, keyname, '{} snapshot'.format(NODE_TOOL))

    get_snapshot_size = 'find {0} -name "snapshots" -exec du -s {{}} \;'.\
      format(APPSCALE_DATA_DIR)
    du_output = utils.ssh(db_ip, keyname, get_snapshot_size,
                          method=subprocess.check_output)
    backup_size = 0
    for du_line in du_output.split('\n'):
      if du_line:
        backup_size += int(du_line.split()[0])

    # Verify the destination filesystem has room for the snapshot, with
    # PADDING_PERCENTAGE as head-room.
    output_dir = '/'.join(path.split('/')[:-1]) + '/'
    df_output = utils.ssh(db_ip, keyname, 'df {}'.format(output_dir),
                          method=subprocess.check_output)
    available = int(df_output.split('\n')[1].split()[3])

    if backup_size > available * PADDING_PERCENTAGE:
      raise BRException('{} has insufficient space: {}/{}'.format(
        db_ip, available * PADDING_PERCENTAGE, backup_size))

  cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
  for db_ip in db_ips:
    # Archive snapshot files, stripping the snapshots/<id>/ path component.
    create_tar = 'find . -regex ".*/snapshots/[0-9]*/.*" -exec tar '\
      '--transform="s/snapshots\/[0-9]*\///" -cf {0} {{}} +'.format(path)
    utils.ssh(db_ip, keyname, 'cd {} && {}'.format(cassandra_dir, create_tar))

  logging.info("Done with db backup.")
def backup_data(path, keyname):
  """ Backup Cassandra snapshot data directories/files.

  Args:
    path: A string containing the location to store the backup on each of the
      DB machines.
    keyname: A string containing the deployment's keyname.
  Raises:
    BRException if unable to find any Cassandra machines or if DB machine has
      insufficient space.
  """
  logging.info("Starting new db backup.")

  db_ips = appscale_info.get_db_ips()
  if not db_ips:
    raise BRException('Unable to find any Cassandra machines.')

  # These shell commands are independent of the node; build them once.
  snapshot_size_cmd = 'find {0} -name "snapshots" -exec du -s {{}} \;'.format(
    APPSCALE_DATA_DIR)
  for node_ip in db_ips:
    # Drop any stale snapshot before taking a fresh one.
    utils.ssh(node_ip, keyname, '{} clearsnapshot'.format(NODE_TOOL))
    utils.ssh(node_ip, keyname, '{} snapshot'.format(NODE_TOOL))

    du_output = utils.ssh(node_ip, keyname, snapshot_size_cmd,
                          method=subprocess.check_output)
    backup_size = sum(int(entry.split()[0])
                      for entry in du_output.split('\n') if entry)

    # Verify the destination filesystem has room, with PADDING_PERCENTAGE
    # as head-room.
    output_dir = '/'.join(path.split('/')[:-1]) + '/'
    df_output = utils.ssh(node_ip, keyname, 'df {}'.format(output_dir),
                          method=subprocess.check_output)
    available = int(df_output.split('\n')[1].split()[3])

    if backup_size > available * PADDING_PERCENTAGE:
      raise BRException('{} has insufficient space: {}/{}'.format(
        node_ip, available * PADDING_PERCENTAGE, backup_size))

  cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
  # Archive snapshot files, stripping the snapshots/<id>/ path component.
  tar_cmd = 'find . -regex ".*/snapshots/[0-9]*/.*" -exec tar '\
    '--transform="s/snapshots\/[0-9]*\///" -cf {0} {{}} +'.format(path)
  for node_ip in db_ips:
    utils.ssh(node_ip, keyname, 'cd {} && {}'.format(cassandra_dir, tar_cmd))

  logging.info("Done with db backup.")
def equalize(node1, node2):
  """ Move data from the node with a larger load to the other one.

  Args:
    node1: A dictionary representing a node.
    node2: A dictionary representing a neighbor of node1.
  """
  # Derive the deployment keyname from the first key file present.
  key_files = [entry for entry in os.listdir(KEY_DIRECTORY)
               if entry.endswith('.key')]
  keyname = key_files[0].split('.')[0]

  # Move half of the load difference between the two nodes.
  to_move = abs(node1['load'] - node2['load']) / 2
  mb_to_move = round(to_move / 1024 ** 2, 2)

  if node1['load'] > node2['load']:
    logging.info('Moving {} MiB from {} to {}'.format(
      mb_to_move, node1['ip'], node2['ip']))
    percentile = 100 - int((to_move / node1['load']) * 100)
    new_token = ssh(
      node1['ip'], keyname, 'appscale-get-token {}'.format(percentile),
      method=check_output).strip()
    repair_range = [new_token, node1['token']]
    cleanup_ip = node1['ip']
  else:
    logging.info('Moving {} MiB from {} to {}'.format(
      mb_to_move, node2['ip'], node1['ip']))
    percentile = int((to_move / node2['load']) * 100)
    new_token = ssh(
      node2['ip'], keyname, 'appscale-get-token {}'.format(percentile),
      method=check_output).strip()
    repair_range = [node1['token'], new_token]
    cleanup_ip = node2['ip']

  # Tokens are long; log a 60-character prefix only.
  logging.info('Moving {} to {}'.format(node1['ip'], new_token[:60] + '...'))
  ssh(node1['ip'], keyname, '{} move {}'.format(NODE_TOOL, new_token))

  logging.info('Repairing {} to {}'.format(
    repair_range[0][:60] + '...', repair_range[1][:60] + '...'))
  check_output(
    [NODE_TOOL, 'repair', '-st', repair_range[0], '-et', repair_range[1]])

  logging.info('Cleaning up {}'.format(cleanup_ip))
  ssh(cleanup_ip, keyname, '{} cleanup'.format(NODE_TOOL))
def equalize(node1, node2):
  """ Move data from the node with a larger load to the other one.

  Args:
    node1: A dictionary representing a node.
    node2: A dictionary representing a neighbor of node1.
  """
  # Derive the deployment keyname from the first key file present.
  keynames = [entry.split('.')[0] for entry in os.listdir(KEY_DIRECTORY)
              if entry.endswith('.key')]
  keyname = keynames[0]

  # Move half of the load difference between the two nodes.
  to_move = abs(node1['load'] - node2['load']) / 2
  mb_to_move = round(to_move / 1024 ** 2, 2)

  node1_heavier = node1['load'] > node2['load']
  donor = node1 if node1_heavier else node2
  receiver = node2 if node1_heavier else node1
  logging.info('Moving {} MiB from {} to {}'.format(
    mb_to_move, donor['ip'], receiver['ip']))

  # The percentile (and hence repair range) is direction-dependent.
  if node1_heavier:
    percentile = 100 - int((to_move / node1['load']) * 100)
  else:
    percentile = int((to_move / node2['load']) * 100)

  new_token = ssh(donor['ip'], keyname,
                  'appscale-get-token {}'.format(percentile),
                  method=check_output).strip()

  if node1_heavier:
    repair = [new_token, node1['token']]
  else:
    repair = [node1['token'], new_token]
  cleanup_ip = donor['ip']

  # Tokens are long; log a 60-character prefix only.
  logging.info('Moving {} to {}'.format(node1['ip'], new_token[:60] + '...'))
  ssh(node1['ip'], keyname, '{} move {}'.format(NODE_TOOL, new_token))

  start = repair[0][:60] + '...'
  end = repair[1][:60] + '...'
  logging.info('Repairing {} to {}'.format(start, end))
  check_output([NODE_TOOL, 'repair', '-st', repair[0], '-et', repair[1]])

  logging.info('Cleaning up {}'.format(cleanup_ip))
  ssh(cleanup_ip, keyname, '{} cleanup'.format(NODE_TOOL))
if __name__ == "__main__": logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) parser = init_parser() args = parser.parse_args() status = {'status': 'inProgress', 'message': 'Starting services'} write_to_json_file(status, args.log_postfix) db_access = None zookeeper = None try: # Ensure monit is running. relevant_ips = set(args.zookeeper) | set(args.database) for ip in relevant_ips: utils.ssh(ip, args.keyname, 'service monit start') start_zookeeper(args.zookeeper, args.keyname) conn = KazooClient(hosts=",".join(args.zookeeper)) conn.start() if not conn.exists(ZK_CASSANDRA_CONFIG): conn.create(ZK_CASSANDRA_CONFIG, json.dumps({"num_tokens": 256}), makepath=True) start_cassandra(args.database, args.db_master, args.keyname, args.zookeeper) datastore_upgrade.wait_for_quorum(args.keyname, args.db_master, len(args.database), args.replication) db_access = DatastoreProxy(hosts=args.database) # Exit early if a data layout upgrade is not needed.
def restore_data(path, keyname):
  """ Restores the Zookeeper snapshot.

  Args:
    path: A str, the name of the backup file to restore from.
    keyname: A string containing the deployment's keyname.
  Returns:
    True on completion.
  Raises:
    BRException if unable to find any ZooKeeper machines.
  """
  logging.info("Starting new zk restore.")

  # Refuse to restore while the deployment is running so that live writes do
  # not race the restore.
  running = subprocess.call(['service', CONTROLLER_SERVICE, 'status']) == 0
  if running:
    logging.error('Please stop AppScale before restoring ZooKeeper.')
    sys.exit(1)

  zk_ips = appscale_info.get_zk_node_ips()
  if len(zk_ips) < 1:
    raise BRException('Unable to find any ZooKeeper machines.')

  # Timestamped temp location for the archive on each remote machine.
  timestamp = int(time.time())
  restore_file = '{}/zk_restore_{}.tar.gz'.\
    format(BACKUP_DIR_LOCATION, timestamp)

  # Cache name of ZooKeeper service for each machine.
  zk_service_names = {}
  for zk_ip in zk_ips:
    zk_service_names[zk_ip] = utils.zk_service_name(zk_ip, keyname)

  # Copy restore file to and start ZooKeeper on relevant machines.
  # On failure, clean up the copied archive and stop the service before
  # re-raising.
  logging.info('Copying data to ZooKeeper machines.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.scp_to(zk_ip, keyname, path, restore_file)
      utils.ssh(zk_ip, keyname, 'service {} restart'.format(zk_service))
    except subprocess.CalledProcessError as error:
      logging.exception('Failed to prepare restore on {}'.format(zk_ip))
      utils.ssh(zk_ip, keyname, 'rm -f {}'.format(restore_file))
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      raise error

  # Save deployment-specific data (the ZK_KEEP_PATHS subtrees) from the
  # currently-running ensemble before its data directory is wiped.
  deployment_data = StringIO()
  hosts_template = ':{port},'.join(zk_ips) + ':{port}'
  zk = kazoo.client.KazooClient(hosts=hosts_template.format(
    port=zktransaction.DEFAULT_PORT))
  zk.start()
  for zk_node in ZK_KEEP_PATHS:
    recursive_dump(zk, zk_node, deployment_data)
  zk.stop()

  # Stop ZooKeeper and clear existing data directory.
  logging.info('Clearing existing data on ZooKeeper machines.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      utils.ssh(zk_ip, keyname, 'rm -rf {}/*'.format(ZK_DATA_DIR))
    except subprocess.CalledProcessError as error:
      logging.exception('Unable to clear data on {}'.format(zk_ip))
      deployment_data.close()
      utils.ssh(zk_ip, keyname, 'rm -f {}'.format(restore_file))
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      raise error

  # Restore data and restart ZooKeeper on relevant machines.
  logging.info('Restoring data on ZooKeeper machines.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.ssh(zk_ip, keyname,
                'tar xzf {} -C {}'.format(restore_file, ZK_DATA_DIR))
      utils.ssh(zk_ip, keyname, 'service {} start'.format(zk_service))
    except subprocess.CalledProcessError as error:
      logging.exception('Unable to restore on {}'.format(zk_ip))
      deployment_data.close()
      utils.ssh(zk_ip, keyname, 'rm -f {}'.format(restore_file))
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      raise error

  # Restore deployment-specific data.
  # Flush the preserved subtrees first, then replay the dump taken above.
  # NOTE(review): this client hard-codes port 2181 while the dump above used
  # zktransaction.DEFAULT_PORT — confirm they are the same port.
  logging.info('Restoring deployment-specific data.')
  zk = kazoo.client.KazooClient(hosts=':2181,'.join(zk_ips) + ':2181')
  zk.start()
  for zk_node in ZK_KEEP_PATHS:
    recursive_flush(zk, zk_node)
  deployment_data.seek(0)
  restore_zk(zk, deployment_data)
  zk.stop()

  # Stop ZooKeeper on relevant machines and remove the remote archive.
  logging.info('Stopping ZooKeeper.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      utils.ssh(zk_ip, keyname, 'rm -rf {}'.format(restore_file))
    finally:
      deployment_data.close()

  logging.info("Done with zk restore.")
  return True
def restore_data(path, keyname):
  """ Restores the Zookeeper snapshot.

  Args:
    path: A str, the name of the backup file to restore from.
    keyname: A string containing the deployment's keyname.
  Returns:
    True on completion.
  Raises:
    BRException if unable to find any ZooKeeper machines.
  """
  logging.info("Starting new zk restore.")

  # Refuse to restore while the deployment is running so that live writes do
  # not race the restore.
  running = subprocess.call(['service', CONTROLLER_SERVICE, 'status']) == 0
  if running:
    logging.error('Please stop AppScale before restoring ZooKeeper.')
    sys.exit(1)

  zk_ips = appscale_info.get_zk_node_ips()
  if len(zk_ips) < 1:
    raise BRException('Unable to find any ZooKeeper machines.')

  # Timestamped temp location for the archive on each remote machine.
  timestamp = int(time.time())
  restore_file = '{}/zk_restore_{}.tar.gz'.\
    format(BACKUP_DIR_LOCATION, timestamp)

  # Cache name of ZooKeeper service for each machine.
  zk_service_names = {}
  for zk_ip in zk_ips:
    zk_service_names[zk_ip] = utils.zk_service_name(zk_ip, keyname)

  # Copy restore file to and start ZooKeeper on relevant machines.
  # On failure, clean up the copied archive and stop the service before
  # re-raising.
  logging.info('Copying data to ZooKeeper machines.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.scp_to(zk_ip, keyname, path, restore_file)
      utils.ssh(zk_ip, keyname, 'service {} restart'.format(zk_service))
    except subprocess.CalledProcessError as error:
      logging.exception('Failed to prepare restore on {}'.format(zk_ip))
      utils.ssh(zk_ip, keyname, 'rm -f {}'.format(restore_file))
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      raise error

  # Save deployment-specific data (the ZK_KEEP_PATHS subtrees) from the
  # currently-running ensemble before its data directory is wiped.
  deployment_data = StringIO()
  hosts_template = ':{port},'.join(zk_ips) + ':{port}'
  zk = kazoo.client.KazooClient(
    hosts=hosts_template.format(port=zktransaction.DEFAULT_PORT))
  zk.start()
  for zk_node in ZK_KEEP_PATHS:
    recursive_dump(zk, zk_node, deployment_data)
  zk.stop()

  # Stop ZooKeeper and clear existing data directory.
  logging.info('Clearing existing data on ZooKeeper machines.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      utils.ssh(zk_ip, keyname, 'rm -rf {}/*'.format(ZK_DATA_DIR))
    except subprocess.CalledProcessError as error:
      logging.exception('Unable to clear data on {}'.format(zk_ip))
      deployment_data.close()
      utils.ssh(zk_ip, keyname, 'rm -f {}'.format(restore_file))
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      raise error

  # Restore data and restart ZooKeeper on relevant machines.
  logging.info('Restoring data on ZooKeeper machines.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.ssh(zk_ip, keyname,
                'tar xzf {} -C {}'.format(restore_file, ZK_DATA_DIR))
      utils.ssh(zk_ip, keyname, 'service {} start'.format(zk_service))
    except subprocess.CalledProcessError as error:
      logging.exception('Unable to restore on {}'.format(zk_ip))
      deployment_data.close()
      utils.ssh(zk_ip, keyname, 'rm -f {}'.format(restore_file))
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      raise error

  # Restore deployment-specific data.
  # Flush the preserved subtrees first, then replay the dump taken above.
  # NOTE(review): this client hard-codes port 2181 while the dump above used
  # zktransaction.DEFAULT_PORT — confirm they are the same port.
  logging.info('Restoring deployment-specific data.')
  zk = kazoo.client.KazooClient(hosts=':2181,'.join(zk_ips) + ':2181')
  zk.start()
  for zk_node in ZK_KEEP_PATHS:
    recursive_flush(zk, zk_node)
  deployment_data.seek(0)
  restore_zk(zk, deployment_data)
  zk.stop()

  # Stop ZooKeeper on relevant machines and remove the remote archive.
  logging.info('Stopping ZooKeeper.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      utils.ssh(zk_ip, keyname, 'rm -rf {}'.format(restore_file))
    finally:
      deployment_data.close()

  logging.info("Done with zk restore.")
  return True
def restore_data(path, keyname, force=False):
  """ Restores the Cassandra backup.

  Args:
    path: A string containing the location on each of the DB machines to use
      for restoring data.
    keyname: A string containing the deployment's keyname.
    force: A boolean; if True, skip the confirmation prompt when some
      machines are missing a restore file.
  Raises:
    BRException if unable to find any Cassandra machines or if DB machine has
      insufficient space.
  """
  logging.info("Starting new db restore.")

  db_ips = appscale_info.get_db_ips()
  if not db_ips:
    raise BRException('Unable to find any Cassandra machines.')

  # Identify machines whose restore archive is missing; 'ls' exits non-zero
  # when the path does not exist.
  machines_without_restore = []
  for db_ip in db_ips:
    exit_code = utils.ssh(db_ip, keyname, 'ls {}'.format(path),
                          method=subprocess.call)
    if exit_code != ExitCodes.SUCCESS:
      machines_without_restore.append(db_ip)

  if machines_without_restore and not force:
    logging.info('The following machines do not have a restore file: {}'.
                 format(machines_without_restore))
    response = raw_input('Would you like to continue? [y/N] ')
    if response not in ['Y', 'y']:
      return

  # Stop Cassandra everywhere, retrying 'monit stop' until monit reports the
  # watch as unmonitored or the retry budget runs out.
  for db_ip in db_ips:
    logging.info('Stopping Cassandra on {}'.format(db_ip))
    summary = utils.ssh(db_ip, keyname, 'monit summary',
                        method=subprocess.check_output)
    status = utils.monit_status(summary, CASSANDRA_MONIT_WATCH_NAME)
    retries = SERVICE_STOP_RETRIES
    while status != MonitStates.UNMONITORED:
      utils.ssh(db_ip, keyname,
                'monit stop {}'.format(CASSANDRA_MONIT_WATCH_NAME))
      time.sleep(1)
      summary = utils.ssh(db_ip, keyname, 'monit summary',
                          method=subprocess.check_output)
      status = utils.monit_status(summary, CASSANDRA_MONIT_WATCH_NAME)
      retries -= 1
      if retries < 0:
        raise BRException('Unable to stop Cassandra')

  cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
  for db_ip in db_ips:
    logging.info('Restoring Cassandra data on {}'.format(db_ip))
    # Remove existing data files (.db/.txt/.log) before unpacking the backup.
    clear_db = 'find {0} -regex ".*\.\(db\|txt\|log\)$" -exec rm {{}} \;'.\
      format(cassandra_dir)
    utils.ssh(db_ip, keyname, clear_db)

    # Machines without an archive are simply cleared and restarted.
    if db_ip not in machines_without_restore:
      utils.ssh(db_ip, keyname,
                'tar xf {} -C {}'.format(path, cassandra_dir))

    utils.ssh(db_ip, keyname,
              'monit start {}'.format(CASSANDRA_MONIT_WATCH_NAME))

  logging.info("Done with db restore.")
if __name__ == "__main__": logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) parser = init_parser() args = parser.parse_args() status = {'status': 'inProgress', 'message': 'Starting services'} write_to_json_file(status, args.log_postfix) db_access = None zookeeper = None try: # Ensure monit is running. relevant_ips = set(args.zookeeper) | set(args.database) for ip in relevant_ips: utils.ssh(ip, args.keyname, 'service monit start') start_cassandra(args.database, args.db_master, args.keyname) start_zookeeper(args.zookeeper, args.keyname) datastore_upgrade.wait_for_quorum( args.keyname, len(args.database), args.replication) db_access = datastore_upgrade.get_datastore() # Exit early if a data layout upgrade is not needed. if db_access.valid_data_version(): status = {'status': 'complete', 'message': 'The data layout is valid'} sys.exit() zookeeper = datastore_upgrade.get_zookeeper(args.zookeeper) run_datastore_upgrade(db_access, zookeeper, args.keyname, args.log_postfix) status = {'status': 'complete', 'message': 'Data layout upgrade complete'}
if __name__ == "__main__": logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) parser = init_parser() args = parser.parse_args() status = {"status": "inProgress", "message": "Starting services"} write_to_json_file(status, args.log_postfix) db_access = None zookeeper = None try: # Ensure monit is running. relevant_ips = set(args.zookeeper) | set(args.database) for ip in relevant_ips: utils.ssh(ip, args.keyname, "service monit start") start_cassandra(args.database, args.db_master, args.keyname) start_zookeeper(args.zookeeper, args.keyname) datastore_upgrade.wait_for_quorum(args.keyname, len(args.database), args.replication) db_access = datastore_upgrade.get_datastore() # Exit early if a data layout upgrade is not needed. if db_access.valid_data_version(): status = {"status": "complete", "message": "The data layout is valid"} sys.exit() zookeeper = datastore_upgrade.get_zookeeper(args.zookeeper) try: total_entities = datastore_upgrade.estimate_total_entities(db_access.session, args.db_master, args.keyname) except AppScaleDBError:
def restore_data(path, keyname, force=False):
  """ Restores the Cassandra backup.

  Args:
    path: A string containing the location on each of the DB machines to use
      for restoring data.
    keyname: A string containing the deployment's keyname.
    force: A boolean; if True, skip the confirmation prompt when some
      machines are missing a restore file.
  Raises:
    BRException if unable to find any Cassandra machines or if DB machine has
      insufficient space.
  """
  logging.info("Starting new db restore.")

  db_ips = appscale_info.get_db_ips()
  if not db_ips:
    raise BRException('Unable to find any Cassandra machines.')

  # Identify machines whose restore archive is missing; 'ls' exits non-zero
  # when the path does not exist.
  machines_without_restore = []
  for db_ip in db_ips:
    exit_code = utils.ssh(db_ip, keyname, 'ls {}'.format(path),
                          method=subprocess.call)
    if exit_code != ExitCodes.SUCCESS:
      machines_without_restore.append(db_ip)

  if machines_without_restore and not force:
    logging.info(
      'The following machines do not have a restore file: {}'.format(
        machines_without_restore))
    response = raw_input('Would you like to continue? [y/N] ')
    if response not in ['Y', 'y']:
      return

  # Stop Cassandra everywhere, retrying 'monit stop' until monit reports the
  # watch as unmonitored or the retry budget runs out.
  for db_ip in db_ips:
    logging.info('Stopping Cassandra on {}'.format(db_ip))
    summary = utils.ssh(db_ip, keyname, 'monit summary',
                        method=subprocess.check_output)
    status = utils.monit_status(summary, CASSANDRA_MONIT_WATCH_NAME)
    retries = SERVICE_STOP_RETRIES
    while status != MonitStates.UNMONITORED:
      utils.ssh(db_ip, keyname,
                'monit stop {}'.format(CASSANDRA_MONIT_WATCH_NAME))
      time.sleep(1)
      summary = utils.ssh(db_ip, keyname, 'monit summary',
                          method=subprocess.check_output)
      status = utils.monit_status(summary, CASSANDRA_MONIT_WATCH_NAME)
      retries -= 1
      if retries < 0:
        raise BRException('Unable to stop Cassandra')

  cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
  for db_ip in db_ips:
    logging.info('Restoring Cassandra data on {}'.format(db_ip))
    # Remove existing data files (.db/.txt/.log) before unpacking the backup.
    clear_db = 'find {0} -regex ".*\.\(db\|txt\|log\)$" -exec rm {{}} \;'.\
      format(cassandra_dir)
    utils.ssh(db_ip, keyname, clear_db)

    # Machines without an archive are simply cleared and restarted.
    if db_ip not in machines_without_restore:
      utils.ssh(db_ip, keyname,
                'tar xf {} -C {}'.format(path, cassandra_dir))

    utils.ssh(db_ip, keyname,
              'monit start {}'.format(CASSANDRA_MONIT_WATCH_NAME))

  logging.info("Done with db restore.")