def start_zookeeper(zk_ips, keyname):
  """ Creates a monit configuration file and prompts Monit to start ZooKeeper.

  Args:
    zk_ips: A list of zookeeper node IPs to start ZooKeeper on.
    keyname: A string containing the deployment's keyname.
  """
  logging.info("Starting ZooKeeper...")
  start_cmd = START_SERVICE_SCRIPT + ZK_WATCH_NAME
  for node_ip in zk_ips:
    try:
      ssh(node_ip, keyname, start_cmd)
    except subprocess.CalledProcessError:
      error = 'Unable to start ZooKeeper on {}'.format(node_ip)
      logging.exception(error)
      raise ZKInternalException(error)

  logging.info('Waiting for ZooKeeper to be ready')
  # Locate the first zkServer.sh candidate present on this machine.
  zk_server_cmd = next(
    (candidate for candidate in ZK_SERVER_CMD_LOCATIONS
     if os.path.isfile(candidate)), None)
  if zk_server_cmd is None:
    raise ZKInternalException('Unable to find zkServer.sh')

  # Poll the first ZooKeeper node until the status command reports success.
  status_cmd = '{} status'.format(zk_server_cmd)
  while ssh(zk_ips[0], keyname, status_cmd, method=subprocess.call) != 0:
    time.sleep(5)

  logging.info("Successfully started ZooKeeper.")
def stop_cassandra(db_ips, keyname):
  """ Stops Cassandra.

  Args:
    db_ips: A list of database node IPs to stop Cassandra on.
    keyname: A string containing the deployment's keyname.
  """
  logging.info("Stopping Cassandra...")
  stop_cmd = STOP_SERVICE_SCRIPT + CASSANDRA_WATCH_NAME
  for node_ip in db_ips:
    try:
      ssh(node_ip, keyname, stop_cmd)
    except subprocess.CalledProcessError:
      # Best effort: log the failure and keep stopping the remaining nodes.
      logging.error('Unable to stop Cassandra on {}'.format(node_ip))
def stop_zookeeper(zk_ips, keyname):
  """ Stops ZooKeeper.

  Args:
    zk_ips: A list of zookeeper node IPs to stop ZooKeeper on.
    keyname: A string containing the deployment's keyname.
  """
  logging.info("Stopping ZooKeeper...")
  for node_ip in zk_ips:
    command = STOP_SERVICE_SCRIPT + ZK_WATCH_NAME
    try:
      ssh(node_ip, keyname, command)
    except subprocess.CalledProcessError:
      # A failed stop on one node should not abort the others.
      logging.error('Unable to stop ZooKeeper on {}'.format(node_ip))
def estimate_total_entities(session, db_master, keyname):
  """ Estimate the total number of entities.

  Args:
    session: A cassandra-driver session.
    db_master: A string containing the IP address of the primary DB node.
    keyname: A string containing the deployment keyname.
  Returns:
    A string containing an entity count.
  Raises:
    AppScaleDBError if unable to get a count.
  """
  count_query = SimpleStatement(
    'SELECT COUNT(*) FROM "{}"'.format(dbconstants.APP_ENTITY_TABLE),
    consistency_level=ConsistencyLevel.ONE)
  try:
    row_count = session.execute(count_query)[0].count
    # Raw row count is scaled down by the number of schema columns.
    return str(row_count / len(dbconstants.APP_ENTITY_SCHEMA))
  except dbconstants.TRANSIENT_CASSANDRA_ERRORS:
    # The COUNT query failed; fall back to nodetool's key estimate.
    stats_cmd = '{nodetool} cfstats {keyspace}.{table}'.format(
      nodetool=cassandra_interface.NODE_TOOL,
      keyspace=cassandra_interface.KEYSPACE,
      table=dbconstants.APP_ENTITY_TABLE)
    stats = ssh(db_master, keyname, stats_cmd,
                method=subprocess.check_output)
    for stats_line in stats.splitlines():
      if 'Number of keys (estimate)' in stats_line:
        return '{} (estimate)'.format(stats_line.split()[-1])

  raise dbconstants.AppScaleDBError('Unable to estimate total entities.')
def estimate_total_entities(session, db_master, keyname):
  """ Estimate the total number of entities.

  Args:
    session: A cassandra-driver session.
    db_master: A string containing the IP address of the primary DB node.
    keyname: A string containing the deployment keyname.
  Returns:
    A string containing an entity count.
  Raises:
    AppScaleDBError if unable to get a count.
  """
  query = SimpleStatement('SELECT COUNT(*) FROM "{}"'.format(
    dbconstants.APP_ENTITY_TABLE), consistency_level=ConsistencyLevel.ONE)
  try:
    total_cells = session.execute(query)[0].count
    return str(total_cells / len(dbconstants.APP_ENTITY_SCHEMA))
  except dbconstants.TRANSIENT_CASSANDRA_ERRORS:
    # Counting directly failed; ask nodetool for a key estimate instead.
    stats_cmd = '{nodetool} cfstats {keyspace}.{table}'.format(
      nodetool=cassandra_interface.NODE_TOOL,
      keyspace=cassandra_interface.KEYSPACE,
      table=dbconstants.APP_ENTITY_TABLE)
    stats_output = ssh(db_master, keyname, stats_cmd,
                       method=subprocess.check_output)
    estimate_line = next(
      (line for line in stats_output.splitlines()
       if 'Number of keys (estimate)' in line), None)
    if estimate_line is not None:
      return '{} (estimate)'.format(estimate_line.split()[-1])

  raise dbconstants.AppScaleDBError('Unable to estimate total entities.')
def backup_data(path, keyname):
  """ Backup Cassandra snapshot data directories/files.

  Args:
    path: A string containing the location to store the backup on each of the
      DB machines.
    keyname: A string containing the deployment's keyname.
  Raises:
    BRException if unable to find any Cassandra machines or if DB machine has
      insufficient space.
  """
  logger.info("Starting new db backup.")

  db_ips = appscale_info.get_db_ips()
  if not db_ips:
    raise BRException('Unable to find any Cassandra machines.')

  for db_ip in db_ips:
    # Clear any stale snapshots, then take a fresh one on this node.
    appscale_utils.ssh(db_ip, keyname, '{} clearsnapshot'.format(NODE_TOOL))
    appscale_utils.ssh(db_ip, keyname, '{} snapshot'.format(NODE_TOOL))

    get_snapshot_size = 'find {0} -name "snapshots" -exec du -s {{}} \;'.\
      format(APPSCALE_DATA_DIR)
    du_output = appscale_utils.ssh(db_ip, keyname, get_snapshot_size,
                                   method=subprocess.check_output)
    size_lines = [line for line in du_output.split('\n') if line]
    backup_size = sum(int(size_line.split()[0]) for size_line in size_lines)

    # Verify the destination filesystem has room for the archive.
    output_dir = '/'.join(path.split('/')[:-1]) + '/'
    df_output = appscale_utils.ssh(db_ip, keyname,
                                   'df {}'.format(output_dir),
                                   method=subprocess.check_output)
    available = int(df_output.split('\n')[1].split()[3])
    if backup_size > available * PADDING_PERCENTAGE:
      raise BRException('{} has insufficient space: {}/{}'.format(
        db_ip, available * PADDING_PERCENTAGE, backup_size))

  cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
  for db_ip in db_ips:
    # Archive snapshot contents, stripping the snapshots/<id>/ path prefix.
    create_tar = 'find . -regex ".*/snapshots/[0-9]*/.*" -exec tar '\
      '--transform="s/snapshots\/[0-9]*\///" -cf {0} {{}} +'.format(path)
    appscale_utils.ssh(db_ip, keyname,
                       'cd {} && {}'.format(cassandra_dir, create_tar))

  logger.info("Done with db backup.")
def start_cassandra(db_ips, db_master, keyname, zookeeper_ips):
  """ Creates a monit configuration file and prompts Monit to start Cassandra.

  Args:
    db_ips: A list of database node IPs to start Cassandra on.
    db_master: The IP address of the DB master.
    keyname: A string containing the deployment's keyname.
    zookeeper_ips: The IP addresses of the Zookeeper nodes.
  Raises:
    AppScaleDBError if unable to start Cassandra.
  """
  logging.info("Starting Cassandra...")
  for node_ip in db_ips:
    # Write this node's Cassandra configuration before starting the service.
    init_config = '{script} --local-ip {ip} --master-ip {db_master} ' \
      '--zk-locations {zk_locations}'.format(
        script=SETUP_CASSANDRA_SCRIPT, ip=node_ip, db_master=db_master,
        zk_locations=get_zk_locations_string(zookeeper_ips))
    try:
      ssh(node_ip, keyname, init_config)
    except subprocess.CalledProcessError:
      message = 'Unable to configure Cassandra on {}'.format(node_ip)
      logging.exception(message)
      raise dbconstants.AppScaleDBError(message)

    try:
      ssh(node_ip, keyname, START_SERVICE_SCRIPT + CASSANDRA_WATCH_NAME)
    except subprocess.CalledProcessError:
      message = 'Unable to start Cassandra on {}'.format(node_ip)
      logging.exception(message)
      raise dbconstants.AppScaleDBError(message)

  logging.info('Waiting for Cassandra to be ready')
  # Poll nodetool status on the master until it exits cleanly.
  status_cmd = '{} status'.format(cassandra_interface.NODE_TOOL)
  while ssh(db_master, keyname, status_cmd, method=subprocess.call) != 0:
    time.sleep(5)

  logging.info("Successfully started Cassandra.")
def backup_data(path, keyname):
  """ Backup Cassandra snapshot data directories/files.

  Args:
    path: A string containing the location to store the backup on each of the
      DB machines.
    keyname: A string containing the deployment's keyname.
  Raises:
    BRException if unable to find any Cassandra machines or if DB machine has
      insufficient space.
  """
  logging.info("Starting new db backup.")
  db_ips = appscale_info.get_db_ips()
  if not db_ips:
    raise BRException('Unable to find any Cassandra machines.')

  for node_ip in db_ips:
    # Refresh the snapshot on each node before sizing it.
    appscale_utils.ssh(node_ip, keyname, '{} clearsnapshot'.format(NODE_TOOL))
    appscale_utils.ssh(node_ip, keyname, '{} snapshot'.format(NODE_TOOL))

    get_snapshot_size = 'find {0} -name "snapshots" -exec du -s {{}} \;'.\
      format(APPSCALE_DATA_DIR)
    du_output = appscale_utils.ssh(node_ip, keyname, get_snapshot_size,
                                   method=subprocess.check_output)
    backup_size = 0
    for du_line in du_output.split('\n'):
      if du_line:
        backup_size += int(du_line.split()[0])

    # Check free space in the directory that will hold the archive.
    output_dir = '/'.join(path.split('/')[:-1]) + '/'
    df_output = appscale_utils.ssh(node_ip, keyname,
                                   'df {}'.format(output_dir),
                                   method=subprocess.check_output)
    available = int(df_output.split('\n')[1].split()[3])
    if backup_size > available * PADDING_PERCENTAGE:
      raise BRException('{} has insufficient space: {}/{}'.
                        format(node_ip, available * PADDING_PERCENTAGE,
                               backup_size))

  cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
  for node_ip in db_ips:
    # Pack the snapshot files, dropping the snapshots/<id>/ path component.
    create_tar = 'find . -regex ".*/snapshots/[0-9]*/.*" -exec tar '\
      '--transform="s/snapshots\/[0-9]*\///" -cf {0} {{}} +'.format(path)
    appscale_utils.ssh(node_ip, keyname,
                       'cd {} && {}'.format(cassandra_dir, create_tar))

  logging.info("Done with db backup.")
def equalize(node1, node2):
  """ Move data from the node with a larger load to the other one.

  Args:
    node1: A dictionary representing a node.
    node2: A dictionary representing a neighbor of node1.
  """
  # Derive the deployment keyname from the first key file present.
  key_files = [entry for entry in os.listdir(KEY_DIRECTORY)
               if entry.endswith('.key')]
  keyname = key_files[0].split('.')[0]

  to_move = abs(node1['load'] - node2['load']) / 2
  mb_to_move = round(to_move / 1024**2, 2)

  if node1['load'] > node2['load']:
    logger.info('Moving {} MiB from {} to {}'.format(
      mb_to_move, node1['ip'], node2['ip']))
    load_fraction = int((to_move / node1['load']) * 100)
    percentile = 100 - load_fraction
    new_token = ssh(node1['ip'], keyname,
                    'appscale-get-token {}'.format(percentile),
                    method=check_output).strip()
    repair_range = [new_token, node1['token']]
    node_to_clean = node1['ip']
  else:
    logger.info('Moving {} MiB from {} to {}'.format(
      mb_to_move, node2['ip'], node1['ip']))
    percentile = int((to_move / node2['load']) * 100)
    new_token = ssh(node2['ip'], keyname,
                    'appscale-get-token {}'.format(percentile),
                    method=check_output).strip()
    repair_range = [node1['token'], new_token]
    node_to_clean = node2['ip']

  logger.info('Moving {} to {}'.format(node1['ip'], new_token[:60] + '...'))
  ssh(node1['ip'], keyname, '{} move {}'.format(NODE_TOOL, new_token))

  range_start = repair_range[0][:60] + '...'
  range_end = repair_range[1][:60] + '...'
  logger.info('Repairing {} to {}'.format(range_start, range_end))
  check_output(
    [NODE_TOOL, 'repair', '-st', repair_range[0], '-et', repair_range[1]])

  logger.info('Cleaning up {}'.format(node_to_clean))
  ssh(node_to_clean, keyname, '{} cleanup'.format(NODE_TOOL))
def equalize(node1, node2):
  """ Move data from the node with a larger load to the other one.

  Args:
    node1: A dictionary representing a node.
    node2: A dictionary representing a neighbor of node1.
  """
  # The first .key file in KEY_DIRECTORY names the deployment.
  keys = [key for key in os.listdir(KEY_DIRECTORY) if key.endswith('.key')]
  keyname = keys[0].split('.')[0]

  to_move = abs(node1['load'] - node2['load']) / 2
  mb_to_move = round(to_move / 1024 ** 2, 2)

  node1_is_heavier = node1['load'] > node2['load']
  if node1_is_heavier:
    logging.info('Moving {} MiB from {} to {}'.format(
      mb_to_move, node1['ip'], node2['ip']))
    percentile = 100 - int((to_move / node1['load']) * 100)
    token_cmd = 'appscale-get-token {}'.format(percentile)
    new_token = ssh(node1['ip'], keyname, token_cmd,
                    method=check_output).strip()
    repair = [new_token, node1['token']]
    cleanup_ip = node1['ip']
  else:
    logging.info('Moving {} MiB from {} to {}'.format(
      mb_to_move, node2['ip'], node1['ip']))
    percentile = int((to_move / node2['load']) * 100)
    token_cmd = 'appscale-get-token {}'.format(percentile)
    new_token = ssh(node2['ip'], keyname, token_cmd,
                    method=check_output).strip()
    repair = [node1['token'], new_token]
    cleanup_ip = node2['ip']

  abbreviated_token = new_token[:60] + '...'
  logging.info('Moving {} to {}'.format(node1['ip'], abbreviated_token))
  ssh(node1['ip'], keyname, '{} move {}'.format(NODE_TOOL, new_token))

  logging.info('Repairing {} to {}'.format(repair[0][:60] + '...',
                                           repair[1][:60] + '...'))
  check_output([NODE_TOOL, 'repair', '-st', repair[0], '-et', repair[1]])

  logging.info('Cleaning up {}'.format(cleanup_ip))
  ssh(cleanup_ip, keyname, '{} cleanup'.format(NODE_TOOL))
def restore_data(path, keyname, force=False):
  """ Restores the Cassandra backup.

  Args:
    path: A string containing the location on each of the DB machines to use
      for restoring data.
    keyname: A string containing the deployment's keyname.
    force: A boolean; when True, skip the interactive prompt for machines
      that are missing a restore file.
  Raises:
    BRException if unable to find any Cassandra machines or if DB machine has
      insufficient space.
  """
  logging.info("Starting new db restore.")
  db_ips = appscale_info.get_db_ips()
  if not db_ips:
    raise BRException('Unable to find any Cassandra machines.')

  # Identify nodes missing the restore archive; a non-zero `ls` exit code
  # means the file is absent on that machine.
  machines_without_restore = []
  for db_ip in db_ips:
    exit_code = appscale_utils.ssh(db_ip, keyname, 'ls {}'.format(path),
                                   method=subprocess.call)
    if exit_code != ExitCodes.SUCCESS:
      machines_without_restore.append(db_ip)

  # Unless forced, confirm before restoring a partial set of machines.
  if machines_without_restore and not force:
    logging.info('The following machines do not have a restore file: {}'.
                 format(machines_without_restore))
    response = raw_input('Would you like to continue? [y/N] ')
    if response not in ['Y', 'y']:
      return

  # Stop Cassandra everywhere, retrying until monit reports UNMONITORED or
  # the retry budget is exhausted.
  for db_ip in db_ips:
    logging.info('Stopping Cassandra on {}'.format(db_ip))
    summary = appscale_utils.ssh(db_ip, keyname, 'monit summary',
                                 method=subprocess.check_output)
    status = utils.monit_status(summary, CASSANDRA_MONIT_WATCH_NAME)
    retries = SERVICE_RETRIES
    while status != MonitStates.UNMONITORED:
      appscale_utils.ssh(db_ip, keyname,
                         'monit stop {}'.format(CASSANDRA_MONIT_WATCH_NAME),
                         method=subprocess.call)
      time.sleep(3)
      summary = appscale_utils.ssh(db_ip, keyname, 'monit summary',
                                   method=subprocess.check_output)
      status = utils.monit_status(summary, CASSANDRA_MONIT_WATCH_NAME)
      retries -= 1
      if retries < 0:
        raise BRException('Unable to stop Cassandra')

  cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
  for db_ip in db_ips:
    logging.info('Restoring Cassandra data on {}'.format(db_ip))
    # Remove existing data files before unpacking the archive.
    clear_db = 'find {0} -regex ".*\.\(db\|txt\|log\)$" -exec rm {{}} \;'.\
      format(cassandra_dir)
    appscale_utils.ssh(db_ip, keyname, clear_db)
    # Machines without an archive still get their data cleared and the
    # service restarted; they just have nothing to unpack.
    if db_ip not in machines_without_restore:
      appscale_utils.ssh(db_ip, keyname,
                         'tar xf {} -C {}'.format(path, cassandra_dir))
      appscale_utils.ssh(db_ip, keyname,
                         'chown -R cassandra {}'.format(cassandra_dir))

    # Start Cassandra back up, retrying until monit reports RUNNING.
    logging.info('Starting Cassandra on {}'.format(db_ip))
    retries = SERVICE_RETRIES
    status = MonitStates.UNMONITORED
    while status != MonitStates.RUNNING:
      appscale_utils.ssh(db_ip, keyname,
                         'monit start {}'.format(CASSANDRA_MONIT_WATCH_NAME),
                         method=subprocess.call)
      time.sleep(3)
      summary = appscale_utils.ssh(db_ip, keyname, 'monit summary',
                                   method=subprocess.check_output)
      status = utils.monit_status(summary, CASSANDRA_MONIT_WATCH_NAME)
      retries -= 1
      if retries < 0:
        raise BRException('Unable to start Cassandra')
    appscale_utils.ssh(db_ip, keyname,
                       'monit start {}'.format(CASSANDRA_MONIT_WATCH_NAME))

  # Wait (up to SCHEMA_CHANGE_TIMEOUT) for every node to show as Up/Normal
  # ("UN") in nodetool status before declaring the restore done.
  logging.info('Waiting for Cassandra cluster to be ready')
  db_ip = db_ips[0]
  deadline = time.time() + SCHEMA_CHANGE_TIMEOUT
  while True:
    ready = True
    try:
      output = appscale_utils.ssh(
        db_ip, keyname, '{} status'.format(NODE_TOOL),
        method=subprocess.check_output)
      nodes_ready = len([line for line in output.split('\n')
                         if line.startswith('UN')])
      if nodes_ready < len(db_ips):
        ready = False
    except CalledProcessError:
      ready = False

    if ready:
      break

    if time.time() > deadline:
      # Proceed with a warning rather than failing the restore outright.
      logging.warning('Cassandra cluster still not ready.')
      break
    time.sleep(3)

  logging.info("Done with db restore.")
if __name__ == "__main__":
  logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
  parser = init_parser()
  args = parser.parse_args()

  # Record initial progress so external observers can poll the status file.
  status = {'status': 'inProgress', 'message': 'Starting services'}
  write_to_json_file(status, args.log_postfix)

  db_access = None
  zookeeper = None
  try:
    # Ensure monit is running.
    relevant_ips = set(args.zookeeper) | set(args.database)
    for ip in relevant_ips:
      ssh(ip, args.keyname, 'service monit start')

    start_zookeeper(args.zookeeper, args.keyname)
    conn = KazooClient(hosts=",".join(args.zookeeper))
    conn.start()
    # Seed the Cassandra config node in ZooKeeper if it does not exist yet.
    if not conn.exists(ZK_CASSANDRA_CONFIG):
      conn.create(ZK_CASSANDRA_CONFIG, json.dumps({"num_tokens":256}),
                  makepath=True)

    start_cassandra(args.database, args.db_master, args.keyname,
                    args.zookeeper)
    datastore_upgrade.wait_for_quorum(
      args.keyname, args.db_master, len(args.database), args.replication)
    db_access = DatastoreProxy(hosts=args.database)

    # Exit early if a data layout upgrade is not needed.
    if db_access.valid_data_version_sync():
      status = {'status': 'complete', 'message': 'The data layout is valid'}
    # NOTE(review): this block is truncated at the chunk boundary — the rest
    # of the try body and its except/finally handlers are outside this view.
if __name__ == "__main__":
  logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
  parser = init_parser()
  args = parser.parse_args()

  # Publish initial progress so external observers can poll the status file.
  status = {'status': 'inProgress', 'message': 'Starting services'}
  write_to_json_file(status, args.log_postfix)

  db_access = None
  zookeeper = None
  try:
    # Ensure monit is running.
    relevant_ips = set(args.zookeeper) | set(args.database)
    for ip in relevant_ips:
      ssh(ip, args.keyname, 'service monit start')

    start_zookeeper(args.zookeeper, args.keyname)
    conn = KazooClient(hosts=",".join(args.zookeeper))
    conn.start()
    # Seed the Cassandra config node in ZooKeeper if it does not exist yet.
    if not conn.exists(ZK_CASSANDRA_CONFIG):
      conn.create(ZK_CASSANDRA_CONFIG, json.dumps({"num_tokens": 256}),
                  makepath=True)

    start_cassandra(args.database, args.db_master, args.keyname,
                    args.zookeeper)
    datastore_upgrade.wait_for_quorum(args.keyname, args.db_master,
                                      len(args.database), args.replication)
    db_access = DatastoreProxy(hosts=args.database)

    # Exit early if a data layout upgrade is not needed.
    # NOTE(review): this block is truncated at the chunk boundary — the rest
    # of the try body and its except/finally handlers are outside this view.
def restore_data(path, keyname, force=False):
  """ Restores the Cassandra backup.

  Args:
    path: A string containing the location on each of the DB machines to use
      for restoring data.
    keyname: A string containing the deployment's keyname.
    force: A boolean; when True, skip the interactive prompt for machines
      that are missing a restore file.
  Raises:
    BRException if unable to find any Cassandra machines or if DB machine has
      insufficient space.
  """
  logger.info("Starting new db restore.")
  db_ips = appscale_info.get_db_ips()
  if not db_ips:
    raise BRException('Unable to find any Cassandra machines.')

  # Identify nodes missing the restore archive; a non-zero `ls` exit code
  # means the file is absent on that machine.
  machines_without_restore = []
  for db_ip in db_ips:
    exit_code = appscale_utils.ssh(db_ip, keyname, 'ls {}'.format(path),
                                   method=subprocess.call)
    if exit_code != utils.ExitCodes.SUCCESS:
      machines_without_restore.append(db_ip)

  # Unless forced, confirm before restoring a partial set of machines.
  if machines_without_restore and not force:
    logger.info(
      'The following machines do not have a restore file: {}'.format(
        machines_without_restore))
    response = raw_input('Would you like to continue? [y/N] ')
    if response not in ['Y', 'y']:
      return

  # Stop Cassandra everywhere, retrying until the admin summary line for the
  # Cassandra watch reports UNMONITORED or the retry budget is exhausted.
  for db_ip in db_ips:
    logger.info('Stopping Cassandra on {}'.format(db_ip))
    summary = appscale_utils.ssh(db_ip, keyname, 'appscale-admin summary',
                                 method=subprocess.check_output)
    status_line = next((line for line in summary.split('\n')
                        if line.startswith(CASSANDRA_MONIT_WATCH_NAME)), '')
    retries = SERVICE_RETRIES
    while MonitStates.UNMONITORED not in status_line:
      appscale_utils.ssh(
        db_ip, keyname,
        'appscale-stop-service {}'.format(CASSANDRA_MONIT_WATCH_NAME),
        method=subprocess.call)
      time.sleep(3)
      summary = appscale_utils.ssh(db_ip, keyname, 'appscale-admin summary',
                                   method=subprocess.check_output)
      status_line = next(
        (line for line in summary.split('\n')
         if line.startswith(CASSANDRA_MONIT_WATCH_NAME)), '')
      retries -= 1
      if retries < 0:
        raise BRException('Unable to stop Cassandra')

  cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
  for db_ip in db_ips:
    logger.info('Restoring Cassandra data on {}'.format(db_ip))
    # Remove existing data files before unpacking the archive.
    clear_db = 'find {0} -regex ".*\.\(db\|txt\|log\)$" -exec rm {{}} \;'.\
      format(cassandra_dir)
    appscale_utils.ssh(db_ip, keyname, clear_db)
    # Machines without an archive still get their data cleared and the
    # service restarted; they just have nothing to unpack.
    if db_ip not in machines_without_restore:
      appscale_utils.ssh(db_ip, keyname,
                         'tar xf {} -C {}'.format(path, cassandra_dir))
      appscale_utils.ssh(db_ip, keyname,
                         'chown -R cassandra {}'.format(cassandra_dir))

    # Start Cassandra back up, retrying until the summary reports RUNNING.
    logger.info('Starting Cassandra on {}'.format(db_ip))
    retries = SERVICE_RETRIES
    status_line = MonitStates.UNMONITORED
    while MonitStates.RUNNING not in status_line:
      appscale_utils.ssh(
        db_ip, keyname,
        'appscale-start-service {}'.format(CASSANDRA_MONIT_WATCH_NAME),
        method=subprocess.call)
      time.sleep(3)
      summary = appscale_utils.ssh(db_ip, keyname, 'appscale-admin summary',
                                   method=subprocess.check_output)
      status_line = next(
        (line for line in summary.split('\n')
         if line.startswith(CASSANDRA_MONIT_WATCH_NAME)), '')
      retries -= 1
      if retries < 0:
        raise BRException('Unable to start Cassandra')
    appscale_utils.ssh(
      db_ip, keyname,
      'appscale-start-service {}'.format(CASSANDRA_MONIT_WATCH_NAME))

  # Wait (up to SCHEMA_CHANGE_TIMEOUT) for every node to show as Up/Normal
  # ("UN") in nodetool status before declaring the restore done.
  logger.info('Waiting for Cassandra cluster to be ready')
  db_ip = db_ips[0]
  deadline = time.time() + SCHEMA_CHANGE_TIMEOUT
  while True:
    ready = True
    try:
      output = appscale_utils.ssh(db_ip, keyname,
                                  '{} status'.format(NODE_TOOL),
                                  method=subprocess.check_output)
      nodes_ready = len(
        [line for line in output.split('\n') if line.startswith('UN')])
      if nodes_ready < len(db_ips):
        ready = False
    except CalledProcessError:
      ready = False

    if ready:
      break

    if time.time() > deadline:
      # Proceed with a warning rather than failing the restore outright.
      logger.warning('Cassandra cluster still not ready.')
      break
    time.sleep(3)

  logger.info("Done with db restore.")