Beispiel #1
0
def start_zookeeper(zk_ips, keyname):
  """ Starts ZooKeeper on every given node and blocks until it reports ready.

  Args:
    zk_ips: A list of zookeeper node IPs to start ZooKeeper on.
    keyname: A string containing the deployment's keyname.
  Raises:
    ZKInternalException if the start command fails on a node or if no
      zkServer.sh script can be found.
  """
  logging.info("Starting ZooKeeper...")
  for zk_ip in zk_ips:
    launch_cmd = START_SERVICE_SCRIPT + ZK_WATCH_NAME
    try:
      utils.ssh(zk_ip, keyname, launch_cmd)
    except subprocess.CalledProcessError:
      failure = 'Unable to start ZooKeeper on {}'.format(zk_ip)
      logging.exception(failure)
      raise ZKInternalException(failure)

  logging.info('Waiting for ZooKeeper to be ready')
  # Pick the first candidate path that exists as a file on this machine.
  zk_server_cmd = next(
    (candidate for candidate in ZK_SERVER_CMD_LOCATIONS
     if os.path.isfile(candidate)),
    None)
  if zk_server_cmd is None:
    raise ZKInternalException('Unable to find zkServer.sh')

  status_cmd = '{} status'.format(zk_server_cmd)
  # Poll the first node every 5 seconds until the status command exits with 0.
  # NOTE(review): there is no upper bound on how long this waits.
  while True:
    exit_code = utils.ssh(zk_ips[0], keyname, status_cmd,
                          method=subprocess.call)
    if exit_code == 0:
      break
    time.sleep(5)

  logging.info("Successfully started ZooKeeper.")
def start_cassandra(db_ips, db_master, keyname):
  """ Configures Cassandra on each database node and starts its service.

  Args:
    db_ips: A list of database node IPs to start Cassandra on.
    db_master: The IP address of the DB master.
    keyname: A string containing the deployment's keyname.
  Raises:
    AppScaleDBError if unable to configure or start Cassandra on a node.
  """
  logging.info("Starting Cassandra...")
  for ip in db_ips:
    # Run the setup script on the node, passing its own IP and the master's.
    init_config = '{script} --local-ip {ip} --master-ip {db_master}'.format(
      script=SETUP_CASSANDRA_SCRIPT, ip=ip, db_master=db_master)
    try:
      utils.ssh(ip, keyname, init_config)
    except subprocess.CalledProcessError:
      message = 'Unable to configure Cassandra on {}'.format(ip)
      logging.exception(message)
      raise dbconstants.AppScaleDBError(message)

    start_service_cmd = START_SERVICE_SCRIPT + CASSANDRA_WATCH_NAME
    try:
      utils.ssh(ip, keyname, start_service_cmd)
    except subprocess.CalledProcessError:
      message = 'Unable to start Cassandra on {}'.format(ip)
      logging.exception(message)
      raise dbconstants.AppScaleDBError(message)

  # Report success once, after every node has been handled, rather than once
  # per node from inside the loop.
  logging.info("Successfully started Cassandra.")
def start_cassandra(db_ips, db_master, keyname):
    """ Configures and starts Cassandra on each node, then waits for it.

    Args:
      db_ips: A list of database node IPs to start Cassandra on.
      db_master: The IP address of the DB master.
      keyname: A string containing the deployment's keyname.
    Raises:
      AppScaleDBError if unable to configure or start Cassandra on a node.
    """
    logging.info("Starting Cassandra...")
    for node_ip in db_ips:
        setup_cmd = '{script} --local-ip {ip} --master-ip {db_master}'.format(
            script=SETUP_CASSANDRA_SCRIPT, ip=node_ip, db_master=db_master)
        try:
            utils.ssh(node_ip, keyname, setup_cmd)
        except subprocess.CalledProcessError:
            failure = 'Unable to configure Cassandra on {}'.format(node_ip)
            logging.exception(failure)
            raise dbconstants.AppScaleDBError(failure)

        launch_cmd = START_SERVICE_SCRIPT + CASSANDRA_WATCH_NAME
        try:
            utils.ssh(node_ip, keyname, launch_cmd)
        except subprocess.CalledProcessError:
            failure = 'Unable to start Cassandra on {}'.format(node_ip)
            logging.exception(failure)
            raise dbconstants.AppScaleDBError(failure)

    logging.info('Waiting for Cassandra to be ready')
    status_cmd = '{} status'.format(cassandra_interface.NODE_TOOL)
    # Poll the master every 5 seconds until the status command exits with 0.
    # NOTE(review): there is no upper bound on how long this waits.
    while True:
        exit_code = utils.ssh(db_master, keyname, status_cmd,
                              method=subprocess.call)
        if exit_code == 0:
            break
        time.sleep(5)

    logging.info("Successfully started Cassandra.")
def stop_cassandra(db_ips, keyname):
    """ Asks every database node to stop its Cassandra service.

    A node whose stop command fails is logged and the remaining nodes are
    still processed.

    Args:
      db_ips: A list of database node IPs to stop Cassandra on.
      keyname: A string containing the deployment's keyname.
    """
    logging.info("Stopping Cassandra...")
    for node_ip in db_ips:
        halt_cmd = STOP_SERVICE_SCRIPT + CASSANDRA_WATCH_NAME
        try:
            utils.ssh(node_ip, keyname, halt_cmd)
        except subprocess.CalledProcessError:
            failure = 'Unable to stop Cassandra on {}'.format(node_ip)
            logging.error(failure)
def stop_cassandra(db_ips, keyname):
  """ Runs the Cassandra stop command on each database node.

  Failures are only logged, so one unreachable node does not prevent the
  others from being stopped.

  Args:
    db_ips: A list of database node IPs to stop Cassandra on.
    keyname: A string containing the deployment's keyname.
  """
  logging.info("Stopping Cassandra...")
  for db_ip in db_ips:
    command = STOP_SERVICE_SCRIPT + CASSANDRA_WATCH_NAME
    try:
      utils.ssh(db_ip, keyname, command)
    except subprocess.CalledProcessError:
      # Best effort: note the failure and move on to the next node.
      logging.error('Unable to stop Cassandra on {}'.format(db_ip))
      continue
def stop_zookeeper(zk_ips, keyname):
  """ Runs the ZooKeeper stop command on each ZooKeeper node.

  Failures are only logged, so one unreachable node does not prevent the
  others from being stopped.

  Args:
    zk_ips: A list of zookeeper node IPs to stop ZooKeeper on.
    keyname: A string containing the deployment's keyname.
  """
  logging.info("Stopping ZooKeeper...")
  for zk_ip in zk_ips:
    command = STOP_SERVICE_SCRIPT + ZK_WATCH_NAME
    try:
      utils.ssh(zk_ip, keyname, command)
    except subprocess.CalledProcessError:
      # Best effort: note the failure and move on to the next node.
      logging.error('Unable to stop ZooKeeper on {}'.format(zk_ip))
      continue
def stop_zookeeper(zk_ips, keyname):
    """ Asks every ZooKeeper node to stop its ZooKeeper service.

    A node whose stop command fails is logged and the remaining nodes are
    still processed.

    Args:
      zk_ips: A list of zookeeper node IPs to stop ZooKeeper on.
      keyname: A string containing the deployment's keyname.
    """
    logging.info("Stopping ZooKeeper...")
    for node_ip in zk_ips:
        halt_cmd = STOP_SERVICE_SCRIPT + ZK_WATCH_NAME
        try:
            utils.ssh(node_ip, keyname, halt_cmd)
        except subprocess.CalledProcessError:
            failure = 'Unable to stop ZooKeeper on {}'.format(node_ip)
            logging.error(failure)
Beispiel #8
0
def backup_data(path, keyname):
  """ Backup Zookeeper data to path.

  ZooKeeper is stopped on the first ZooKeeper machine, its data directory is
  archived and copied to `path`, and afterwards the temporary archive is
  removed and ZooKeeper is started again. The process exits with status 1 if
  the controller service is not running.

  Args:
    path: A str, the name of the backup file to be created.
    keyname: A string containing the deployment's keyname.
  Raises:
    BRException if unable to find any ZooKeeper machines.
  """
  logging.info("Starting new zk backup.")

  running = subprocess.call(['service', CONTROLLER_SERVICE, 'status']) == 0
  if not running:
    logging.error('Please start AppScale before backing up ZooKeeper.')
    sys.exit(1)

  # Stop ZooKeeper and backup data on only one ZooKeeper machine.
  # This is to avoid downtime on deployments with multiple ZooKeeper machines.
  zk_ips = appscale_info.get_zk_node_ips()
  if not zk_ips:
    raise BRException('Unable to find any ZooKeeper machines.')
  zk_ip = zk_ips[0]

  timestamp = int(time.time())
  backup_file = '{}/zk_backup_{}.tar.gz'.format(BACKUP_DIR_LOCATION, timestamp)
  try:
    utils.ssh(zk_ip, keyname, 'monit stop -g zookeeper')
    utils.ssh(zk_ip, keyname,
      'tar czf {} -C {} .'.format(backup_file, ZK_DATA_DIR))
    utils.scp_from(zk_ip, keyname, backup_file, path)
  finally:
    # Nest the cleanup so that a failure to remove the temporary archive can
    # never prevent ZooKeeper from being started again.
    try:
      utils.ssh(zk_ip, keyname, 'rm -f {}'.format(backup_file))
    finally:
      utils.ssh(zk_ip, keyname, 'monit start -g zookeeper')
Beispiel #9
0
def backup_data(path, keyname):
    """ Backup Zookeeper data to path.

    ZooKeeper is stopped on the first ZooKeeper machine, its data directory
    is archived and copied to `path`, and afterwards the temporary archive is
    removed and ZooKeeper is started again. The process exits with status 1
    if the controller service is not running.

    Args:
      path: A str, the name of the backup file to be created.
      keyname: A string containing the deployment's keyname.
    Raises:
      BRException if unable to find any ZooKeeper machines.
    """
    logging.info("Starting new zk backup.")

    running = subprocess.call(['service', CONTROLLER_SERVICE, 'status']) == 0
    if not running:
        logging.error('Please start AppScale before backing up ZooKeeper.')
        sys.exit(1)

    # Stop ZooKeeper and backup data on only one ZooKeeper machine.
    # This is to avoid downtime on deployments with multiple ZooKeeper machines.
    zk_ips = appscale_info.get_zk_node_ips()
    if not zk_ips:
        raise BRException('Unable to find any ZooKeeper machines.')
    zk_ip = zk_ips[0]

    timestamp = int(time.time())
    backup_file = '{}/zk_backup_{}.tar.gz'.format(BACKUP_DIR_LOCATION,
                                                  timestamp)
    try:
        utils.ssh(zk_ip, keyname, 'monit stop -g zookeeper')
        utils.ssh(zk_ip, keyname,
                  'tar czf {} -C {} .'.format(backup_file, ZK_DATA_DIR))
        utils.scp_from(zk_ip, keyname, backup_file, path)
    finally:
        # Nest the cleanup so that a failure to remove the temporary archive
        # can never prevent ZooKeeper from being started again.
        try:
            utils.ssh(zk_ip, keyname, 'rm -f {}'.format(backup_file))
        finally:
            utils.ssh(zk_ip, keyname, 'monit start -g zookeeper')
Beispiel #10
0
def estimate_total_entities(session, db_master, keyname):
  """ Estimate the total number of entities.

  The row count of the entity table is divided by the number of entries in
  APP_ENTITY_SCHEMA. If that query fails with a transient Cassandra error,
  the "Number of keys (estimate)" line of `nodetool cfstats` run on the
  primary DB node is used instead.

  Args:
    session: A cassandra-driver session.
    db_master: A string containing the IP address of the primary DB node.
    keyname: A string containing the deployment keyname.
  Returns:
    A string containing an entity count.
  Raises:
    AppScaleDBError if unable to get a count.
  """
  query = SimpleStatement(
    'SELECT COUNT(*) FROM "{}"'.format(dbconstants.APP_ENTITY_TABLE),
    consistency_level=ConsistencyLevel.ONE
  )
  try:
    rows = session.execute(query)[0].count
    # Floor division keeps the count integral regardless of interpreter.
    return str(rows // len(dbconstants.APP_ENTITY_SCHEMA))
  except dbconstants.TRANSIENT_CASSANDRA_ERRORS:
    stats_cmd = '{nodetool} cfstats {keyspace}.{table}'.format(
      nodetool=cassandra_interface.NODE_TOOL,
      keyspace=cassandra_interface.KEYSPACE,
      table=dbconstants.APP_ENTITY_TABLE)
    try:
      stats = utils.ssh(db_master, keyname, stats_cmd,
                        method=subprocess.check_output)
    except subprocess.CalledProcessError:
      # Honour the documented contract: a failed fallback is reported as
      # AppScaleDBError instead of leaking the subprocess error.
      raise dbconstants.AppScaleDBError('Unable to estimate total entities.')
    for line in stats.splitlines():
      if 'Number of keys (estimate)' in line:
        return '{} (estimate)'.format(line.split()[-1])
  raise dbconstants.AppScaleDBError('Unable to estimate total entities.')
Beispiel #11
0
def start_zookeeper(zk_ips, keyname):
  """ Runs the ZooKeeper start command on every given node.

  Args:
    zk_ips: A list of zookeeper node IPs to start ZooKeeper on.
    keyname: A string containing the deployment's keyname.
  Raises:
    ZKInternalException if the start command fails on a node.
  """
  logging.info("Starting ZooKeeper...")
  for ip in zk_ips:
    start_service_cmd = START_SERVICE_SCRIPT + ZK_WATCH_NAME
    try:
      utils.ssh(ip, keyname, start_service_cmd)
    except subprocess.CalledProcessError:
      message = 'Unable to start ZooKeeper on {}'.format(ip)
      logging.exception(message)
      raise ZKInternalException(message)

  # Report success once, after every node has been handled, rather than once
  # per node from inside the loop.
  logging.info("Successfully started ZooKeeper.")
Beispiel #12
0
def backup_data(path, keyname):
    """ Backup Cassandra snapshot data directories/files.

    Every DB machine first takes a fresh snapshot and is checked for free
    space; only after all machines pass is the tar archive created on each.

    Args:
      path: A string containing the location to store the backup on each of
        the DB machines.
      keyname: A string containing the deployment's keyname.
    Raises:
      BRException if unable to find any Cassandra machines or if DB machine
        has insufficient space.
    """
    logging.info("Starting new db backup.")

    db_ips = appscale_info.get_db_ips()
    if not db_ips:
        raise BRException('Unable to find any Cassandra machines.')

    snapshot_size_cmd = \
        r'find {0} -name "snapshots" -exec du -s {{}} \;'.format(
            APPSCALE_DATA_DIR)

    for node_ip in db_ips:
        utils.ssh(node_ip, keyname, '{} clearsnapshot'.format(NODE_TOOL))
        utils.ssh(node_ip, keyname, '{} snapshot'.format(NODE_TOOL))

        # Add up the first column of every non-empty `du -s` output line.
        du_output = utils.ssh(node_ip, keyname, snapshot_size_cmd,
                              method=subprocess.check_output)
        backup_size = 0
        for du_line in du_output.split('\n'):
            if du_line:
                backup_size += int(du_line.split()[0])

        # Everything in path up to and including its last '/'.
        path_parts = path.split('/')
        output_dir = '/'.join(path_parts[:-1]) + '/'
        df_output = utils.ssh(node_ip, keyname, 'df {}'.format(output_dir),
                              method=subprocess.check_output)
        # The fourth column of the second df line is taken as free space.
        df_columns = df_output.split('\n')[1].split()
        available = int(df_columns[3])

        if backup_size > available * PADDING_PERCENTAGE:
            raise BRException('{} has insufficient space: {}/{}'.format(
                node_ip, available * PADDING_PERCENTAGE, backup_size))

    cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
    # The archive command is the same for every machine.
    create_tar = (
        r'find . -regex ".*/snapshots/[0-9]*/.*" -exec tar '
        r'--transform="s/snapshots\/[0-9]*\///" -cf {0} {{}} +'.format(path))
    archive_cmd = 'cd {} && {}'.format(cassandra_dir, create_tar)
    for node_ip in db_ips:
        utils.ssh(node_ip, keyname, archive_cmd)

    logging.info("Done with db backup.")
Beispiel #13
0
def backup_data(path, keyname):
  """ Backup Cassandra snapshot data directories/files.

  A new snapshot is taken on each DB machine and its size is compared with
  the free space reported for the backup directory. Once all machines have
  passed that check, each one packs its snapshot files into a tar file.

  Args:
    path: A string containing the location to store the backup on each of the
      DB machines.
    keyname: A string containing the deployment's keyname.
  Raises:
    BRException if unable to find any Cassandra machines or if DB machine has
      insufficient space.
  """
  logging.info("Starting new db backup.")

  db_ips = appscale_info.get_db_ips()
  if not db_ips:
    raise BRException('Unable to find any Cassandra machines.')

  for machine in db_ips:
    for nodetool_action in ('clearsnapshot', 'snapshot'):
      utils.ssh(machine, keyname, '{} {}'.format(NODE_TOOL, nodetool_action))

    size_query = r'find {0} -name "snapshots" -exec du -s {{}} \;'.format(
      APPSCALE_DATA_DIR)
    du_output = utils.ssh(machine, keyname, size_query,
                          method=subprocess.check_output)
    # First column of each non-empty `du -s` line, summed.
    sizes = [int(entry.split()[0]) for entry in du_output.split('\n') if entry]
    backup_size = sum(sizes)

    # Directory portion of path: all components before the last '/'.
    output_dir = '/'.join(path.split('/')[:-1]) + '/'
    df_output = utils.ssh(machine, keyname, 'df {}'.format(output_dir),
                          method=subprocess.check_output)
    # Second line, fourth column of the df output is taken as free space.
    second_line = df_output.split('\n')[1]
    available = int(second_line.split()[3])

    if backup_size > available * PADDING_PERCENTAGE:
      shortage = '{} has insufficient space: {}/{}'.format(
        machine, available * PADDING_PERCENTAGE, backup_size)
      raise BRException(shortage)

  cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
  for machine in db_ips:
    tar_cmd = (r'find . -regex ".*/snapshots/[0-9]*/.*" -exec tar '
               r'--transform="s/snapshots\/[0-9]*\///" -cf {0} {{}} +')
    create_tar = tar_cmd.format(path)
    utils.ssh(machine, keyname, 'cd {} && {}'.format(cassandra_dir, create_tar))

  logging.info("Done with db backup.")
Beispiel #14
0
def equalize(node1, node2):
    """ Move data from the node with a larger load to the other one.

    Half of the load difference is shifted by asking the more loaded node
    for a new token, moving node1 to that token, repairing the affected
    token range and finally running a cleanup on the more loaded node.

    Args:
      node1: A dictionary representing a node.
      node2: A dictionary representing a neighbor of node1.
    """
    # The keyname is the part before the first '.' of the first *.key file.
    key_files = [entry for entry in os.listdir(KEY_DIRECTORY)
                 if entry.endswith('.key')]
    keyname = key_files[0].partition('.')[0]

    to_move = abs(node1['load'] - node2['load']) / 2
    mb_to_move = round(to_move / (1024 ** 2), 2)

    node1_is_heavier = node1['load'] > node2['load']
    if node1_is_heavier:
        donor, receiver = node1, node2
    else:
        donor, receiver = node2, node1

    logging.info('Moving {} MiB from {} to {}'.format(
        mb_to_move, donor['ip'], receiver['ip']))
    # NOTE(review): if loads are ints under Python 2, these divisions
    # truncate - confirm the type of 'load'.
    fraction = int((to_move / donor['load']) * 100)
    percentile = 100 - fraction if node1_is_heavier else fraction
    token_cmd = 'appscale-get-token {}'.format(percentile)
    new_token = ssh(donor['ip'], keyname, token_cmd,
                    method=check_output).strip()

    if node1_is_heavier:
        repair = [new_token, node1['token']]
    else:
        repair = [node1['token'], new_token]
    cleanup_ip = donor['ip']

    # node1 is always the node that gets moved to the new token.
    logging.info('Moving {} to {}'.format(node1['ip'],
                                          new_token[:60] + '...'))
    ssh(node1['ip'], keyname, '{} move {}'.format(NODE_TOOL, new_token))

    repair_start, repair_end = repair
    logging.info('Repairing {} to {}'.format(repair_start[:60] + '...',
                                             repair_end[:60] + '...'))
    check_output([NODE_TOOL, 'repair', '-st', repair_start,
                  '-et', repair_end])

    logging.info('Cleaning up {}'.format(cleanup_ip))
    ssh(cleanup_ip, keyname, '{} cleanup'.format(NODE_TOOL))
Beispiel #15
0
def equalize(node1, node2):
  """ Move data from the node with a larger load to the other one.

  The amount moved is half of the difference between the two loads. A new
  token is requested from the more loaded node, node1 is moved to it, the
  range between the new token and node1's token is repaired, and the more
  loaded node is cleaned up.

  Args:
    node1: A dictionary representing a node.
    node2: A dictionary representing a neighbor of node1.
  """
  key_files = []
  for filename in os.listdir(KEY_DIRECTORY):
    if filename.endswith('.key'):
      key_files.append(filename)
  # Everything before the first '.' of the first key file is the keyname.
  keyname = key_files[0].split('.')[0]

  load_gap = abs(node1['load'] - node2['load'])
  to_move = load_gap / 2
  mb_to_move = round(to_move / 1024 ** 2, 2)
  move_summary = 'Moving {} MiB from {} to {}'
  get_token = 'appscale-get-token {}'

  if node1['load'] > node2['load']:
    logging.info(move_summary.format(mb_to_move, node1['ip'], node2['ip']))
    # NOTE(review): truncates if loads are ints on Python 2 - confirm type.
    percentile = 100 - int((to_move / node1['load']) * 100)
    raw_token = ssh(node1['ip'], keyname, get_token.format(percentile),
                    method=check_output)
    new_token = raw_token.strip()
    range_start, range_end = new_token, node1['token']
    cleanup_ip = node1['ip']
  else:
    logging.info(move_summary.format(mb_to_move, node2['ip'], node1['ip']))
    percentile = int((to_move / node2['load']) * 100)
    raw_token = ssh(node2['ip'], keyname, get_token.format(percentile),
                    method=check_output)
    new_token = raw_token.strip()
    range_start, range_end = node1['token'], new_token
    cleanup_ip = node2['ip']

  # Regardless of direction, node1 is the node moved to the new token.
  shortened_token = new_token[:60] + '...'
  logging.info('Moving {} to {}'.format(node1['ip'], shortened_token))
  ssh(node1['ip'], keyname, '{} move {}'.format(NODE_TOOL, new_token))

  # Tokens are shortened to 60 characters for the log message only.
  logging.info('Repairing {} to {}'.format(
    range_start[:60] + '...', range_end[:60] + '...'))
  check_output([NODE_TOOL, 'repair', '-st', range_start, '-et', range_end])

  logging.info('Cleaning up {}'.format(cleanup_ip))
  ssh(cleanup_ip, keyname, '{} cleanup'.format(NODE_TOOL))
Beispiel #16
0

if __name__ == "__main__":
    logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
    parser = init_parser()
    args = parser.parse_args()
    status = {'status': 'inProgress', 'message': 'Starting services'}
    write_to_json_file(status, args.log_postfix)

    db_access = None
    zookeeper = None
    try:
        # Ensure monit is running.
        relevant_ips = set(args.zookeeper) | set(args.database)
        for ip in relevant_ips:
            utils.ssh(ip, args.keyname, 'service monit start')

        start_zookeeper(args.zookeeper, args.keyname)
        conn = KazooClient(hosts=",".join(args.zookeeper))
        conn.start()
        if not conn.exists(ZK_CASSANDRA_CONFIG):
            conn.create(ZK_CASSANDRA_CONFIG,
                        json.dumps({"num_tokens": 256}),
                        makepath=True)
        start_cassandra(args.database, args.db_master, args.keyname,
                        args.zookeeper)
        datastore_upgrade.wait_for_quorum(args.keyname, args.db_master,
                                          len(args.database), args.replication)
        db_access = DatastoreProxy(hosts=args.database)

        # Exit early if a data layout upgrade is not needed.
Beispiel #17
0
def restore_data(path, keyname):
    """ Restores the Zookeeper snapshot.

    The backup file is copied to every ZooKeeper machine, the nodes under
    ZK_KEEP_PATHS are saved into an in-memory buffer, each data directory is
    cleared and refilled from the backup file, and the saved nodes are then
    written back before ZooKeeper is stopped again. The process exits with
    status 1 if the controller service is still running.

    Args:
      path: A str, the name of the backup file to restore from.
      keyname: A string containing the deployment's keyname.
    Returns:
      True once the restore has completed.
    Raises:
      BRException if unable to find any ZooKeeper machines.
      subprocess.CalledProcessError (re-raised) if a remote command fails
        while copying, clearing or restoring data.
    """
    logging.info("Starting new zk restore.")

    running = subprocess.call(['service', CONTROLLER_SERVICE, 'status']) == 0
    if running:
        logging.error('Please stop AppScale before restoring ZooKeeper.')
        sys.exit(1)

    zk_ips = appscale_info.get_zk_node_ips()
    if not zk_ips:
        raise BRException('Unable to find any ZooKeeper machines.')

    timestamp = int(time.time())
    restore_file = '{}/zk_restore_{}.tar.gz'.format(BACKUP_DIR_LOCATION,
                                                    timestamp)

    # Both client sessions below use this one connection string, so the port
    # can never differ between saving and restoring deployment data.
    zk_hosts = ','.join('{}:{}'.format(zk_ip, zktransaction.DEFAULT_PORT)
                        for zk_ip in zk_ips)

    # Cache name of ZooKeeper service for each machine.
    zk_service_names = {}
    for zk_ip in zk_ips:
        zk_service_names[zk_ip] = utils.zk_service_name(zk_ip, keyname)

    def discard_restore(zk_ip):
        """ Removes the copied backup file from a node and stops ZooKeeper. """
        utils.ssh(zk_ip, keyname, 'rm -f {}'.format(restore_file))
        utils.ssh(zk_ip, keyname,
                  'service {} stop'.format(zk_service_names[zk_ip]))

    # Copy restore file to and start ZooKeeper on relevant machines.
    logging.info('Copying data to ZooKeeper machines.')
    for zk_ip in zk_ips:
        zk_service = zk_service_names[zk_ip]
        try:
            utils.scp_to(zk_ip, keyname, path, restore_file)
            utils.ssh(zk_ip, keyname, 'service {} restart'.format(zk_service))
        except subprocess.CalledProcessError:
            logging.exception('Failed to prepare restore on {}'.format(zk_ip))
            discard_restore(zk_ip)
            raise

    # Save deployment-specific data.
    deployment_data = StringIO()
    zk = kazoo.client.KazooClient(hosts=zk_hosts)
    zk.start()
    for zk_node in ZK_KEEP_PATHS:
        recursive_dump(zk, zk_node, deployment_data)
    zk.stop()

    # Stop ZooKeeper and clear existing data directory.
    logging.info('Clearing existing data on ZooKeeper machines.')
    for zk_ip in zk_ips:
        zk_service = zk_service_names[zk_ip]
        try:
            utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
            utils.ssh(zk_ip, keyname, 'rm -rf {}/*'.format(ZK_DATA_DIR))
        except subprocess.CalledProcessError:
            logging.exception('Unable to clear data on {}'.format(zk_ip))
            deployment_data.close()
            discard_restore(zk_ip)
            raise

    # Restore data and restart ZooKeeper on relevant machines.
    logging.info('Restoring data on ZooKeeper machines.')
    for zk_ip in zk_ips:
        zk_service = zk_service_names[zk_ip]
        try:
            utils.ssh(zk_ip, keyname,
                      'tar xzf {} -C {}'.format(restore_file, ZK_DATA_DIR))
            utils.ssh(zk_ip, keyname, 'service {} start'.format(zk_service))
        except subprocess.CalledProcessError:
            logging.exception('Unable to restore on {}'.format(zk_ip))
            deployment_data.close()
            discard_restore(zk_ip)
            raise

    # Restore deployment-specific data.
    logging.info('Restoring deployment-specific data.')
    zk = kazoo.client.KazooClient(hosts=zk_hosts)
    zk.start()
    for zk_node in ZK_KEEP_PATHS:
        recursive_flush(zk, zk_node)
    # Rewind the buffer so restore_zk reads it from the beginning.
    deployment_data.seek(0)
    restore_zk(zk, deployment_data)
    zk.stop()

    # Stop ZooKeeper on relevant machines.
    logging.info('Stopping ZooKeeper.')
    try:
        for zk_ip in zk_ips:
            zk_service = zk_service_names[zk_ip]
            utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
            utils.ssh(zk_ip, keyname, 'rm -rf {}'.format(restore_file))
    finally:
        # The buffer is released whether or not the final stop succeeded.
        deployment_data.close()

    logging.info("Done with zk restore.")
    return True
Beispiel #18
0
def restore_data(path, keyname):
  """ Restores the Zookeeper snapshot.

  The backup file is copied to every ZooKeeper machine, the nodes under
  ZK_KEEP_PATHS are saved into an in-memory buffer, each data directory is
  cleared and refilled from the backup file, and the saved nodes are then
  written back before ZooKeeper is stopped again. The process exits with
  status 1 if the controller service is still running.

  Args:
    path: A str, the name of the backup file to restore from.
    keyname: A string containing the deployment's keyname.
  Returns:
    True once the restore has completed.
  Raises:
    BRException if unable to find any ZooKeeper machines.
    subprocess.CalledProcessError (re-raised) if a remote command fails while
      copying, clearing or restoring data.
  """
  logging.info("Starting new zk restore.")

  running = subprocess.call(['service', CONTROLLER_SERVICE, 'status']) == 0
  if running:
    logging.error('Please stop AppScale before restoring ZooKeeper.')
    sys.exit(1)

  zk_ips = appscale_info.get_zk_node_ips()
  if not zk_ips:
    raise BRException('Unable to find any ZooKeeper machines.')

  timestamp = int(time.time())
  restore_file = '{}/zk_restore_{}.tar.gz'.\
    format(BACKUP_DIR_LOCATION, timestamp)

  # Cache name of ZooKeeper service for each machine.
  zk_service_names = {}
  for zk_ip in zk_ips:
    zk_service_names[zk_ip] = utils.zk_service_name(zk_ip, keyname)

  # Copy restore file to and start ZooKeeper on relevant machines.
  logging.info('Copying data to ZooKeeper machines.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.scp_to(zk_ip, keyname, path, restore_file)
      utils.ssh(zk_ip, keyname, 'service {} restart'.format(zk_service))
    except subprocess.CalledProcessError:
      logging.exception('Failed to prepare restore on {}'.format(zk_ip))
      utils.ssh(zk_ip, keyname, 'rm -f {}'.format(restore_file))
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      raise

  # Save deployment-specific data.
  deployment_data = StringIO()
  # The same connection string is reused for the second client session below,
  # so both sessions always talk to the same port.
  hosts_template = ':{port},'.join(zk_ips) + ':{port}'
  zk_hosts = hosts_template.format(port=zktransaction.DEFAULT_PORT)
  zk = kazoo.client.KazooClient(hosts=zk_hosts)
  zk.start()
  for zk_node in ZK_KEEP_PATHS:
    recursive_dump(zk, zk_node, deployment_data)
  zk.stop()

  # Stop ZooKeeper and clear existing data directory.
  logging.info('Clearing existing data on ZooKeeper machines.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      utils.ssh(zk_ip, keyname, 'rm -rf {}/*'.format(ZK_DATA_DIR))
    except subprocess.CalledProcessError:
      logging.exception('Unable to clear data on {}'.format(zk_ip))
      deployment_data.close()
      utils.ssh(zk_ip, keyname, 'rm -f {}'.format(restore_file))
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      raise

  # Restore data and restart ZooKeeper on relevant machines.
  logging.info('Restoring data on ZooKeeper machines.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.ssh(zk_ip, keyname,
        'tar xzf {} -C {}'.format(restore_file, ZK_DATA_DIR))
      utils.ssh(zk_ip, keyname, 'service {} start'.format(zk_service))
    except subprocess.CalledProcessError:
      logging.exception('Unable to restore on {}'.format(zk_ip))
      deployment_data.close()
      utils.ssh(zk_ip, keyname, 'rm -f {}'.format(restore_file))
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      raise

  # Restore deployment-specific data.
  logging.info('Restoring deployment-specific data.')
  zk = kazoo.client.KazooClient(hosts=zk_hosts)
  zk.start()
  for zk_node in ZK_KEEP_PATHS:
    recursive_flush(zk, zk_node)
  # Rewind the buffer so restore_zk reads it from the beginning.
  deployment_data.seek(0)
  restore_zk(zk, deployment_data)
  zk.stop()

  # Stop ZooKeeper on relevant machines.
  logging.info('Stopping ZooKeeper.')
  for zk_ip in zk_ips:
    zk_service = zk_service_names[zk_ip]
    try:
      utils.ssh(zk_ip, keyname, 'service {} stop'.format(zk_service))
      utils.ssh(zk_ip, keyname, 'rm -rf {}'.format(restore_file))
    finally:
      # Closing is repeated per machine; closing a closed buffer is harmless.
      deployment_data.close()

  logging.info("Done with zk restore.")
  return True
Beispiel #19
0
def restore_data(path, keyname, force=False):
  """ Restores the Cassandra backup.

  Cassandra is stopped on every DB machine first. Each machine then has its
  existing data files removed, the restore file unpacked (when it has one)
  and Cassandra started again.

  Args:
    path: A string containing the location on each of the DB machines to use
      for restoring data.
    keyname: A string containing the deployment's keyname.
    force: A boolean; when True, the confirmation prompt for machines that
      lack a restore file is skipped.
  Raises:
    BRException if unable to find any Cassandra machines or if Cassandra
      cannot be stopped within the allowed number of retries.
  """
  logging.info("Starting new db restore.")

  db_ips = appscale_info.get_db_ips()
  if not db_ips:
    raise BRException('Unable to find any Cassandra machines.')

  # A machine lacks the restore file when `ls` on the path does not succeed.
  machines_without_restore = [
    node_ip for node_ip in db_ips
    if utils.ssh(node_ip, keyname, 'ls {}'.format(path),
                 method=subprocess.call) != ExitCodes.SUCCESS]

  if machines_without_restore and not force:
    logging.info('The following machines do not have a restore file: {}'.
      format(machines_without_restore))
    answer = raw_input('Would you like to continue? [y/N] ')
    if answer not in ('Y', 'y'):
      return

  def monit_state(node_ip):
    """ Reads the monit summary of a node and extracts Cassandra's state. """
    summary = utils.ssh(node_ip, keyname, 'monit summary',
                        method=subprocess.check_output)
    return utils.monit_status(summary, CASSANDRA_MONIT_WATCH_NAME)

  for node_ip in db_ips:
    logging.info('Stopping Cassandra on {}'.format(node_ip))
    state = monit_state(node_ip)
    attempts_left = SERVICE_STOP_RETRIES
    # Keep issuing the stop command until monit reports it as unmonitored.
    while state != MonitStates.UNMONITORED:
      utils.ssh(node_ip, keyname,
                'monit stop {}'.format(CASSANDRA_MONIT_WATCH_NAME))
      time.sleep(1)
      state = monit_state(node_ip)
      attempts_left -= 1
      if attempts_left < 0:
        raise BRException('Unable to stop Cassandra')

  cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
  # Removes every .db, .txt and .log file below the Cassandra directory.
  clear_db = r'find {0} -regex ".*\.\(db\|txt\|log\)$" -exec rm {{}} \;'.\
    format(cassandra_dir)
  for node_ip in db_ips:
    logging.info('Restoring Cassandra data on {}'.format(node_ip))
    utils.ssh(node_ip, keyname, clear_db)

    has_restore_file = node_ip not in machines_without_restore
    if has_restore_file:
      unpack_cmd = 'tar xf {} -C {}'.format(path, cassandra_dir)
      utils.ssh(node_ip, keyname, unpack_cmd)

    start_cmd = 'monit start {}'.format(CASSANDRA_MONIT_WATCH_NAME)
    utils.ssh(node_ip, keyname, start_cmd)

  logging.info("Done with db restore.")
Beispiel #20
0

if __name__ == "__main__":
  logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
  parser = init_parser()
  args = parser.parse_args()
  status = {'status': 'inProgress', 'message': 'Starting services'}
  write_to_json_file(status, args.log_postfix)

  db_access = None
  zookeeper = None
  try:
    # Ensure monit is running.
    relevant_ips = set(args.zookeeper) | set(args.database)
    for ip in relevant_ips:
      utils.ssh(ip, args.keyname, 'service monit start')

    start_cassandra(args.database, args.db_master, args.keyname)
    start_zookeeper(args.zookeeper, args.keyname)
    datastore_upgrade.wait_for_quorum(
      args.keyname, len(args.database), args.replication)
    db_access = datastore_upgrade.get_datastore()

    # Exit early if a data layout upgrade is not needed.
    if db_access.valid_data_version():
      status = {'status': 'complete', 'message': 'The data layout is valid'}
      sys.exit()

    zookeeper = datastore_upgrade.get_zookeeper(args.zookeeper)
    run_datastore_upgrade(db_access, zookeeper, args.keyname, args.log_postfix)
    status = {'status': 'complete', 'message': 'Data layout upgrade complete'}
Beispiel #21
0

if __name__ == "__main__":
    logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
    parser = init_parser()
    args = parser.parse_args()
    status = {"status": "inProgress", "message": "Starting services"}
    write_to_json_file(status, args.log_postfix)

    db_access = None
    zookeeper = None
    try:
        # Ensure monit is running.
        relevant_ips = set(args.zookeeper) | set(args.database)
        for ip in relevant_ips:
            utils.ssh(ip, args.keyname, "service monit start")

        start_cassandra(args.database, args.db_master, args.keyname)
        start_zookeeper(args.zookeeper, args.keyname)
        datastore_upgrade.wait_for_quorum(args.keyname, len(args.database), args.replication)
        db_access = datastore_upgrade.get_datastore()

        # Exit early if a data layout upgrade is not needed.
        if db_access.valid_data_version():
            status = {"status": "complete", "message": "The data layout is valid"}
            sys.exit()

        zookeeper = datastore_upgrade.get_zookeeper(args.zookeeper)
        try:
            total_entities = datastore_upgrade.estimate_total_entities(db_access.session, args.db_master, args.keyname)
        except AppScaleDBError:
Beispiel #22
0
def restore_data(path, keyname, force=False):
    """ Restores the Cassandra backup.

    After Cassandra has been stopped on all DB machines, each machine gets
    its existing data files deleted, the restore file extracted if it is
    present there, and Cassandra started again.

    Args:
      path: A string containing the location on each of the DB machines to
        use for restoring data.
      keyname: A string containing the deployment's keyname.
      force: A boolean; when True, no confirmation is requested if some
        machines do not have a restore file.
    Raises:
      BRException if unable to find any Cassandra machines or if Cassandra
        cannot be stopped within the allowed number of retries.
    """
    logging.info("Starting new db restore.")

    db_ips = appscale_info.get_db_ips()
    if not db_ips:
        raise BRException('Unable to find any Cassandra machines.')

    # Collect the machines on which listing the restore path fails.
    machines_without_restore = []
    list_cmd = 'ls {}'.format(path)
    for machine in db_ips:
        listing_status = utils.ssh(machine, keyname, list_cmd,
                                   method=subprocess.call)
        if listing_status != ExitCodes.SUCCESS:
            machines_without_restore.append(machine)

    if not force and machines_without_restore:
        notice = 'The following machines do not have a restore file: {}'
        logging.info(notice.format(machines_without_restore))
        response = raw_input('Would you like to continue? [y/N] ')
        confirmed = response in ['Y', 'y']
        if not confirmed:
            return

    stop_cmd = 'monit stop {}'.format(CASSANDRA_MONIT_WATCH_NAME)
    for machine in db_ips:
        logging.info('Stopping Cassandra on {}'.format(machine))
        monit_output = utils.ssh(machine, keyname, 'monit summary',
                                 method=subprocess.check_output)
        status = utils.monit_status(monit_output, CASSANDRA_MONIT_WATCH_NAME)
        remaining = SERVICE_STOP_RETRIES
        # Repeat the stop request until monit reports the unmonitored state.
        while status != MonitStates.UNMONITORED:
            utils.ssh(machine, keyname, stop_cmd)
            time.sleep(1)
            monit_output = utils.ssh(machine, keyname, 'monit summary',
                                     method=subprocess.check_output)
            status = utils.monit_status(monit_output,
                                        CASSANDRA_MONIT_WATCH_NAME)
            remaining -= 1
            if remaining < 0:
                raise BRException('Unable to stop Cassandra')

    cassandra_dir = '{}/cassandra'.format(APPSCALE_DATA_DIR)
    start_cmd = 'monit start {}'.format(CASSANDRA_MONIT_WATCH_NAME)
    for machine in db_ips:
        logging.info('Restoring Cassandra data on {}'.format(machine))
        # Delete all .db, .txt and .log files below the Cassandra directory.
        wipe_template = \
            r'find {0} -regex ".*\.\(db\|txt\|log\)$" -exec rm {{}} \;'
        utils.ssh(machine, keyname, wipe_template.format(cassandra_dir))

        if machine not in machines_without_restore:
            extract_cmd = 'tar xf {} -C {}'.format(path, cassandra_dir)
            utils.ssh(machine, keyname, extract_cmd)

        utils.ssh(machine, keyname, start_cmd)

    logging.info("Done with db restore.")