def soak_upgrade_downgrade(
        package_name,
        service_name,
        running_task_count,
        additional_options=None,
        timeout_seconds=25*60,
        wait_for_deployment=True):
    """Upgrade a running service to the stub-universe build, then downgrade it again.

    The package CLI is installed first, then the service is moved to the
    'stub-universe' version and afterwards to the version reported by
    `_get_pkg_version(package_name)`.

    :param package_name: Name of the package to install/upgrade.
    :param service_name: Name of the running service instance.
    :param running_task_count: Task count forwarded to `_upgrade_or_downgrade`.
    :param additional_options: Optional dict of package options; defaults to an empty dict.
    :param timeout_seconds: Timeout forwarded to `_upgrade_or_downgrade` (default 25 minutes).
    :param wait_for_deployment: Whether each step should wait for deployment to finish.
    """
    # Build a fresh dict per call: a `{}` default would be one shared object,
    # so a mutation by any callee would leak into every later call.
    if additional_options is None:
        additional_options = {}

    sdk_cmd.run_cli("package install --cli {} --yes".format(package_name))

    version = 'stub-universe'
    log.info('Upgrading to test version: {} {}'.format(package_name, version))
    _upgrade_or_downgrade(
        package_name,
        version,
        service_name,
        running_task_count,
        additional_options,
        timeout_seconds,
        wait_for_deployment)

    # Default Universe is at --index=0
    version = _get_pkg_version(package_name)
    log.info('Downgrading to Universe version: {} {}'.format(package_name, version))
    _upgrade_or_downgrade(
        package_name,
        version,
        service_name,
        running_task_count,
        additional_options,
        timeout_seconds,
        wait_for_deployment)
def add_universe_repos():
    """Register the stub universe repositories named in STUB_UNIVERSE_URL.

    STUB_UNIVERSE_URL is read as a comma-separated list of URLs. Each URL is
    given a random 'testpkg-XXXXXXXX' repository name, any already-registered
    repository pointing at the same URL is removed, and the URLs are then
    added at index 0.

    :return (dict): Mapping of generated repository name to stub URL; empty
        when no stub URLs are configured.
    """
    # NOTE(review): prints the caller's stack on every call; looks like a
    # debugging aid for tracking who triggers repo changes -- confirm before removing.
    traceback.print_stack()
    log.info('Adding universe repos')

    # prepare needed universe repositories
    # ''.split(',') yields [''], so blank entries must be dropped explicitly;
    # otherwise an unset variable would register a repository with an empty URL.
    raw_urls = os.environ.get('STUB_UNIVERSE_URL', '').split(',')
    stub_universe_urls = [url.strip() for url in raw_urls if url.strip()]

    stub_urls = {}
    if not stub_universe_urls:
        return stub_urls

    log.info('Adding stub URLs: {}'.format(stub_universe_urls))
    name_alphabet = string.ascii_lowercase + string.digits
    for url in stub_universe_urls:
        package_name = 'testpkg-' + ''.join(
            random.choice(name_alphabet) for _ in range(8))
        stub_urls[package_name] = url

    # clean up any duplicate repositories
    wanted_urls = set(stub_urls.values())
    current_universes = sdk_cmd.run_cli('package repo list --json')
    for repo in json.loads(current_universes)['repositories']:
        if repo['uri'] in wanted_urls:
            log.info('Removing duplicate stub URL: {}'.format(repo['uri']))
            sdk_cmd.run_cli('package repo remove {}'.format(repo['name']))

    # add the needed universe repositories
    for name, url in stub_urls.items():
        log.info('Adding stub URL: {}'.format(url))
        sdk_cmd.run_cli('package repo add --index=0 {} {}'.format(name, url))

    log.info('Finished adding universe repos')

    return stub_urls
def get_diagnostics_bundle(item: pytest.Item):
    """Create a cluster diagnostics bundle and download it for the given test item.

    Bundle creation is started via the CLI, its status is polled every
    5 seconds for up to 10 minutes, and the finished bundle is downloaded to
    the location given by `setup_artifact_path(item, bundle_filename)`.
    Failures are logged rather than raised.

    :param item: The pytest item the bundle is collected for.
    """
    rc, _, _ = sdk_cmd.run_raw_cli('node diagnostics create all')
    if rc:
        log.error('Diagnostics bundle creation failed.')
        return

    @retrying.retry(
        wait_fixed=5000,
        stop_max_delay=10 * 60 * 1000,
        retry_on_result=lambda result: result is None)
    def wait_for_bundle_file():
        rc, stdout, _ = sdk_cmd.run_raw_cli('node diagnostics --status --json')
        if rc:
            return None

        # e.g. { "some-ip": { stuff we want } }
        status = next(iter(json.loads(stdout).values()))
        if status['job_progress_percentage'] != 100:
            return None

        # e.g. "/var/lib/dcos/dcos-diagnostics/diag-bundles/bundle-2018-01-11-1515698691.zip"
        return os.path.basename(status['last_bundle_dir'])

    # When stop_max_delay elapses while the result is still None, `retrying`
    # raises RetryError instead of returning None. Catch it so the timeout is
    # logged below rather than propagating out of this best-effort helper.
    try:
        bundle_filename = wait_for_bundle_file()
    except retrying.RetryError:
        bundle_filename = None

    if bundle_filename:
        sdk_cmd.run_cli('node diagnostics download {} --location={}'.format(
            bundle_filename, setup_artifact_path(item, bundle_filename)))
    else:
        log.error('Diagnostics bundle didnt finish in time, giving up.')
def _upgrade_or_downgrade(
        package_name,
        to_package_version,
        service_name,
        running_task_count,
        additional_options,
        timeout_seconds,
        wait_for_deployment):
    """Move a running service to `to_package_version` and optionally verify the rollout.

    On DC/OS older than 1.10, or when `shakedown.ee_version()` is None, the
    marathon app is destroyed and the package reinstalled at the target
    version. Otherwise the service CLI's `update start` command is used and
    the package CLI is reinstalled at the matching version.

    When `wait_for_deployment` is set, the service config is compared before
    and after: unchanged config means tasks must not have restarted, changed
    config means all tasks must have restarted. The deployment plan is then
    awaited for up to `timeout_seconds`.
    """
    config_before = get_config(package_name, service_name)
    original_task_ids = sdk_tasks.get_task_ids(service_name, '')

    use_marathon_flow = (
        sdk_utils.dcos_version_less_than("1.10") or shakedown.ee_version() is None)

    if use_marathon_flow:
        log.info('Using marathon upgrade flow to upgrade {} {}'.format(package_name, to_package_version))
        sdk_marathon.destroy_app(service_name)
        sdk_install.install(
            package_name,
            service_name,
            running_task_count,
            additional_options=additional_options,
            package_version=to_package_version,
            timeout_seconds=timeout_seconds,
            wait_for_deployment=wait_for_deployment)
    else:
        log.info('Using CLI upgrade flow to upgrade {} {}'.format(package_name, to_package_version))
        update_cmd = 'update start --package-version={}'.format(to_package_version)
        if additional_options:
            with tempfile.NamedTemporaryFile() as options_file:
                options_file.write(json.dumps(additional_options).encode('utf-8'))
                # flush so the JSON content is on disk before the CLI reads the file
                options_file.flush()
                sdk_cmd.svc_cli(
                    package_name,
                    service_name,
                    update_cmd + ' --options={}'.format(options_file.name))
        else:
            sdk_cmd.svc_cli(package_name, service_name, update_cmd)

        # The package CLI is not upgraded automatically in this flow (that
        # would mean the package CLI replacing itself through the main CLI),
        # so reinstall it at the matching version by hand.
        sdk_cmd.run_cli(
            'package install --yes --cli --package-version={} {}'.format(to_package_version, package_name))

    if not wait_for_deployment:
        return

    config_after = get_config(package_name, service_name)
    if config_after == config_before:
        log.info('No config change detected. Tasks should not be restarted')
        sdk_tasks.check_tasks_not_updated(service_name, '', original_task_ids)
    else:
        log.info('Checking that all tasks have restarted')
        sdk_tasks.check_tasks_updated(service_name, '', original_task_ids)

    # This can take a while (default is 15 minutes). With HDFS, for example,
    # the expected total task count can be reached via ONCE tasks without the
    # deployment actually being complete, so wait on the plan itself.
    log.info("Waiting for package={} service={} to finish deployment plan...".format(
        package_name, service_name))
    sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)
def create_service_account(service_account_name: str, service_account_secret: str) -> None:
    """
    Creates a service account and its secret through the enterprise CLI.

    Installs dcos-enterprise-cli, removes any existing account/secret of the same
    name, generates private-key.pem / public-key.pem in the working directory,
    then creates the account and its secret from that keypair.

    :param service_account_name (str): Name of the service account to create.
    :param service_account_secret (str): Name of the secret to create for the account.
    """
    labels = {'account': service_account_name, 'secret': service_account_secret}

    log.info('Creating service account for account={account} secret={secret}'.format(**labels))

    log.info('Install cli necessary for security')
    sdk_cmd.run_cli('package install dcos-enterprise-cli --yes')

    log.info('Remove any existing service account and/or secret')
    delete_service_account(service_account_name, service_account_secret)

    log.info('Create keypair')
    sdk_cmd.run_cli('security org service-accounts keypair private-key.pem public-key.pem')

    log.info('Create service account')
    create_account_cmd = (
        'security org service-accounts create -p public-key.pem '
        '-d "Service account for integration tests" "{account}"')
    sdk_cmd.run_cli(create_account_cmd.format(account=service_account_name))

    log.info('Create secret')
    create_secret_cmd = (
        'security secrets create-sa-secret --strict private-key.pem "{account}" "{secret}"')
    sdk_cmd.run_cli(create_secret_cmd.format(**labels))

    log.info('Service account created for account={account} secret={secret}'.format(**labels))
def _get_universe_url():
    """Return the URI of the package repository named 'Universe'.

    :return (str): The 'uri' field of the repository whose name is 'Universe'.
    :raises AssertionError: If no repository named 'Universe' is listed.
    """
    repositories = json.loads(sdk_cmd.run_cli('package repo list --json'))['repositories']
    for repo in repositories:
        if repo['name'] == 'Universe':
            log.info("Found Universe URL: {}".format(repo['uri']))
            return repo['uri']
    # Raise explicitly instead of `assert False`: assert statements are
    # stripped under `python -O`, which would silently return None here.
    raise AssertionError(
        "Unable to find 'Universe' in list of repos: {}".format(repositories))
def delete_service_account(service_account_name: str, service_account_secret: str) -> None:
    """
    Removes a service account, its local keypair files, and its secret.

    :param service_account_name (str): Name of the service account to delete.
    :param service_account_secret (str): Name of the secret to delete.
    """
    # ignore any failures:
    delete_cmd = "security org service-accounts delete {name}".format(
        name=service_account_name)
    sdk_cmd.run_cli(delete_cmd)

    # Clean up the files produced by the service-accounts keypair command;
    # a file that is already gone is not an error.
    for pem_path in ('private-key.pem', 'public-key.pem'):
        try:
            os.unlink(pem_path)
        except OSError:
            pass

    delete_secret(secret=service_account_secret)
def get_task_ids():
    """
    Yields the fifth whitespace-separated column of every row of `dcos task --all`.

    The plain-text listing is parsed instead of the JSON output because the
    JSON variant can report the wrong user for schedulers.
    """
    listing = sdk_cmd.run_cli('task --all', print_output=False)
    rows = listing.split('\n')[1:]  # skip the header row
    for row in rows:
        columns = row.split()
        # rows with fewer than five columns (e.g. blank lines) carry no ID
        if len(columns) >= 5:
            yield columns[4]
def _get_kdc_task(task_name: str) -> dict:
    """
    Looks up a task by name in the output of `dcos task --json`.

    :param task_name (str): Name of the task to find.
    :return (dict): The first listed task object whose "name" equals task_name.
    :raises RuntimeError: If the CLI returns nothing or no task has that name.
    """
    log.info("Getting KDC task")
    raw_tasks = sdk_cmd.run_cli("task --json")

    # Empty CLI output is treated the same as an empty task list.
    candidates = json.loads(raw_tasks) if raw_tasks else []
    for candidate in candidates:
        if candidate["name"] == task_name:
            return candidate

    raise RuntimeError(
        "Expecting marathon KDC task but no such task found. Running tasks: {tasks}".format(
            tasks=raw_tasks))
def _get_host_name(host_id: str) -> str:
    """
    Fetches the host name for the host running the KDC app.

    :param host_id (str): The ID of the host, used to look up the appropriate node.
    :return (str): Name of the host running the KDC app.
    :raises RuntimeError: If the CLI returns nothing or no node has the given ID.
    """
    log.info("Getting hostname")
    raw_nodes = sdk_cmd.run_cli("node --json")
    if raw_nodes:
        nodes = json.loads(raw_nodes)
        for node in nodes:
            if "id" in node and node["id"] == host_id:
                log.info("Host name is %s", node["hostname"])
                return node["hostname"]

    # Fill in the {nodes} placeholder so the error shows what the CLI returned;
    # previously the literal text "{nodes}" was emitted.
    raise RuntimeError("Failed to get name of host running the KDC app: {nodes}".format(
        nodes=raw_nodes))
def get_task_files_for_id(task_id: str) -> dict:
    """List the files of a task via `dcos task ls --long --all`.

    Lines that do not match `task_ls_pattern` are logged and skipped.

    :param task_id: ID of the task whose files are listed.
    :return: Mapping of file name to a 'MMDD_HHMM' timestamp string built from
        the listing's modification time; an empty dict if anything fails.
    """
    try:
        ls_lines = sdk_cmd.run_cli(
            'task ls --long --all {}'.format(task_id)).split('\n')
        ret = {}
        for line in ls_lines:
            match = task_ls_pattern.match(line)
            if not match:
                log.warning('Unable to parse line: {}'.format(line))
                continue
            # match.group(1): "4096  ", match.group(2): "Jul 21 22:07", match.group(3): "jre1.8.0_144  "
            filename = match.group(3).strip()
            # build timestamp for use in output filename: 'Jul 21 22:07' => '0721_2207'
            timestamp = time.strftime(
                '%m%d_%H%M', time.strptime(match.group(2), '%b %d %H:%M'))
            ret[filename] = timestamp
        return ret
    except Exception:
        # Best-effort: log and report no files. `Exception` (not a bare except)
        # so KeyboardInterrupt/SystemExit still propagate.
        log.exception(
            'Failed to get list of files for task: {}'.format(task_id))
        return {}
def delete_secret(secret: str) -> None:
    """
    Removes the named secret through the security CLI.

    :param secret (str): Name of the secret to delete.
    """
    # ignore any failures:
    delete_cmd = "security secrets delete {}".format(secret)
    sdk_cmd.run_cli(delete_cmd)
def _uninstall(package_name, service_name, role=None, service_account=None, zk=None):
    """Uninstall a service, using a flow chosen by DC/OS version.

    On DC/OS older than 1.10 the package is uninstalled and the
    mesosphere/janitor docker image is then run on the master with the given
    (or derived) role, service account and zk values. On 1.10 and later the
    package is uninstalled, then marathon is polled until it reports no
    deployments and no app with the service's ID; reserved resources are
    listed afterwards in all cases.

    :param package_name: Name of the package to uninstall.
    :param service_name: Name of the service instance; may contain slashes.
    :param role: Role passed to janitor; defaults to '<deslashed name>-role'.
    :param service_account: Principal passed to janitor; defaults to '<service_name>-principal'.
    :param zk: ZK node passed to janitor; defaults to 'dcos-service-<deslashed name>'.
    :raises: Re-raises DCOSException/ValueError from the uninstall when the
        message mentions 'marathon'; other such errors are only logged.
    """
    start = time.time()

    if sdk_utils.dcos_version_less_than('1.10'):
        log.info('Uninstalling/janitoring {}'.format(service_name))
        try:
            shakedown.uninstall_package_and_wait(package_name, service_name=service_name)
        except (dcos.errors.DCOSException, ValueError) as e:
            log.info('Got exception when uninstalling package, ' + 'continuing with janitor anyway: {}'.format(e))
            # Only marathon-related errors are re-raised; anything else falls
            # through to the janitor step below.
            if 'marathon' in str(e):
                log.info(
                    'Detected a probable marathon flake. Raising so retry will trigger.'
                )
                raise

        janitor_start = time.time()

        # leading slash removed, other slashes converted to double underscores:
        deslashed_service_name = service_name.lstrip('/').replace('/', '__')
        if role is None:
            role = deslashed_service_name + '-role'
        # NOTE(review): the default principal uses the raw service_name (slashes
        # kept), unlike role and zk -- confirm this is intentional.
        if service_account is None:
            service_account = service_name + '-principal'
        if zk is None:
            zk = 'dcos-service-' + deslashed_service_name
        janitor_cmd = (
            'docker run mesosphere/janitor /janitor.py '
            '-r {role} -p {service_account} -z {zk} --auth_token={auth}')
        # The auth token is read from the local CLI config and passed on the
        # janitor command line.
        shakedown.run_command_on_master(
            janitor_cmd.format(role=role, service_account=service_account, zk=zk, auth=sdk_cmd.run_cli(
                'config show core.dcos_acs_token', print_output=False).strip()))

        finish = time.time()

        log.info(
            'Uninstall done after pkg({}) + janitor({}) = total({})'.format(
                shakedown.pretty_duration(janitor_start - start),
                shakedown.pretty_duration(finish - janitor_start),
                shakedown.pretty_duration(finish - start)))
    else:
        log.info('Uninstalling {}'.format(service_name))
        try:
            shakedown.uninstall_package_and_wait(package_name, service_name=service_name)
            # service_name may already contain a leading slash:
            marathon_app_id = '/' + service_name.lstrip('/')
            log.info(
                'Waiting for no deployments for {}'.format(marathon_app_id))
            shakedown.deployment_wait(TIMEOUT_SECONDS, marathon_app_id)

            # wait for service to be gone according to marathon
            client = shakedown.marathon.create_client()

            def marathon_dropped_service():
                # Predicate for time_wait: True once marathon lists no app
                # whose id equals marathon_app_id.
                app_ids = [app['id'] for app in client.get_apps()]
                log.info('Marathon apps: {}'.format(app_ids))
                matching_app_ids = [
                    app_id for app_id in app_ids if app_id == marathon_app_id
                ]
                if len(matching_app_ids) > 1:
                    log.warning('Found multiple apps with id {}'.format(
                        marathon_app_id))
                return len(matching_app_ids) == 0

            log.info('Waiting for no {} Marathon app'.format(marathon_app_id))
            shakedown.time_wait(marathon_dropped_service, timeout_seconds=TIMEOUT_SECONDS)
        except (dcos.errors.DCOSException, ValueError) as e:
            log.info('Got exception when uninstalling package: {}'.format(e))
            # Only marathon-related errors are re-raised; others are logged and dropped.
            if 'marathon' in str(e):
                log.info(
                    'Detected a probable marathon flake. Raising so retry will trigger.'
                )
                raise
        finally:
            # Runs whether or not the uninstall succeeded or raised.
            sdk_utils.list_reserved_resources()