def uninstall(
        package_name,
        service_name,
        role=None,
        service_account=None,
        zk=None):
    '''Uninstalls a service's package, then cleans up after it.

    The package uninstall runs first. Cleanup depends on the cluster version: when
    sdk_utils.dcos_version_less_than('1.10') is true the janitor is run with the given role,
    service_account and zk; otherwise this waits (with a TIMEOUT_SECONDS timeout) for the
    service's Marathon deployment and app to be removed.

    Reserved resources are listed after each phase, whether or not it succeeded. A failure in
    either phase is logged together with its traceback and then re-raised.
    '''
    began = time.time()

    # Drop the service from _installed_service_names before doing anything else. A KeyError
    # only means it was never recorded there, e.g. when a test uninstalls up-front.
    global _installed_service_names
    try:
        _installed_service_names.remove(service_name)
    except KeyError:
        pass

    # Phase 1: package uninstall.
    log.info('Uninstalling {}'.format(service_name))
    try:
        retried_uninstall_package_and_wait(package_name, service_name=service_name)
    except Exception:
        log.info('Got exception when uninstalling {}'.format(service_name))
        log.info(traceback.format_exc())
        raise
    finally:
        log.info('Reserved resources post uninstall:')
        sdk_utils.list_reserved_resources()

    # Phase 2: version-dependent cleanup.
    cleanup_began = time.time()
    try:
        if not sdk_utils.dcos_version_less_than('1.10'):
            log.info('Waiting for Marathon app to be removed {}'.format(service_name))
            app_id = sdk_marathon.get_app_id(service_name)
            sdk_marathon.retried_wait_for_deployment_and_app_removal(
                app_id, timeout=TIMEOUT_SECONDS)
        else:
            log.info('Janitoring {}'.format(service_name))
            retried_run_janitor(service_name, role, service_account, zk)
    except Exception:
        log.info('Got exception when cleaning up {}'.format(service_name))
        log.info(traceback.format_exc())
        raise
    finally:
        log.info('Reserved resources post cleanup:')
        sdk_utils.list_reserved_resources()

    # Summarize how long each phase took.
    ended = time.time()
    pkg_duration = shakedown.pretty_duration(cleanup_began - began)
    cleanup_duration = shakedown.pretty_duration(ended - cleanup_began)
    total_duration = shakedown.pretty_duration(ended - began)
    log.info(
        'Uninstalled {} after pkg({}) + cleanup({}) = total({})'.format(
            service_name, pkg_duration, cleanup_duration, total_duration))
def uninstall(
        package_name,
        service_name,
        role=None,
        service_account=None,
        zk=None):
    '''Uninstalls the package for service_name and cleans up what it leaves behind.

    Args:
        package_name: Package to uninstall.
        service_name: Name of the service instance being removed.
        role: Forwarded to the janitor (only used when the cluster is older than 1.10).
        service_account: Forwarded to the janitor (only used when the cluster is older than 1.10).
        zk: Forwarded to the janitor (only used when the cluster is older than 1.10).

    Raises:
        Exception: Whatever the package uninstall or the cleanup step raised. The failure is
            logged with its traceback before being re-raised.
    '''
    start = time.time()

    global _installed_service_names
    try:
        _installed_service_names.remove(service_name)
    except KeyError:
        pass  # allow tests to 'uninstall' up-front

    log.info('Uninstalling {}'.format(service_name))
    try:
        retried_uninstall_package_and_wait(package_name, service_name=service_name)
    except Exception:
        # log.exception attaches the active traceback to the message, so there is no need to
        # bind the exception or format the traceback by hand.
        log.exception('Got exception when uninstalling {}'.format(service_name))
        raise
    finally:
        # Listed on success and on failure alike, to show what the uninstall left reserved.
        log.info('Reserved resources post uninstall:')
        sdk_utils.list_reserved_resources()

    cleanup_start = time.time()
    try:
        if sdk_utils.dcos_version_less_than('1.10'):
            log.info('Janitoring {}'.format(service_name))
            retried_run_janitor(service_name, role, service_account, zk)
        else:
            log.info('Waiting for Marathon app to be removed {}'.format(service_name))
            sdk_marathon.retried_wait_for_deployment_and_app_removal(
                sdk_marathon.get_app_id(service_name), timeout=TIMEOUT_SECONDS)
    except Exception:
        log.exception('Got exception when cleaning up {}'.format(service_name))
        raise
    finally:
        log.info('Reserved resources post cleanup:')
        sdk_utils.list_reserved_resources()

    finish = time.time()
    log.info(
        'Uninstalled {} after pkg({}) + cleanup({}) = total({})'.format(
            service_name,
            shakedown.pretty_duration(cleanup_start - start),
            shakedown.pretty_duration(finish - cleanup_start),
            shakedown.pretty_duration(finish - start)))
def uninstall(package_name, service_name):
    '''Removes the given service from the cluster and checks that nothing was left behind.

    After the package uninstall and the version-dependent cleanup have finished, the service's
    resources and framework are verified to be gone. Agents that are expected to keep orphaned
    resources (e.g. because they were shut down) should be registered via ignore_dead_agent()
    before calling this.
    '''
    began = time.time()

    # Phase 1: package uninstall.
    log.info('Uninstalling {}'.format(service_name))
    try:
        _retried_uninstall_package_and_wait(package_name, service_name=service_name)
    except Exception:
        log.exception('Got exception when uninstalling {}'.format(service_name))
        raise

    # Phase 2: cleanup, which differs by cluster version.
    cleanup_began = time.time()
    try:
        if not sdk_utils.dcos_version_less_than('1.10'):
            # 1.10+: the uninstall scheduler does the work; wait until Cosmos has removed it.
            log.info('Waiting for Marathon app to be removed {}'.format(service_name))
            app_id = sdk_marathon.get_app_id(service_name)
            sdk_marathon.retried_wait_for_deployment_and_app_removal(
                app_id, timeout=TIMEOUT_SECONDS)
        else:
            # 1.9 and older: resources have to be unreserved by the janitor.
            log.info('Janitoring {}'.format(service_name))
            _retried_run_janitor(service_name)
    except Exception:
        log.exception('Got exception when cleaning up {}'.format(service_name))
        raise

    ended = time.time()
    pkg_duration = shakedown.pretty_duration(cleanup_began - began)
    cleanup_duration = shakedown.pretty_duration(ended - cleanup_began)
    total_duration = shakedown.pretty_duration(ended - began)
    log.info(
        'Uninstalled {} after pkg({}) + cleanup({}) = total({})'.format(
            service_name, pkg_duration, cleanup_duration, total_duration))

    # Leftover resources or a lingering framework point at an uninstall bug, so this check
    # raises if it finds any.
    _verify_completed_uninstall(service_name)

    # Last step: forget the service in the installed list that sdk_diag consults. Tests which
    # uninstall preemptively at their start never added it, hence the tolerated KeyError.
    global _installed_service_names
    try:
        _installed_service_names.remove(service_name)
    except KeyError:
        pass
def install(
        package_name,
        service_name,
        expected_running_tasks,
        additional_options=None,
        package_version=None,
        timeout_seconds=TIMEOUT_SECONDS,
        wait_for_deployment=True):
    '''Installs package_name as service_name and optionally waits for its deployment plan.

    Args:
        package_name: Package to install.
        service_name: Name to install the service under.
        expected_running_tasks: Forwarded to retried_shakedown_install.
        additional_options: Options passed through get_package_options() before installing.
            Defaults to an empty dict created per call.
        package_version: Package version to install; None is forwarded as-is.
        timeout_seconds: Timeout used for both the install and the deployment-plan wait.
        wait_for_deployment: When true, block until the deployment plan has completed.
    '''
    start = time.time()

    # A `{}` default would be one dict shared by every call; build a fresh one instead.
    if additional_options is None:
        additional_options = {}

    merged_options = get_package_options(additional_options)

    log.info('Installing {}/{} with options={} version={}'.format(
        package_name, service_name, merged_options, package_version))

    # 1. Install package, wait for tasks, wait for marathon deployment
    retried_shakedown_install(
        package_name,
        service_name,
        package_version,
        merged_options,
        timeout_seconds,
        expected_running_tasks)

    # 2. Wait for the scheduler to be idle (as implied by deploy plan completion and suppressed bit)
    # This should be skipped ONLY when it's known that the scheduler will be stuck in an incomplete state.
    if wait_for_deployment:
        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via ONCE tasks, without actually completing deployment
        log.info("Waiting for {}/{} to finish deployment plan...".format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)

    log.info('Installed {}/{} after {}'.format(
        package_name, service_name, shakedown.pretty_duration(time.time() - start)))
def install(
        package_name,
        service_name,
        expected_running_tasks,
        additional_options=None,
        package_version=None,
        timeout_seconds=TIMEOUT_SECONDS,
        wait_for_deployment=True,
        insert_strict_options=True,
        install_cli=True):
    '''Installs package_name as service_name and records the service as installed.

    Args:
        package_name: Package to install.
        service_name: Name to install the service under.
        expected_running_tasks: Forwarded to _retried_install_impl.
        additional_options: Package options. Defaults to an empty dict created per call.
        package_version: Forwarded to _retried_install_impl.
        timeout_seconds: Timeout used for both the install and the deployment-plan wait.
        wait_for_deployment: When true, block until the deployment plan has completed.
        insert_strict_options: When true and the cluster is in strict mode, merge the
            service-account options into additional_options.
        install_cli: Forwarded to _retried_install_impl.

    Raises:
        dcos.errors.DCOSException: If a Marathon app for service_name already exists.
    '''
    start = time.time()

    # A `{}` default would be one dict shared by every call, and it is forwarded un-copied
    # below when strict options are not inserted. Build a fresh one instead.
    if additional_options is None:
        additional_options = {}

    # If the package is already installed at this point, fail immediately.
    if sdk_marathon.app_exists(service_name):
        raise dcos.errors.DCOSException('Service is already installed: {}'.format(service_name))

    if insert_strict_options and sdk_utils.is_strict_mode():
        # strict mode requires correct principal and secret to perform install.
        # see also: sdk_security.py
        options = merge_dictionaries({
            'service': {
                'service_account': 'service-acct',
                'principal': 'service-acct',
                'service_account_secret': 'secret',
                'secret_name': 'secret'
            }
        }, additional_options)
    else:
        options = additional_options

    # 1. Install package, wait for tasks, wait for marathon deployment
    _retried_install_impl(
        package_name,
        service_name,
        expected_running_tasks,
        options,
        package_version,
        timeout_seconds,
        install_cli)

    # 2. Wait for the scheduler to be idle (as implied by deploy plan completion and suppressed bit)
    # This should be skipped ONLY when it's known that the scheduler will be stuck in an incomplete
    # state, or if the thing being installed doesn't have a deployment plan (e.g. standalone app)
    if wait_for_deployment:
        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via FINISHED tasks, without actually completing deployment
        log.info('Waiting for package={} service={} to finish deployment plan...'.format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)

    log.info('Installed package={} service={} after {}'.format(
        package_name, service_name, shakedown.pretty_duration(time.time() - start)))

    # Only recorded once the install (and any deployment wait) has gone through.
    global _installed_service_names
    _installed_service_names.add(service_name)
def install(
        package_name,
        service_name,
        expected_running_tasks,
        additional_options=None,
        package_version=None,
        timeout_seconds=TIMEOUT_SECONDS,
        wait_for_deployment=True,
        insert_strict_options=True):
    '''Installs package_name as service_name and records the service as installed.

    Args:
        package_name: Package to install.
        service_name: Name to install the service under.
        expected_running_tasks: Forwarded to _retried_install_impl.
        additional_options: Package options. Defaults to an empty dict created per call.
        package_version: Forwarded to _retried_install_impl.
        timeout_seconds: Timeout used for both the install and the deployment-plan wait.
        wait_for_deployment: When true, block until the deployment plan has completed.
        insert_strict_options: When true and the cluster is in strict mode, merge the
            service-account options into additional_options.

    Raises:
        dcos.errors.DCOSException: If a Marathon app for service_name already exists.
    '''
    start = time.time()

    # A `{}` default would be one dict shared by every call, and it is forwarded un-copied
    # below when strict options are not inserted. Build a fresh one instead.
    if additional_options is None:
        additional_options = {}

    # If the package is already installed at this point, fail immediately.
    if sdk_marathon.app_exists(service_name):
        raise dcos.errors.DCOSException('Service is already installed: {}'.format(service_name))

    if insert_strict_options and sdk_utils.is_strict_mode():
        # strict mode requires correct principal and secret to perform install.
        # see also: sdk_security.py
        options = merge_dictionaries({
            'service': {
                'service_account': 'service-acct',
                'principal': 'service-acct',
                'service_account_secret': 'secret',
                'secret_name': 'secret'
            }
        }, additional_options)
    else:
        options = additional_options

    # 1. Install package, wait for tasks, wait for marathon deployment
    _retried_install_impl(
        package_name,
        service_name,
        expected_running_tasks,
        options,
        package_version,
        timeout_seconds)

    # 2. Wait for the scheduler to be idle (as implied by deploy plan completion and suppressed bit)
    # This should be skipped ONLY when it's known that the scheduler will be stuck in an incomplete
    # state, or if the thing being installed doesn't have a deployment plan (e.g. standalone app)
    if wait_for_deployment:
        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via FINISHED tasks, without actually completing deployment
        log.info('Waiting for package={} service={} to finish deployment plan...'.format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)

    log.info('Installed package={} service={} after {}'.format(
        package_name, service_name, shakedown.pretty_duration(time.time() - start)))

    # Only recorded once the install (and any deployment wait) has gone through.
    global _installed_service_names
    _installed_service_names.add(service_name)
def install(
        package_name,
        expected_running_tasks,
        service_name=None,
        additional_options=None,
        package_version=None,
        timeout_seconds=TIMEOUT_SECONDS,
        wait_for_deployment=True):
    '''Installs package_name and optionally waits for the scheduler to become idle.

    Args:
        package_name: Package to install.
        expected_running_tasks: Forwarded to retried_shakedown_install.
        service_name: Name to install the service under; falls back to package_name when
            empty or None.
        additional_options: Options passed through get_package_options() before installing.
            Defaults to an empty dict created per call.
        package_version: Forwarded to retried_shakedown_install.
        timeout_seconds: Timeout used for both the install and the deployment-plan wait.
        wait_for_deployment: When true, wait for the deployment plan to complete and, on
            clusters that are not older than 1.9, for the service to report as suppressed.
    '''
    if not service_name:
        service_name = package_name

    # A `{}` default would be one dict shared by every call; build a fresh one instead.
    if additional_options is None:
        additional_options = {}

    start = time.time()
    merged_options = get_package_options(additional_options)

    log.info('Installing {}/{} with options={} version={}'.format(
        package_name, service_name, merged_options, package_version))

    # 1. Install package, wait for tasks, wait for marathon deployment
    retried_shakedown_install(
        package_name,
        package_version,
        service_name,
        merged_options,
        timeout_seconds,
        expected_running_tasks)

    # 2. Wait for the scheduler to be idle (as implied by deploy plan completion and suppressed bit)
    # This should be skipped ONLY when it's known that the scheduler will be stuck in an incomplete state.
    if wait_for_deployment:
        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via FINISHED tasks, without actually completing deployment
        log.info("Waiting for {}/{} to finish deployment plan...".format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)

        # given the above wait for plan completion, here we just wait up to 5 minutes
        if shakedown.dcos_version_less_than("1.9"):
            log.info(
                "Skipping `is_suppressed` check for %s/%s as this is only suppored starting in version 1.9",
                package_name, service_name)
        else:
            log.info("Waiting for %s/%s to be suppressed...", package_name, service_name)
            shakedown.wait_for(
                lambda: sdk_api.is_suppressed(service_name),
                noisy=True,
                timeout_seconds=5 * 60)

    log.info('Installed {}/{} after {}'.format(
        package_name, service_name, shakedown.pretty_duration(time.time() - start)))
def install(
        package_name,
        service_name,
        expected_running_tasks,
        additional_options=None,
        package_version=None,
        timeout_seconds=TIMEOUT_SECONDS,
        wait_for_deployment=True):
    '''Installs package_name as service_name and optionally waits for the scheduler to be idle.

    Args:
        package_name: Package to install.
        service_name: Name to install the service under.
        expected_running_tasks: Forwarded to retried_shakedown_install.
        additional_options: Options passed through get_package_options() before installing.
            Defaults to an empty dict created per call.
        package_version: Forwarded to retried_shakedown_install.
        timeout_seconds: Timeout used for both the install and the deployment-plan wait.
        wait_for_deployment: When true, wait for the deployment plan to complete and, on
            clusters that are not older than 1.9, for the service to report as suppressed.
    '''
    start = time.time()

    # A `{}` default would be one dict shared by every call; build a fresh one instead.
    if additional_options is None:
        additional_options = {}

    merged_options = get_package_options(additional_options)

    log.info('Installing {}/{} with options={} version={}'.format(
        package_name, service_name, merged_options, package_version))

    # 1. Install package, wait for tasks, wait for marathon deployment
    retried_shakedown_install(
        package_name,
        service_name,
        package_version,
        merged_options,
        timeout_seconds,
        expected_running_tasks)

    # 2. Wait for the scheduler to be idle (as implied by deploy plan completion and suppressed bit)
    # This should be skipped ONLY when it's known that the scheduler will be stuck in an incomplete state.
    if wait_for_deployment:
        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via FINISHED tasks, without actually completing deployment
        log.info("Waiting for {}/{} to finish deployment plan...".format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)

        # given the above wait for plan completion, here we just wait up to 5 minutes
        if shakedown.dcos_version_less_than("1.9"):
            log.info(
                "Skipping `is_suppressed` check for %s/%s as this is only suppored starting in version 1.9",
                package_name, service_name)
        else:
            log.info("Waiting for %s/%s to be suppressed...", package_name, service_name)
            shakedown.wait_for(
                lambda: sdk_api.is_suppressed(service_name),
                noisy=True,
                timeout_seconds=5 * 60)

    log.info('Installed {}/{} after {}'.format(
        package_name, service_name, shakedown.pretty_duration(time.time() - start)))
def install(package_name,
            running_task_count,
            service_name=None,
            additional_options=None,
            package_version=None,
            check_suppression=True,
            timeout_seconds=15 * 60):
    '''Installs package_name via shakedown and optionally waits for it to be suppressed.

    Args:
        package_name: Package to install.
        running_task_count: Passed to shakedown.install_package as expected_running_tasks.
        service_name: Name to install the service under; falls back to package_name when
            empty or None.
        additional_options: Options passed through get_package_options() before installing.
            Defaults to an empty dict created per call.
        package_version: Passed to shakedown.install_package.
        check_suppression: When true, wait up to 5 minutes for the framework to report as
            suppressed.
        timeout_seconds: Passed to shakedown.install_package as timeout_sec.
    '''
    if not service_name:
        service_name = package_name

    # A `{}` default would be one dict shared by every call; build a fresh one instead.
    if additional_options is None:
        additional_options = {}

    start = time.time()
    merged_options = get_package_options(additional_options)

    sdk_utils.out('Installing {} with options={} version={}'.format(
        package_name, merged_options, package_version))

    # 1. Install package, wait for tasks, wait for marathon deployment
    shakedown.install_package(
        package_name,
        package_version=package_version,
        service_name=service_name,
        options_json=merged_options,
        wait_for_completion=True,
        timeout_sec=timeout_seconds,
        expected_running_tasks=running_task_count)

    # 2. Ensure the framework is suppressed.
    #
    # This is only configurable in order to support installs from
    # Universe during the upgrade_downgrade tests, because currently
    # the suppression endpoint isn't supported by all frameworks in
    # Universe. It can be removed once all frameworks rely on
    # dcos-commons >= 0.13.
    if check_suppression:
        sdk_utils.out("Waiting for framework to be suppressed...")
        shakedown.wait_for(
            lambda: sdk_api.is_suppressed(service_name),
            noisy=True,
            timeout_seconds=5 * 60)

    sdk_utils.out('Install done after {}'.format(
        shakedown.pretty_duration(time.time() - start)))
def uninstall(service_name, package_name=None, role=None, principal=None, zk=None):
    '''Uninstalls a service, using the procedure that matches the cluster version.

    package_name falls back to service_name when it is None.

    When shakedown.dcos_version_less_than("1.10") is false: the package is uninstalled, then
    this waits for the app's Marathon deployments to finish and for Marathon to stop listing
    the app. DCOSException and ValueError are reported and swallowed, and reserved resources
    are listed at the end in every case.

    Otherwise: the package is uninstalled (DCOSException and ValueError are reported and
    ignored) and the mesosphere/janitor docker image is run on the master. role, principal
    and zk are derived from the service name when they are None.
    '''
    began = time.time()
    if package_name is None:
        package_name = service_name

    if not shakedown.dcos_version_less_than("1.10"):
        sdk_utils.out('Uninstalling {}'.format(service_name))
        try:
            shakedown.uninstall_package_and_wait(package_name, service_name=service_name)

            # Normalize to exactly one leading slash; the caller may or may not have given one.
            marathon_app_id = '/' + service_name.lstrip('/')
            sdk_utils.out(
                'Waiting for no deployments for {}'.format(marathon_app_id))
            shakedown.deployment_wait(600, marathon_app_id)

            def marathon_dropped_service():
                # True once Marathon no longer lists the app.
                marathon = shakedown.marathon.create_client()
                listed_ids = [app['id'] for app in marathon.get_apps()]
                sdk_utils.out('Marathon apps: {}'.format(listed_ids))
                hits = listed_ids.count(marathon_app_id)
                if hits > 1:
                    sdk_utils.out('Found multiple apps with id {}'.format(
                        marathon_app_id))
                return hits == 0

            sdk_utils.out(
                'Waiting for no {} Marathon app'.format(marathon_app_id))
            shakedown.time_wait(marathon_dropped_service)
        except (dcos.errors.DCOSException, ValueError) as err:
            sdk_utils.out(
                'Got exception when uninstalling package: {}'.format(err))
        finally:
            sdk_utils.list_reserved_resources()
        return

    # Cluster is older than 1.10: uninstall the package, then run the janitor.
    sdk_utils.out('Uninstalling/janitoring {}'.format(service_name))
    try:
        shakedown.uninstall_package_and_wait(package_name, service_name=service_name)
    except (dcos.errors.DCOSException, ValueError) as err:
        detail = 'continuing with janitor anyway: {}'.format(err)
        sdk_utils.out('Got exception when uninstalling package, ' + detail)

    janitor_began = time.time()

    # Strip the leading slash and turn every remaining slash into a double underscore.
    flat_name = service_name.lstrip('/').replace('/', '__')
    role = flat_name + '-role' if role is None else role
    # Note: the principal default is built from the name as given, not the de-slashed one.
    principal = service_name + '-principal' if principal is None else principal
    zk = 'dcos-service-' + flat_name if zk is None else zk

    auth_token = shakedown.run_dcos_command(
        'config show core.dcos_acs_token')[0].strip()
    janitor_template = (
        'docker run mesosphere/janitor /janitor.py '
        '-r {role} -p {principal} -z {zk} --auth_token={auth}')
    shakedown.run_command_on_master(
        janitor_template.format(role=role, principal=principal, zk=zk, auth=auth_token))

    ended = time.time()
    pkg_duration = shakedown.pretty_duration(janitor_began - began)
    janitor_duration = shakedown.pretty_duration(ended - janitor_began)
    total_duration = shakedown.pretty_duration(ended - began)
    sdk_utils.out(
        'Uninstall done after pkg({}) + janitor({}) = total({})'.format(
            pkg_duration, janitor_duration, total_duration))
def _uninstall(
        package_name,
        service_name,
        role=None,
        service_account=None,
        zk=None):
    '''Uninstalls a service, using the procedure that matches the cluster version.

    When shakedown.dcos_version_less_than("1.10") is false: the package is uninstalled, then
    this waits for the app's Marathon deployments to finish and for Marathon to stop listing
    the app. Reserved resources are listed at the end in every case.

    Otherwise: the package is uninstalled and the mesosphere/janitor docker image is run on
    the master. role, service_account and zk are derived from the service name when they are
    None.

    In both paths a DCOSException or ValueError from the uninstall is logged. It is swallowed
    unless its text contains 'marathon', in which case it is re-raised.
    '''
    began = time.time()

    if not shakedown.dcos_version_less_than("1.10"):
        log.info('Uninstalling {}'.format(service_name))
        try:
            shakedown.uninstall_package_and_wait(
                package_name, service_name=service_name)

            # Normalize to exactly one leading slash; the caller may or may not have given one.
            marathon_app_id = '/' + service_name.lstrip('/')
            log.info('Waiting for no deployments for {}'.format(marathon_app_id))
            shakedown.deployment_wait(TIMEOUT_SECONDS, marathon_app_id)

            def marathon_dropped_service():
                # True once Marathon no longer lists the app.
                marathon = shakedown.marathon.create_client()
                listed_ids = [app['id'] for app in marathon.get_apps()]
                log.info('Marathon apps: {}'.format(listed_ids))
                hits = listed_ids.count(marathon_app_id)
                if hits > 1:
                    log.info('Found multiple apps with id {}'.format(
                        marathon_app_id))
                return hits == 0

            log.info('Waiting for no {} Marathon app'.format(marathon_app_id))
            shakedown.time_wait(marathon_dropped_service, timeout_seconds=TIMEOUT_SECONDS)
        except (dcos.errors.DCOSException, ValueError) as err:
            log.info(
                'Got exception when uninstalling package: {}'.format(err))
            if 'marathon' in str(err):
                log.info('Detected a probable marathon flake. Raising so retry will trigger.')
                raise
        finally:
            sdk_utils.list_reserved_resources()
        return

    # Cluster is older than 1.10: uninstall the package, then run the janitor.
    log.info('Uninstalling/janitoring {}'.format(service_name))
    try:
        shakedown.uninstall_package_and_wait(
            package_name, service_name=service_name)
    except (dcos.errors.DCOSException, ValueError) as err:
        detail = 'continuing with janitor anyway: {}'.format(err)
        log.info('Got exception when uninstalling package, ' + detail)
        if 'marathon' in str(err):
            log.info('Detected a probable marathon flake. Raising so retry will trigger.')
            raise

    janitor_began = time.time()

    # Strip the leading slash and turn every remaining slash into a double underscore.
    flat_name = service_name.lstrip('/').replace('/', '__')
    role = flat_name + '-role' if role is None else role
    # Note: the principal default is built from the name as given, not the de-slashed one.
    if service_account is None:
        service_account = service_name + '-principal'
    zk = 'dcos-service-' + flat_name if zk is None else zk

    auth_token = sdk_cmd.run_cli(
        'config show core.dcos_acs_token', print_output=False).strip()
    janitor_template = (
        'docker run mesosphere/janitor /janitor.py '
        '-r {role} -p {service_account} -z {zk} --auth_token={auth}')
    shakedown.run_command_on_master(
        janitor_template.format(
            role=role, service_account=service_account, zk=zk, auth=auth_token))

    ended = time.time()
    pkg_duration = shakedown.pretty_duration(janitor_began - began)
    janitor_duration = shakedown.pretty_duration(ended - janitor_began)
    total_duration = shakedown.pretty_duration(ended - began)
    log.info(
        'Uninstall done after pkg({}) + janitor({}) = total({})'.format(
            pkg_duration, janitor_duration, total_duration))
def _uninstall(
        package_name,
        service_name,
        role=None,
        service_account=None,
        zk=None):
    '''Forgets the service as installed, then uninstalls it according to the cluster version.

    The service name is first removed from _installed_service_names (a missing entry is fine).

    When sdk_utils.dcos_version_less_than('1.10') is false: the package is uninstalled, then
    this waits for the app's Marathon deployments to finish and for Marathon to stop listing
    the app. Reserved resources are listed at the end in every case.

    Otherwise: the package is uninstalled and the mesosphere/janitor docker image is run on
    the master. role, service_account and zk are derived from the service name when they are
    None.

    In both paths a DCOSException or ValueError from the uninstall is logged. It is swallowed
    unless its text contains 'marathon', in which case it is re-raised.
    '''
    began = time.time()

    # A KeyError only means the service was never recorded, e.g. a test uninstalling up-front.
    global _installed_service_names
    try:
        _installed_service_names.remove(service_name)
    except KeyError:
        pass

    if not sdk_utils.dcos_version_less_than('1.10'):
        log.info('Uninstalling {}'.format(service_name))
        try:
            shakedown.uninstall_package_and_wait(
                package_name, service_name=service_name)

            # Normalize to exactly one leading slash; the caller may or may not have given one.
            marathon_app_id = '/' + service_name.lstrip('/')
            log.info('Waiting for no deployments for {}'.format(marathon_app_id))
            shakedown.deployment_wait(TIMEOUT_SECONDS, marathon_app_id)

            # One client is created up front and reused by every poll below.
            marathon = shakedown.marathon.create_client()

            def marathon_dropped_service():
                # True once Marathon no longer lists the app.
                listed_ids = [app['id'] for app in marathon.get_apps()]
                log.info('Marathon apps: {}'.format(listed_ids))
                hits = listed_ids.count(marathon_app_id)
                if hits > 1:
                    log.warning('Found multiple apps with id {}'.format(
                        marathon_app_id))
                return hits == 0

            log.info('Waiting for no {} Marathon app'.format(marathon_app_id))
            shakedown.time_wait(marathon_dropped_service, timeout_seconds=TIMEOUT_SECONDS)
        except (dcos.errors.DCOSException, ValueError) as err:
            log.info(
                'Got exception when uninstalling package: {}'.format(err))
            if 'marathon' in str(err):
                log.info('Detected a probable marathon flake. Raising so retry will trigger.')
                raise
        finally:
            sdk_utils.list_reserved_resources()
        return

    # Cluster is older than 1.10: uninstall the package, then run the janitor.
    log.info('Uninstalling/janitoring {}'.format(service_name))
    try:
        shakedown.uninstall_package_and_wait(
            package_name, service_name=service_name)
    except (dcos.errors.DCOSException, ValueError) as err:
        detail = 'continuing with janitor anyway: {}'.format(err)
        log.info('Got exception when uninstalling package, ' + detail)
        if 'marathon' in str(err):
            log.info('Detected a probable marathon flake. Raising so retry will trigger.')
            raise

    janitor_began = time.time()

    # Strip the leading slash and turn every remaining slash into a double underscore.
    flat_name = service_name.lstrip('/').replace('/', '__')
    role = flat_name + '-role' if role is None else role
    # Note: the principal default is built from the name as given, not the de-slashed one.
    if service_account is None:
        service_account = service_name + '-principal'
    zk = 'dcos-service-' + flat_name if zk is None else zk

    auth_token = sdk_cmd.run_cli(
        'config show core.dcos_acs_token', print_output=False).strip()
    janitor_template = (
        'docker run mesosphere/janitor /janitor.py '
        '-r {role} -p {service_account} -z {zk} --auth_token={auth}')
    shakedown.run_command_on_master(
        janitor_template.format(
            role=role, service_account=service_account, zk=zk, auth=auth_token))

    ended = time.time()
    pkg_duration = shakedown.pretty_duration(janitor_began - began)
    janitor_duration = shakedown.pretty_duration(ended - janitor_began)
    total_duration = shakedown.pretty_duration(ended - began)
    log.info(
        'Uninstall done after pkg({}) + janitor({}) = total({})'.format(
            pkg_duration, janitor_duration, total_duration))