def destroy_app(app_name):
    sdk_cmd.request('delete', api_url_with_param('apps', app_name))

    # Make sure the scheduler has been destroyed
    def fn():
        return shakedown.get_service(app_name) is None
    sdk_spin.time_wait_noisy(lambda: fn())
def test_deploy():
    wait_time = 30
    # taskcfg.yml will initially fail to deploy because several options are missing in the default
    # marathon.json.mustache. Verify that tasks are failing for 30s before continuing.
    print('Checking that tasks are failing to launch for at least {}s'.format(wait_time))

    # we can get brief blips of TASK_RUNNING, but they shouldn't last more than 2-3s:
    consecutive_task_running = 0
    def fn():
        nonlocal consecutive_task_running
        svc_tasks = shakedown.get_service_tasks(PACKAGE_NAME)
        states = [t['state'] for t in svc_tasks]
        print('Task states: {}'.format(states))
        if 'TASK_RUNNING' in states:
            consecutive_task_running += 1
            assert consecutive_task_running <= 3
        else:
            consecutive_task_running = 0
        return False

    try:
        spin.time_wait_noisy(lambda: fn(), timeout_seconds=wait_time)
    except shakedown.TimeoutExpired:
        print('Timeout reached as expected')

    # add the needed envvars in marathon and confirm that the deployment succeeds:
    config = marathon.get_config(PACKAGE_NAME)
    env = config['env']
    del env['SLEEP_DURATION']
    env['TASKCFG_ALL_OUTPUT_FILENAME'] = 'output'
    env['TASKCFG_ALL_SLEEP_DURATION'] = '1000'
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    check_running()
def test_bump_world_cpus():
    check_running()
    world_ids = tasks.get_task_ids(PACKAGE_NAME, 'world')
    print('world ids: ' + str(world_ids))

    config = marathon.get_config(PACKAGE_NAME)
    cpus = float(config['env']['WORLD_CPUS'])
    config['env']['WORLD_CPUS'] = str(cpus + 0.1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', world_ids)
    check_running()
def test_bump_hello_cpus():
    check_running()
    hello_ids = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    print('hello ids: ' + str(hello_ids))

    config = marathon.get_config(PACKAGE_NAME)
    cpus = float(config['env']['HELLO_CPUS'])
    config['env']['HELLO_CPUS'] = str(cpus + 0.1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    tasks.check_tasks_updated(PACKAGE_NAME, 'hello', hello_ids)
    check_running()
def test_bump_data_nodes():
    check_healthy()
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    print('data ids: ' + str(data_ids))

    config = marathon.get_config(PACKAGE_NAME)
    node_count = int(config['env']['DATA_COUNT']) + 1
    config['env']['DATA_COUNT'] = str(node_count)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    check_healthy(DEFAULT_TASK_COUNT + 1)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_bump_hello_nodes():
    check_running()
    hello_ids = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    print('hello ids: ' + str(hello_ids))

    config = marathon.get_config(PACKAGE_NAME)
    node_count = int(config['env']['HELLO_COUNT']) + 1
    config['env']['HELLO_COUNT'] = str(node_count)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    check_running()
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'hello', hello_ids)
def test_bump_journal_cpus():
    check_healthy()
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    print('journal ids: ' + str(journal_ids))

    config = marathon.get_config(PACKAGE_NAME)
    print('marathon config: ')
    print(config)
    cpus = float(config['env']['JOURNAL_CPUS'])
    config['env']['JOURNAL_CPUS'] = str(cpus + 0.1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    check_healthy()
def restart_app(app_name):
    log.info("Restarting {}...".format(app_name))
    response = sdk_cmd.request('post', api_url('apps/{}/restart'.format(app_name)))
    log.info(response)
    assert response.ok
    log.info("Restarted {}.".format(app_name))
def test_tls_basic_artifacts(hello_world_service):
    task_id = sdk_tasks.get_task_ids(PACKAGE_NAME, 'artifacts')[0]
    assert task_id

    # Load end-entity certificate from keystore and root CA cert from truststore
    end_entity_cert = x509.load_pem_x509_certificate(
        task_exec(task_id, 'cat secure-tls-pod.crt').encode('ascii'),
        DEFAULT_BACKEND)

    root_ca_cert_in_truststore = _export_cert_from_task_keystore(
        task_id, 'keystore.truststore', 'dcos-root')

    # Check that the certificate subject matches the service name
    common_name = end_entity_cert.subject.get_attributes_for_oid(
        NameOID.COMMON_NAME)[0].value
    assert common_name in sdk_hosts.autoip_host(PACKAGE_NAME, 'artifacts-0-node')

    san_extension = end_entity_cert.extensions.get_extension_for_oid(
        ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
    sans = san_extension.value._general_names._general_names
    assert len(sans) == 1

    cluster_root_ca_cert = x509.load_pem_x509_certificate(
        sdk_cmd.request(
            'get', shakedown.dcos_url_path('/ca/dcos-ca.crt')).content,
        DEFAULT_BACKEND)

    assert root_ca_cert_in_truststore.signature == cluster_root_ca_cert.signature
def test_tls_basic_artifacts(hello_world_service):
    task_id = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'artifacts')[0]
    assert task_id

    # Load end-entity certificate from keystore and root CA cert from truststore
    end_entity_cert = x509.load_pem_x509_certificate(
        task_exec(task_id, 'cat secure-tls-pod.crt').encode('ascii'),
        DEFAULT_BACKEND)

    root_ca_cert_in_truststore = _export_cert_from_task_keystore(
        task_id, 'keystore.truststore', 'dcos-root')

    # Check that the certificate subject matches the service name
    common_name = end_entity_cert.subject.get_attributes_for_oid(
        NameOID.COMMON_NAME)[0].value
    assert common_name in sdk_hosts.autoip_host(config.SERVICE_NAME, 'artifacts-0-node')

    san_extension = end_entity_cert.extensions.get_extension_for_oid(
        ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
    sans = san_extension.value._general_names._general_names
    assert len(sans) == 1

    cluster_root_ca_cert = x509.load_pem_x509_certificate(
        sdk_cmd.request(
            'get', shakedown.dcos_url_path('/ca/dcos-ca.crt')).content,
        DEFAULT_BACKEND)

    assert root_ca_cert_in_truststore.signature == cluster_root_ca_cert.signature
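The final assertion above pins the truststore's root CA to the cluster CA by comparing raw signature bytes. A complementary check, sketched below on the assumption (not confirmed by this test) that the cluster CA uses an RSA key with PKCS#1 v1.5 signatures, would verify that the end-entity certificate was in fact issued by that CA; it would live inside the test body where cluster_root_ca_cert and end_entity_cert are in scope.

from cryptography.exceptions import InvalidSignature
from cryptography.hazmat.primitives.asymmetric import padding

# Sketch only: assumes an RSA-keyed CA; verify() raises InvalidSignature if the chain does not hold.
try:
    cluster_root_ca_cert.public_key().verify(
        end_entity_cert.signature,
        end_entity_cert.tbs_certificate_bytes,
        padding.PKCS1v15(),
        end_entity_cert.signature_hash_algorithm)
except InvalidSignature:
    assert False, "end-entity certificate is not signed by the cluster root CA"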
def test_modify_app_config():
    check_healthy()
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'

    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    zkfc_ids = tasks.get_task_ids(PACKAGE_NAME, 'zkfc')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    print('journal ids: ' + str(journal_ids))
    print('name ids: ' + str(name_ids))
    print('zkfc ids: ' + str(zkfc_ids))
    print('data ids: ' + str(data_ids))

    config = marathon.get_config(PACKAGE_NAME)
    print('marathon config: ')
    print(config)
    expiry_ms = int(config['env'][app_config_field])
    config['env'][app_config_field] = str(expiry_ms + 1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    # All tasks should be updated because hdfs-site.xml has changed
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'name', name_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'zkfc', zkfc_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'data', data_ids)

    check_healthy()
def test_modify_app_config_rollback():
    check_healthy()
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'

    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    zkfc_ids = tasks.get_task_ids(PACKAGE_NAME, 'zkfc')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    print('journal ids: ' + str(journal_ids))
    print('name ids: ' + str(name_ids))
    print('zkfc ids: ' + str(zkfc_ids))
    print('data ids: ' + str(data_ids))

    old_config = marathon.get_config(PACKAGE_NAME)
    config = marathon.get_config(PACKAGE_NAME)
    print('marathon config: ')
    print(config)
    expiry_ms = int(config['env'][app_config_field])
    print('expiry ms: ' + str(expiry_ms))
    config['env'][app_config_field] = str(expiry_ms + 1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    # Wait for journal nodes to be affected by the change
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')

    print('old config: ')
    print(old_config)
    # Put the old config back (rollback)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=old_config)

    # Wait for the journal nodes to return to their old configuration
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    check_healthy()

    config = marathon.get_config(PACKAGE_NAME)
    assert int(config['env'][app_config_field]) == expiry_ms

    # ZKFC and Data tasks should not have been affected
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'zkfc', zkfc_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar'''
    check_running()
    task_ids = tasks.get_task_ids(PACKAGE_NAME, '')

    # caching enabled by default:
    stdout = cmd.run_cli('hello-world state refresh_cache')
    assert "Received cmd: refresh" in stdout

    config = marathon.get_config(PACKAGE_NAME)
    config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    tasks.check_tasks_not_updated(PACKAGE_NAME, '', task_ids)
    check_running()

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    def check_cache_refresh_fails_409conflict():
        try:
            cmd.run_cli('hello-world state refresh_cache')
        except Exception as e:
            if "failed: 409 Conflict" in e.args[0]:
                return True
        return False
    spin.time_wait_noisy(lambda: check_cache_refresh_fails_409conflict(), timeout_seconds=120.)

    config = marathon.get_config(PACKAGE_NAME)
    del config['env']['DISABLE_STATE_CACHE']
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    tasks.check_tasks_not_updated(PACKAGE_NAME, '', task_ids)
    check_running()

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    def check_cache_refresh():
        return cmd.run_cli('hello-world state refresh_cache')
    stdout = spin.time_wait_return(lambda: check_cache_refresh(), timeout_seconds=120.)
    assert "Received cmd: refresh" in stdout
def get_metrics(package_name, service_name, task_name):
    """Return a list of metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from
    """
    task_to_check = None
    tasks = shakedown.get_service_tasks(service_name)
    for task in tasks:
        if task['name'] == task_name:
            task_to_check = task

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    # TODO: uncomment the following block of comments when the /containers endpoint reports the correct
    # container IDs, and remove the code after the comments that gets the correct container ID via 'pod info'

    ## Fetch the list of containers for the agent
    #containers_url = "{}/system/v1/agent/{}/metrics/v0/containers".format(shakedown.dcos_url(), agent_id)
    #containers_response = sdk_cmd.request("GET", containers_url, retry=False)
    #if not containers_response.ok:
    #    log.info("Unable to fetch containers list")
    #    raise Exception("Unable to fetch containers list: {}".format(containers_url))

    # Instead of receiving the pod name in this function's parameter list, extract
    # the name of the pod from the task name so the code keeps working when the
    # above comment block is uncommented.
    pod_name = '-'.join(task_name.split("-")[:2])
    pod_info = sdk_cmd.svc_cli(package_name, service_name, "pod info {}".format(pod_name), json=True)
    task_info = None
    for task in pod_info:
        if task["info"]["name"] == task_name:
            task_info = task
            break
    if not task_info:
        return []

    container_id = task_info["status"]["containerStatus"]["containerId"]["value"]

    #for container_id in json.loads(containers_response.text):
    app_url = "{}/system/v1/agent/{}/metrics/v0/containers/{}/app".format(
        shakedown.dcos_url(), agent_id, container_id)
    app_response = sdk_cmd.request("GET", app_url, retry=False)
    if not app_response.ok:
        raise Exception("Failed to get metrics from container")
        #continue
    app_json = json.loads(app_response.text)
    if app_json['dimensions']['executor_id'] == executor_id:
        return app_json['datapoints']

    raise Exception("No metrics found")
def update_app(app_name, config, timeout=600):
    if "env" in config:
        log.info("Environment for marathon app {} ({} values):".format(app_name, len(config["env"])))
        for k in sorted(config["env"]):
            log.info("  {}={}".format(k, config["env"][k]))

    response = sdk_cmd.request('put', api_url('apps/{}'.format(app_name)), log_args=False, json=config)
    assert response.ok, "Marathon configuration update failed for {} with config {}".format(app_name, config)

    log.info("Waiting for Marathon deployment of {} to complete...".format(app_name))
    shakedown.deployment_wait(app_id=app_name, timeout=timeout)
def update_app(app_name, config, timeout=TIMEOUT_SECONDS, wait_for_completed_deployment=True):
    if "env" in config:
        log.info("Environment for marathon app {} ({} values):".format(app_name, len(config["env"])))
        for k in sorted(config["env"]):
            log.info("  {}={}".format(k, config["env"][k]))

    response = sdk_cmd.request('put', api_url('apps/{}'.format(app_name)), log_args=False, json=config)
    assert response.ok, "Marathon configuration update failed for {} with config {}".format(app_name, config)

    if wait_for_completed_deployment:
        log.info("Waiting for Marathon deployment of {} to complete...".format(app_name))
        shakedown.deployment_wait(app_id=app_name, timeout=timeout)
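A minimal usage sketch for the helper above, mirroring the env-bump pattern used by the tests in this section; the module name sdk_marathon, the app id 'hello-world', and the HELLO_CPUS variable are illustrative assumptions rather than anything defined here.

import sdk_marathon  # assumed home of get_config()/update_app()

# Placeholder app id and env var: bump CPUs and wait for the resulting deployment.
config = sdk_marathon.get_config('hello-world')
config['env']['HELLO_CPUS'] = str(float(config['env']['HELLO_CPUS']) + 0.1)
sdk_marathon.update_app('hello-world', config)  # waits for the deployment to complete by default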
def get_metrics(service_name, task_name):
    """Return a list of metrics datapoints.

    Keyword arguments:
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from
    """
    task_to_check = None
    tasks = shakedown.get_service_tasks(service_name)
    for task in tasks:
        if task['name'] == task_name:
            task_to_check = task

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    # Fetch the list of containers for the agent
    containers_url = "{}/system/v1/agent/{}/metrics/v0/containers".format(
        shakedown.dcos_url(), agent_id)
    containers_response = cmd.request("GET", containers_url, retry=False)
    if not containers_response.ok:
        log.info("Unable to fetch containers list")
        raise Exception(
            "Unable to fetch containers list: {}".format(containers_url))

    for container in json.loads(containers_response.text):
        app_url = "{}/system/v1/agent/{}/metrics/v0/containers/{}/app".format(
            shakedown.dcos_url(), agent_id, container)
        app_response = cmd.request("GET", app_url, retry=False)
        if not app_response.ok:
            continue
        app_json = json.loads(app_response.text)
        if app_json['dimensions']['executor_id'] == executor_id:
            return app_json['datapoints']

    raise Exception("No metrics found")
def test_deploy():
    wait_time = 30
    # taskcfg.yml will initially fail to deploy because several options are missing in the default
    # marathon.json.mustache. Verify that tasks are failing for 30s before continuing.
    print('Checking that tasks are failing to launch for at least {}s'.format(wait_time))

    # we can get brief blips of TASK_RUNNING, but they shouldn't last more than 2-3s:
    consecutive_task_running = 0
    def fn():
        nonlocal consecutive_task_running
        svc_tasks = shakedown.get_service_tasks(PACKAGE_NAME)
        states = [t['state'] for t in svc_tasks]
        print('Task states: {}'.format(states))
        if 'TASK_RUNNING' in states:
            consecutive_task_running += 1
            assert consecutive_task_running <= 3
        else:
            consecutive_task_running = 0
        return False

    try:
        spin.time_wait_noisy(lambda: fn(), timeout_seconds=wait_time)
    except shakedown.TimeoutExpired:
        print('Timeout reached as expected')

    # add the needed envvars in marathon and confirm that the deployment succeeds:
    config = marathon.get_config(PACKAGE_NAME)
    env = config['env']
    del env['SLEEP_DURATION']
    env['TASKCFG_ALL_OUTPUT_FILENAME'] = 'output'
    env['TASKCFG_ALL_SLEEP_DURATION'] = '1000'
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    check_running()
def _get_master_public_ip() -> str:
    """
    :return (str): The public IP of the master node in the DC/OS cluster.
    """
    dcos_url, headers = sdk_security.get_dcos_credentials()
    cluster_metadata_url = "{cluster_url}/metadata".format(cluster_url=dcos_url)
    response = sdk_cmd.request("GET", cluster_metadata_url, verify=False)
    if not response.ok:
        raise RuntimeError("Unable to get the master node's public IP address: {err}".format(err=repr(response)))

    response = response.json()
    if "PUBLIC_IPV4" not in response:
        raise KeyError("Cluster metadata does not include master's public ip: {response}".format(
            response=repr(response)))

    public_ip = response["PUBLIC_IPV4"]
    log.info("Master public ip is {public_ip}".format(public_ip=public_ip))
    return public_ip
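A hedged example of how the helper might be consumed; the health endpoint path below is illustrative only and not something this module exposes.

master_ip = _get_master_public_ip()
# Illustrative only: build a URL that should be reachable from outside the cluster.
external_url = "https://{}/system/health/v1".format(master_ip)
log.info("Example externally reachable URL: {}".format(external_url))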
def test_httpd():
    cmd.request('get', '{}/pyhttpsd'.format(shakedown.dcos_service_url('proxylite')))
def _get_config_once(app_name):
    return sdk_cmd.request('get', api_url('apps/{}'.format(app_name)), retry=False, log_args=False)
def test_google():
    cmd.request('get', '{}/google'.format(shakedown.dcos_service_url('proxylite')))
def fn():
    return sdk_cmd.request('get', api_url('apps/{}'.format(app_name)), retry=False)
def update_app(app_name, config):
    response = sdk_cmd.request('put', api_url('apps/{}'.format(app_name)), json=config)
    assert response.ok, "Marathon configuration update failed for {} with config {}".format(
        app_name, config)
def get_metrics(package_name, service_name, task_name):
    """Return a list of metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from
    """
    task_to_check = None
    tasks = shakedown.get_service_tasks(service_name)
    for task in tasks:
        if task['name'] == task_name:
            task_to_check = task

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    pod_name = '-'.join(task_name.split("-")[:2])
    pod_info = sdk_cmd.svc_cli(package_name, service_name, "pod info {}".format(pod_name), json=True)
    task_info = None
    for task in pod_info:
        if task["info"]["name"] == task_name:
            task_info = task
            break
    if not task_info:
        return []

    task_container_id = task_info["status"]["containerStatus"]["containerId"]["value"]

    # Not related to functionality, but consuming this endpoint to verify downstream integrity
    containers_url = "{}/system/v1/agent/{}/metrics/v0/containers".format(
        shakedown.dcos_url(), agent_id)
    containers_response = sdk_cmd.request("GET", containers_url, retry=False)
    if not containers_response.ok:
        log.info("Unable to fetch containers list")
        raise Exception(
            "Unable to fetch containers list: {}".format(containers_url))

    reported_container_ids = json.loads(containers_response.text)
    container_id_reported = False
    for container_id in reported_container_ids:
        if container_id == task_container_id:
            container_id_reported = True
            break
    if not container_id_reported:
        raise ValueError(
            "The metrics /containers endpoint returned {}, expecting {} to be returned as well"
            .format(reported_container_ids, task_container_id))

    app_url = "{}/system/v1/agent/{}/metrics/v0/containers/{}/app".format(
        shakedown.dcos_url(), agent_id, task_container_id)
    app_response = sdk_cmd.request("GET", app_url, retry=False)
    if not app_response.ok:
        raise ValueError("Failed to get metrics from container")

    app_json = json.loads(app_response.text)
    if app_json['dimensions']['executor_id'] == executor_id:
        return app_json['datapoints']

    raise Exception("No metrics found")
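A hedged usage sketch for the helper above; the package, service, and task names, and the assumption that each datapoint dict carries a 'name' key, reflect a typical hello-world layout rather than anything guaranteed by this function.

import sdk_metrics  # assumed home of get_metrics()

# Placeholder names: a 'hello-world' service with a 'hello-0-server' task.
datapoints = sdk_metrics.get_metrics('hello-world', 'hello-world', 'hello-0-server')
assert datapoints, "expected at least one metrics datapoint"
emitted = sorted({d['name'] for d in datapoints if 'name' in d})
print('emitted metric names: {}'.format(emitted))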
def update_app(app_name, config):
    response = sdk_cmd.request('put', api_url('apps/{}'.format(app_name)), json=config)
    assert response.ok, "Marathon configuration update failed for {} with config {}".format(app_name, config)
def fn():
    return sdk_cmd.request('get', api_url('apps/{}'.format(app_name)), retry=False, log_args=False)