def _dump_task_logs_for_agent(
    item: pytest.Item, agent_id: str, agent_tasks: List[_TaskEntry]
) -> None:
    agent_executor_paths = sdk_cmd.cluster_request(
        "GET", "/slave/{}/files/debug".format(agent_id)
    ).json()
    task_byte_count = 0
    for task_entry in agent_tasks:
        try:
            task_byte_count += _dump_task_logs_for_task(
                item, agent_id, agent_executor_paths, task_entry
            )
        except Exception:
            log.exception("Failed to get logs for task {}".format(task_entry))
    log.info(
        "Downloaded {} bytes of logs from {} tasks on agent {}".format(
            task_byte_count, len(agent_tasks), agent_id
        )
    )

    # fetch agent log separately due to its totally different fetch semantics vs the task/executor logs
    if "/slave/log" in agent_executor_paths:
        out_path = _setup_artifact_path(item, "agent_{}.log".format(agent_id))
        stream = sdk_cmd.cluster_request(
            "GET", "/slave/{}/files/download?path=/slave/log".format(agent_id), stream=True
        )
        with open(out_path, "wb") as f:
            for chunk in stream.iter_content(chunk_size=8192):
                f.write(chunk)
def _grant(user: str, acl: str, description: str, action: str) -> None:
    log.info(
        "Granting permission to {user} for {acl}/{action} ({description})".format(
            user=user, acl=acl, action=action, description=description
        )
    )

    # Create the ACL
    r = sdk_cmd.cluster_request(
        "PUT",
        "/acs/api/v1/acls/{acl}".format(acl=acl),
        raise_on_error=False,
        json={"description": description},
    )
    # 201=created, 409=already exists
    assert r.status_code in [201, 409], "{} failed {}: {}".format(r.url, r.status_code, r.text)

    # Assign the user to the ACL
    r = sdk_cmd.cluster_request(
        "PUT",
        "/acs/api/v1/acls/{acl}/users/{user}/{action}".format(acl=acl, user=user, action=action),
        raise_on_error=False,
    )
    # 204=success, 409=already exists
    assert r.status_code in [204, 409], "{} failed {}: {}".format(r.url, r.status_code, r.text)
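# Example usage (a sketch; the account name and Mesos role below are hypothetical, but the ACL
# string follows the dcos:mesos:master:* format these grants are typically used with):
#
#   _grant(
#       "my-service-account",
#       "dcos:mesos:master:framework:role:my-role",
#       "Register with the Mesos master",
#       "create",
#   )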
def _dump_task_logs_for_task(
        item: pytest.Item, agent_id: str, agent_executor_paths: dict, task_entry: _TaskEntry):
    executor_browse_path = _find_matching_executor_path(agent_executor_paths, task_entry)
    if not executor_browse_path:
        # Expected executor path was not found on this agent. Did Mesos move their files around again?
        log.warning('Unable to find any paths matching task {} in agent {}:\n {}'.format(
            task_entry, agent_id, '\n '.join(sorted(agent_executor_paths.keys()))))
        return 0  # nothing downloaded; keeps the caller's byte-count arithmetic valid

    # Fetch paths under the executor.
    executor_file_infos = sdk_cmd.cluster_request(
        'GET', '/slave/{}/files/browse?path={}'.format(agent_id, executor_browse_path)).json()

    # Look at the executor's sandbox and check for a 'tasks/' directory.
    # If it has one (due to being a Default Executor), then also fetch file infos for
    # <executor_path>/tasks/<task_id>/
    task_file_infos = []
    if task_entry.executor_id and task_entry.task_id:
        for file_info in executor_file_infos:
            if file_info['mode'].startswith('d') and file_info['path'].endswith('/tasks'):
                task_browse_path = os.path.join(
                    executor_browse_path, 'tasks/{}/'.format(task_entry.task_id))
                try:
                    task_file_infos = sdk_cmd.cluster_request(
                        'GET', '/slave/{}/files/browse?path={}'.format(
                            agent_id, task_browse_path)).json()
                except Exception:
                    log.exception('Failed to fetch task sandbox from presumed default executor')

    # Select all log files to be fetched from the above lists.
    selected_file_infos = collections.OrderedDict()
    if task_file_infos:
        # Include 'task' and 'executor' annotations in filenames to differentiate between them:
        _select_log_files(item, task_entry.task_id, executor_file_infos, 'executor.', selected_file_infos)
        _select_log_files(item, task_entry.task_id, task_file_infos, 'task.', selected_file_infos)
    else:
        # No annotation needed:
        _select_log_files(item, task_entry.task_id, executor_file_infos, '', selected_file_infos)
    if not selected_file_infos:
        log.warning('Unable to find any stdout/stderr files in above paths for task {}'.format(task_entry))
        return 0

    byte_count = sum([f['size'] for f in selected_file_infos.values()])
    log.info('Downloading {} files ({} bytes) for task {}:{}'.format(
        len(selected_file_infos), byte_count, task_entry,
        ''.join(['\n {} ({} bytes)\n => {}'.format(file_info['path'], file_info['size'], path)
                 for path, file_info in selected_file_infos.items()])))

    # Fetch files
    for out_path, file_info in selected_file_infos.items():
        try:
            stream = sdk_cmd.cluster_request(
                'GET', '/slave/{}/files/download?path={}'.format(agent_id, file_info['path']),
                stream=True)
            with open(out_path, 'wb') as f:
                for chunk in stream.iter_content(chunk_size=8192):
                    f.write(chunk)
        except Exception:
            log.exception('Failed to get file for task {}: {}'.format(task_entry, file_info))
    return byte_count
def get_metrics(package_name, service_name, task_name):
    """Return a list of DC/OS metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from
    """
    tasks = shakedown.get_service_tasks(service_name)
    task_to_check = None
    for task in tasks:
        if task['name'] == task_name:
            task_to_check = task
            break

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    pod_name = '-'.join(task_name.split("-")[:2])
    pod_info = sdk_cmd.svc_cli(package_name, service_name, "pod info {}".format(pod_name), json=True)
    task_info = None
    for task in pod_info:
        if task["info"]["name"] == task_name:
            task_info = task
            break

    if not task_info:
        return []

    task_container_id = task_info["status"]["containerStatus"]["containerId"]["value"]

    # Not related to functionality but consuming this
    # endpoint to verify downstream integrity
    containers_response = sdk_cmd.cluster_request(
        "GET", "/system/v1/agent/{}/metrics/v0/containers".format(agent_id), retry=False)
    reported_container_ids = json.loads(containers_response.text)

    container_id_reported = False
    for container_id in reported_container_ids:
        if container_id == task_container_id:
            container_id_reported = True
            break

    if not container_id_reported:
        raise ValueError("The metrics /container endpoint returned {}, expecting {} to be returned as well".format(
            reported_container_ids, task_container_id))

    app_response = sdk_cmd.cluster_request(
        "GET", "/system/v1/agent/{}/metrics/v0/containers/{}/app".format(agent_id, task_container_id),
        retry=False)
    app_json = json.loads(app_response.text)
    if app_json['dimensions']['executor_id'] == executor_id:
        return app_json['datapoints']

    raise Exception("No metrics found")
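# Example usage (a sketch; the package, service, and task names below are hypothetical):
#
#   datapoints = get_metrics("confluent-kafka", "confluent-kafka", "kafka-0-broker")
#   metric_names = {dp["name"] for dp in datapoints}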
def update_app(app_name, config, timeout=TIMEOUT_SECONDS, wait_for_completed_deployment=True, force=True):
    if "env" in config:
        log.info("Environment for marathon app {} ({} values):".format(app_name, len(config["env"])))
        for k in sorted(config["env"]):
            log.info("  {}={}".format(k, config["env"][k]))

    query_string = "?force=true" if force else ""

    # throws on failure:
    sdk_cmd.cluster_request(
        'PUT', _api_url('apps/{}{}'.format(app_name, query_string)), log_args=False, json=config)

    if wait_for_completed_deployment:
        log.info("Waiting for Marathon deployment of {} to complete...".format(app_name))
        shakedown.deployment_wait(app_id=app_name, timeout=timeout)
def get_all_status_history(task_name: str, with_completed_tasks: bool = True) -> list:
    """Returns a list of task status values (of the form 'TASK_STARTING', 'TASK_KILLED', etc)
    for all instances of a given task. The returned values are ordered chronologically from
    first to last.

    :param task_name: The name of the task whose history should be retrieved.
    :param with_completed_tasks: Whether to include the status history of previous versions of
        the task which have since exited. Unlike with get_service_tasks(), this may include
        tasks from previous versions of the service.
    """
    cluster_tasks = sdk_cmd.cluster_request("GET", "/mesos/tasks").json()["tasks"]
    statuses: List[Dict[str, Any]] = []
    for cluster_task in cluster_tasks:
        if cluster_task["name"] != task_name:
            # Skip task: wrong name
            continue
        if not with_completed_tasks and cluster_task["state"] in COMPLETED_TASK_STATES:
            # Skip task: task instance is completed and we don't want completed tasks
            continue
        statuses += cluster_task["statuses"]
    history = sorted(statuses, key=lambda x: x["timestamp"])
    log.info(
        "Status history for task {} (with_completed={}): {}".format(
            task_name, with_completed_tasks, ", ".join([s["state"] for s in history])
        )
    )
    return history
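# Example usage (a sketch; the task name is hypothetical). Each history entry is a Mesos status
# dict, so terminal states can be counted to detect restarts:
#
#   history = get_all_status_history("hello-0-server")
#   failures = len([s for s in history if s["state"] == "TASK_FAILED"])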
def download_agent_path(
    agent_id: str, agent_file_path: str, output_file_path: str, chunk_size: int = 8192
) -> None:
    stream = sdk_cmd.cluster_request(
        "GET",
        "/slave/{}/files/download?path={}".format(agent_id, agent_file_path),
        retry=False,
        raise_on_error=False,
        log_response=False,
        stream=True,
    )

    if is_http_server_error(stream.status_code):
        raise Exception(stream)

    if stream.status_code == 404:
        return

    if not stream.ok:
        # Retry.
        raise Exception(stream)

    with open(output_file_path, "wb") as f:
        for chunk in stream.iter_content(chunk_size=chunk_size):
            f.write(chunk)
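# The bare `raise Exception(stream)` branches above signal a caller-side retry. A minimal sketch
# of such a caller, assuming the `retrying` library (the wait/stop values are illustrative):
#
#   import retrying
#
#   @retrying.retry(wait_fixed=1000, stop_max_attempt_number=5)
#   def download_with_retries(agent_id, agent_path, out_path):
#       download_agent_path(agent_id, agent_path, out_path)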
def fetch_dcos_ca_bundle_contents() -> str:
    resp = sdk_cmd.cluster_request("GET", "/ca/dcos-ca.crt")
    # Decode the PEM bytes to match the declared str return type.
    cert = resp.content.decode("utf-8")
    if not cert:
        log.error("Error fetching DC/OS CA bundle")
        raise Exception("Error fetching DC/OS CA bundle")

    return cert
def get_task_host(task):
    agent_id = task["slave_id"]
    log.info("Retrieving agents information for {}".format(agent_id))
    agents = sdk_cmd.cluster_request("GET", "/mesos/slaves?slave_id={}".format(agent_id)).json()
    assert (
        len(agents["slaves"]) == 1
    ), "Agent's details do not match the expectations for agent ID {}".format(agent_id)
    return agents["slaves"][0]["hostname"]
def fetch_dcos_ca_bundle_contents() -> bytes:
    resp = sdk_cmd.cluster_request("GET", "/ca/dcos-ca.crt")
    cert = resp.content
    if not cert:
        log.error("Error fetching DC/OS CA bundle")
        raise Exception("Error fetching DC/OS CA bundle")

    return cert
def dcos_ca_bundle():
    """
    Retrieves the DC/OS CA bundle and returns its content.
    """
    resp = sdk_cmd.cluster_request('GET', '/ca/dcos-ca.crt')
    cert = resp.content.decode('ascii')
    assert cert is not None
    return cert
def _dump_mesos_state(item: pytest.Item):
    '''Downloads state from the Mesos master and saves it to the artifact path for this test.'''
    for name in ['state.json', 'slaves']:
        r = sdk_cmd.cluster_request('GET', '/mesos/{}'.format(name), verify=False, raise_on_error=False)
        if r.ok:
            if name.endswith('.json'):
                name = name[:-len('.json')]  # avoid duplicate '.json'
            with open(_setup_artifact_path(item, 'mesos_{}.json'.format(name)), 'w') as f:
                f.write(r.text)
def _dump_mesos_state(item: pytest.Item) -> None:
    """Downloads state from the Mesos master and saves it to the artifact path for this test."""
    for name in ["state.json", "slaves"]:
        r = sdk_cmd.cluster_request("GET", "/mesos/{}".format(name), raise_on_error=False)
        if r.ok:
            if name.endswith(".json"):
                name = name[: -len(".json")]  # avoid duplicate '.json'
            with open(_setup_artifact_path(item, "mesos_{}.json".format(name)), "w") as f:
                f.write(r.text)
def _grant(user: str, acl: str, description: str, action: str = "create") -> None:
    log.info('Granting permission to {user} for {acl}/{action} ({description})'.format(
        user=user, acl=acl, action=action, description=description))

    # Create the ACL
    r = sdk_cmd.cluster_request(
        'PUT',
        '/acs/api/v1/acls/{acl}'.format(acl=acl),
        raise_on_error=False,
        json={'description': description})
    # 201=created, 409=already exists
    assert r.status_code in [201, 409], '{} failed {}: {}'.format(r.url, r.status_code, r.text)

    # Assign the user to the ACL
    r = sdk_cmd.cluster_request(
        'PUT',
        '/acs/api/v1/acls/{acl}/users/{user}/{action}'.format(acl=acl, user=user, action=action),
        raise_on_error=False)
    # 204=success, 409=already exists
    assert r.status_code in [204, 409], '{} failed {}: {}'.format(r.url, r.status_code, r.text)
def _app_exists():
    response = sdk_cmd.cluster_request(
        "GET", _api_url("apps/{}".format(app_name)), raise_on_error=False)
    if response.status_code == 404:
        return False  # app doesn't exist
    response.raise_for_status()  # throw exception for (non-404) errors
    return True  # didn't get 404, and no other error code was returned, so app must exist.
def _dump_task_logs_for_agent(item: pytest.Item, agent_id: str, agent_tasks: list):
    agent_executor_paths = sdk_cmd.cluster_request(
        'GET', '/slave/{}/files/debug'.format(agent_id)).json()
    task_byte_count = 0
    for task_entry in agent_tasks:
        try:
            task_byte_count += _dump_task_logs_for_task(
                item, agent_id, agent_executor_paths, task_entry)
        except Exception:
            log.exception('Failed to get logs for task {}'.format(task_entry))
    log.info('Downloaded {} bytes of logs from {} tasks on agent {}'.format(
        task_byte_count, len(agent_tasks), agent_id))

    # fetch agent log separately due to its totally different fetch semantics vs the task/executor logs
    if '/slave/log' in agent_executor_paths:
        out_path = _setup_artifact_path(item, 'agent_{}.log'.format(agent_id))
        stream = sdk_cmd.cluster_request(
            'GET', '/slave/{}/files/download?path=/slave/log'.format(agent_id), stream=True)
        with open(out_path, 'wb') as f:
            for chunk in stream.iter_content(chunk_size=8192):
                f.write(chunk)
def get_summary(with_completed=False):
    '''Returns a summary of task information as returned by the DC/OS CLI.
    This may be used instead of invoking 'dcos task [--all]' directly.

    Returns a list of Task objects.
    '''
    cluster_tasks = sdk_cmd.cluster_request('GET', '/mesos/tasks').json()
    cluster_agents = sdk_cmd.cluster_request('GET', '/mesos/slaves').json()
    all_tasks = [Task.parse(entry, cluster_agents) for entry in cluster_tasks['tasks']]
    if with_completed:
        output = all_tasks
    else:
        output = list(filter(lambda t: t.state not in COMPLETED_TASK_STATES, all_tasks))
    log.info('Task summary (with_completed={}):\n- {}'.format(
        with_completed, '\n- '.join([str(e) for e in output])))
    return output
def _update():
    response = sdk_cmd.cluster_request(
        "PUT",
        _api_url("apps/{}".format(app_name)),
        params={"force": "true"} if force else {},
        json=config,
        log_args=False,
        raise_on_error=False,
    )
    return _handle_marathon_deployment_response(response)
def test_nodes_deploy_to_local_region_by_default(configure_universe, local_service):
    # Fetch master's region name: this is defined to be the local region
    local_region = sdk_cmd.cluster_request("GET", "/mesos/state").json()["domain"][
        "fault_domain"
    ]["region"]["name"]

    for pod_name in POD_NAMES:
        pod_region = get_pod_region(config.SERVICE_NAME, pod_name)
        assert pod_region == local_region
def _install() -> MarathonDeploymentResponse:
    response = sdk_cmd.cluster_request(
        "POST", _api_url("apps"), json=app_definition, log_args=False, raise_on_error=False
    )
    try:
        deployment_response = MarathonDeploymentResponse(response)
    except requests.HTTPError as e:
        if e.response.status_code == 409:
            # App exists already, left over from previous run? Delete and try again.
            destroy_app(app_name, timeout=timeout)
        raise e
    return deployment_response
def get_overlay_subnet(network_name='dcos'):
    subnet = None
    network_info = sdk_cmd.cluster_request("GET", "/mesos/overlay-master/state").json()
    for network in network_info['network']['overlays']:
        if network['name'] == network_name:
            subnet = network['subnet']
            break

    assert subnet is not None, "Unable to find subnet information for provided network name: {}".format(
        network_name)
    return subnet
def get_overlay_subnet(network_name="dcos"): subnet = None network_info = sdk_cmd.cluster_request("GET", "/mesos/overlay-master/state").json() for network in network_info["network"]["overlays"]: if network["name"] == network_name: subnet = network["subnet"] break assert ( subnet is not None ), "Unable to find subnet information for provided network name: {}".format(network_name) return subnet
def dump_mesos_state(item: pytest.Item):
    for name in ['state.json', 'slaves']:
        r = sdk_cmd.cluster_request('GET', '/mesos/{}'.format(name), verify=False, raise_on_error=False)
        if r.ok:
            if name.endswith('.json'):
                name = name[:-len('.json')]  # avoid duplicate '.json'
            with open(setup_artifact_path(item, 'mesos_{}.json'.format(name)), 'w') as f:
                f.write(r.text)
def _get_master_public_ip() -> str:
    """
    :return (str): The public IP of the master node in the DC/OS cluster.
    """
    response = sdk_cmd.cluster_request("GET", "/metadata", verify=False).json()
    if "PUBLIC_IPV4" not in response:
        raise KeyError("Cluster metadata does not include master's public ip: {response}".format(
            response=response))

    public_ip = response["PUBLIC_IPV4"]
    log.info("Master public ip is {public_ip}".format(public_ip=public_ip))

    return public_ip
def _wait_for_active_framework() -> bool:
    return (
        len(
            list(
                filter(
                    lambda fwk: fwk["name"] == service_name and fwk["active"],
                    sdk_cmd.cluster_request("GET", "/mesos/frameworks").json()["frameworks"],
                )
            )
        )
        > 0
    )
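# Example usage (a sketch; `service_name` comes from the enclosing scope, shakedown's time_wait
# is one common way to poll a boolean check like this, and the timeout is illustrative):
#
#   shakedown.time_wait(lambda: _wait_for_active_framework(), timeout_seconds=300)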
def get_failed_task_count(service_name: str, retry: bool = False) -> int:
    history_response = sdk_cmd.cluster_request(
        "GET", "/dcos-history-service/history/last", retry=retry
    )
    history_response.raise_for_status()
    history = history_response.json()
    service_history = [h for h in history["frameworks"] if h.get("name") == service_name]
    if not service_history:
        return 0

    assert len(service_history) == 1
    return sum(service_history[0].get(status, 0) for status in FATAL_TERMINAL_TASK_STATES)
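# Example usage (a sketch; the service name is hypothetical):
#
#   assert get_failed_task_count("hello-world") == 0, "Expected no fatal task failures"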
def get_status_history(task_name: str) -> list:
    '''Returns a list of task status values (of the form 'TASK_STARTING', 'TASK_KILLED', etc)
    for a given task. The returned values are ordered chronologically from first to last.
    '''
    cluster_tasks = sdk_cmd.cluster_request('GET', '/mesos/tasks').json()
    statuses = []
    for cluster_task in cluster_tasks['tasks']:
        if cluster_task['name'] != task_name:
            continue
        statuses += cluster_task['statuses']
    history = [entry['state'] for entry in sorted(statuses, key=lambda x: x['timestamp'])]
    log.info('Status history for task {}: {}'.format(task_name, ', '.join(history)))
    return history
def filter_apps_by_id(filter_id, mom=None):
    """Return all Marathon apps with an ID matching `filter_id`.
    For example, "jenkins" will return all Marathon apps whose IDs begin with "jenkins".

    Args:
        filter_id: String to filter Marathon app IDs

    Returns:
        Marathon response
    """
    filter_url = _api_url('apps/?id={}'.format(filter_id), mom)
    return sdk_cmd.cluster_request('GET', filter_url, retry=False)
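# Example usage (a sketch): fetch every app whose ID begins with "jenkins" and list the IDs.
# The {"apps": [...]} response shape is standard for Marathon's /v2/apps listing.
#
#   response = filter_apps_by_id("jenkins")
#   app_ids = [app["id"] for app in response.json()["apps"]]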
def _copy_file_to_localhost(host_id: str, keytab_absolute_path: str, output_filename: str):
    """
    Copies the keytab that was generated inside the container running the KDC server to the
    localhost so it can be uploaded to the secret store later.
    """
    log.info("Downloading keytab to %s", output_filename)
    keytab_response = sdk_cmd.cluster_request(
        'GET', "/slave/{}/files/download".format(host_id), params={"path": keytab_absolute_path})
    with open(output_filename, 'wb') as fd:
        for chunk in keytab_response.iter_content(chunk_size=128):
            fd.write(chunk)
    log.info("Downloaded %d bytes to %s", os.stat(output_filename).st_size, output_filename)
def __kdc_api(self, method: str, action: str, json: dict) -> dict:
    """
    Invokes a KDC API command against the remote endpoint.
    :param method: 'get' or 'post'
    :param action: The API action to perform
    :param json: The JSON payload to send
    :return (dict): The API response
    :raises: a generic Exception if the invocation fails.
    """
    url = "{}/api/{}".format(self.get_service_path(), action)
    log.info("Performing KDC API {method} query to: {url}".format(method=method, url=url))
    return sdk_cmd.cluster_request(
        method, url, headers={"content-type": "application/json"}, json=json
    )
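# Example invocation (a sketch; the "principals" action and payload shape are hypothetical and
# only illustrate the method/action/json calling convention used above):
#
#   self.__kdc_api("post", "principals", {"principals": ["client@LOCAL"]})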
def test_dispatcher_task_stdout(setup_spark):
    task_id = service_name.lstrip("/").replace("/", "_")
    task = sdk_cmd._get_task_info(task_id)
    if not task:
        raise Exception("Failed to get '{}' task".format(task_id))

    task_sandbox_path = sdk_cmd.get_task_sandbox_path(task_id)
    if not task_sandbox_path:
        raise Exception("Failed to get '{}' sandbox path".format(task_id))
    agent_id = task["slave_id"]

    task_sandbox = sdk_cmd.cluster_request(
        "GET", "/slave/{}/files/browse?path={}".format(agent_id, task_sandbox_path)
    ).json()
    stdout_file = [f for f in task_sandbox if f["path"].endswith("/stdout")][0]
    assert stdout_file["size"] > 0, "stdout file should have content"
def get_framework_json(framework_name, completed=True):
    response = sdk_cmd.cluster_request("GET", "/mesos/frameworks").json()
    if completed:
        frameworks = sorted(
            response["completed_frameworks"],
            key=lambda x: x['unregistered_time'],
            reverse=True)
    else:
        frameworks = response["frameworks"]
    for framework in frameworks:
        if framework["name"] == framework_name:
            return framework
    raise AssertionError("Framework with name '{}' not found".format(framework_name))
def _dump_task_logs(item: pytest.Item, task_ids: list):
    '''For all of the provided tasks, downloads their task, executor, and agent logs to the
    artifact path for this test.'''
    task_ids_set = set(task_ids)
    cluster_tasks = sdk_cmd.cluster_request('GET', '/mesos/tasks').json()
    matching_tasks_by_agent = {}
    for cluster_task in cluster_tasks['tasks']:
        task_entry = _TaskEntry(cluster_task)
        if task_entry.task_id in task_ids_set:
            agent_tasks = matching_tasks_by_agent.get(task_entry.agent_id, [])
            agent_tasks.append(task_entry)
            matching_tasks_by_agent[task_entry.agent_id] = agent_tasks
    for agent_id, agent_tasks in matching_tasks_by_agent.items():
        try:
            _dump_task_logs_for_agent(item, agent_id, agent_tasks)
        except Exception:
            log.exception('Failed to get logs for agent {}'.format(agent_id))
def wait_for_unresponsive_agent() -> bool:
    try:
        response = sdk_cmd.cluster_request("GET", "/mesos/slaves", retry=False).json()
        agent_statuses = {}
        for agent in response["slaves"]:
            agent_statuses[agent["hostname"]] = agent["active"]
        log.info("Wait for {}=False: {}".format(agent_host, agent_statuses))
        # If no agents were found, try again
        if len(agent_statuses) == 0:
            return True
        # If other agents are listed, but not OUR agent, assume that OUR agent is now inactive.
        # (Shouldn't happen, but just in case...)
        return agent_statuses.get(agent_host, False)
    except Exception as e:
        log.info(e)
        log.info(traceback.format_exc())
        # Try again. Wait for the ip to be definitively inactive.
        return True
def debug_agent_files(agent_id: str) -> Dict[str, Any]:
    # Note: the agent's /files/debug endpoint returns a JSON object mapping virtual paths to
    # real paths on the agent (see _dump_task_logs_for_agent above, which treats it as a dict).
    response = sdk_cmd.cluster_request(
        "GET",
        "/slave/{}/files/debug".format(agent_id),
        retry=False,
        raise_on_error=False,
        log_response=False,
    )
    if is_http_server_error(response.status_code):
        # Retry.
        raise Exception(response)

    if not response.ok:
        # Retry.
        raise Exception(response)

    return response.json()
def _dump_task_logs(item: pytest.Item, task_ids: List[str]) -> None:
    """
    For all of the provided tasks, downloads their task, executor, and agent logs to the
    artifact path for this test.
    """
    task_ids_set = set(task_ids)
    cluster_tasks = sdk_cmd.cluster_request("GET", "/mesos/tasks").json()
    matching_tasks_by_agent: Dict[str, List[_TaskEntry]] = {}
    for cluster_task in cluster_tasks["tasks"]:
        task_entry = _TaskEntry(cluster_task)
        if task_entry.task_id in task_ids_set:
            agent_tasks = matching_tasks_by_agent.get(task_entry.agent_id, [])
            agent_tasks.append(task_entry)
            matching_tasks_by_agent[task_entry.agent_id] = agent_tasks

    for agent_id, agent_tasks in matching_tasks_by_agent.items():
        try:
            _dump_task_logs_for_agent(item, agent_id, agent_tasks)
        except Exception:
            log.exception("Failed to get logs for agent {}".format(agent_id))
def get_summary(with_completed=False, task_name=None) -> list:
    """Returns a summary of all tasks in the cluster, or just a specified task.
    This may be used instead of invoking 'dcos task [--all]' directly.

    Returns a list of Task objects.
    """
    cluster_tasks = sdk_cmd.cluster_request("GET", "/mesos/tasks").json()["tasks"]
    agentid_to_hostname = _get_agentid_to_hostname()
    all_tasks = [Task.parse(entry, agentid_to_hostname) for entry in cluster_tasks]
    output = (all_tasks if with_completed
              else list(filter(lambda t: t.state not in COMPLETED_TASK_STATES, all_tasks)))
    if task_name:
        output = list(filter(lambda t: t.name == task_name, output))
    log.info("Task summary (with_completed={}, task_name=[{}]):\n- {}".format(
        with_completed, task_name, "\n- ".join([str(e) for e in output])))
    return output
def debug_agent_files(agent_id: str) -> Dict[str, Any]:
    # Note: /files/debug returns a JSON object (virtual path => real path), so the empty value
    # on 404 is a dict; the annotation reflects that.
    response = sdk_cmd.cluster_request(
        "GET",
        "/slave/{}/files/debug".format(agent_id),
        retry=False,
        raise_on_error=False,
        log_response=False,
    )
    if is_http_server_error(response.status_code):
        # Retry.
        raise Exception(response)

    if response.status_code == 404:
        return {}

    if not response.ok:
        # Retry.
        raise Exception(response)

    return response.json()
def browse_agent_path(agent_id: str, agent_path: str) -> List[dict]:
    response = sdk_cmd.cluster_request(
        "GET",
        "/slave/{}/files/browse?path={}".format(agent_id, agent_path),
        retry=False,
        raise_on_error=False,
        log_response=False,
    )
    if is_http_server_error(response.status_code):
        # Retry.
        raise Exception(response)

    if response.status_code == 404:
        return []

    if not response.ok:
        # Retry.
        raise Exception(response)

    return response.json()
def get_summary(with_completed: bool = False, task_name: Optional[str] = None) -> List[Task]:
    """Returns a summary of all tasks in the cluster, or just a specified task.
    This may be used instead of invoking 'dcos task [--all]' directly.

    Returns a list of Task objects.
    """
    cluster_tasks = sdk_cmd.cluster_request("GET", "/mesos/tasks").json()["tasks"]
    agentid_to_hostname = _get_agentid_to_hostname()
    all_tasks = [Task.parse(entry, agentid_to_hostname) for entry in cluster_tasks]
    output = (
        all_tasks
        if with_completed
        else list(filter(lambda t: t.state not in COMPLETED_TASK_STATES, all_tasks))
    )
    if task_name:
        output = list(filter(lambda t: t.name == task_name, output))
    log.info(
        "Task summary (with_completed={}, task_name=[{}]):\n- {}".format(
            with_completed, task_name, "\n- ".join([str(e) for e in output])
        )
    )
    return output
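# Example usage (a sketch; the task name is hypothetical):
#
#   running = get_summary(with_completed=False, task_name="hello-0-server")
#   states = [t.state for t in running]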
def _get_service_tasks(
    service_name: str,
    agentid_to_hostname: dict,
    task_prefix: str = "",
    with_completed_tasks: bool = False,
) -> list:
    """Returns a summary of all tasks in the specified Mesos framework.

    Returns a list of Task objects.
    """
    cluster_frameworks = sdk_cmd.cluster_request("GET", "/mesos/frameworks").json()["frameworks"]
    service_tasks: List[Task] = []
    for fwk in cluster_frameworks:
        if not fwk["name"] == service_name or not fwk["active"]:
            continue
        service_tasks += [Task.parse(entry, agentid_to_hostname) for entry in fwk["tasks"]]
        if with_completed_tasks:
            service_tasks += [
                Task.parse(entry, agentid_to_hostname) for entry in fwk["completed_tasks"]
            ]
    if task_prefix:
        service_tasks = [t for t in service_tasks if t.name.startswith(task_prefix)]
    return service_tasks
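# Example usage (a sketch; the service name and task prefix are hypothetical, and
# _get_agentid_to_hostname() is the same helper used by get_summary() above):
#
#   agentid_to_hostname = _get_agentid_to_hostname()
#   broker_tasks = _get_service_tasks("kafka", agentid_to_hostname, task_prefix="kafka-")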
def _get_config_once(app_name):
    return sdk_cmd.cluster_request('GET', _api_url('apps/{}'.format(app_name)), retry=False)
def restart_app(app_name):
    log.info("Restarting {}...".format(app_name))
    # throws on failure:
    sdk_cmd.cluster_request('POST', _api_url('apps/{}/restart'.format(app_name)))
    log.info("Restarted {}.".format(app_name))
def dcos_version() -> str:
    response = sdk_cmd.cluster_request("GET", "/dcos-metadata/dcos-version.json")
    response_json = response.json()
    return str(response_json["version"])
def get_zk_node_data(node_name):
    return sdk_cmd.cluster_request(
        "GET", "/exhibitor/exhibitor/v1/explorer/node-data?key={}".format(node_name)
    ).json()
def get_agents() -> List[Dict[str, Any]]:
    response = sdk_cmd.cluster_request("GET", "/mesos/slaves")
    response_json = response.json()
    return list(response_json["slaves"])
def get_metadata() -> requests.Response:
    # Leading slash added for consistency with every other cluster_request path in this module.
    return sdk_cmd.cluster_request('GET', '/dcos-metadata/bootstrap-config.json', retry=False)