Example #1
0
def _dump_task_logs_for_agent(item: pytest.Item, agent_id: str, agent_tasks: List[_TaskEntry]) -> None:
    """Downloads logs for all provided tasks on one agent, plus the agent's own log.

    :param item: The test item, used to determine the artifact output location.
    :param agent_id: The Mesos agent ID whose task logs should be fetched.
    :param agent_tasks: Entries for the tasks known to be on this agent.
    """
    # Executor path listing on the agent; used to locate each task's sandbox.
    agent_executor_paths = sdk_cmd.cluster_request(
        "GET", "/slave/{}/files/debug".format(agent_id)
    ).json()
    task_byte_count = 0
    for task_entry in agent_tasks:
        try:
            # _dump_task_logs_for_task may return None on an early bail-out,
            # so treat a None result as zero bytes to avoid a TypeError here.
            task_byte_count += (
                _dump_task_logs_for_task(item, agent_id, agent_executor_paths, task_entry) or 0
            )
        except Exception:
            log.exception("Failed to get logs for task {}".format(task_entry))
    log.info(
        "Downloaded {} bytes of logs from {} tasks on agent {}".format(
            task_byte_count, len(agent_tasks), agent_id
        )
    )

    # fetch agent log separately due to its totally different fetch semantics vs the task/executor logs
    if "/slave/log" in agent_executor_paths:
        out_path = _setup_artifact_path(item, "agent_{}.log".format(agent_id))
        stream = sdk_cmd.cluster_request(
            "GET", "/slave/{}/files/download?path=/slave/log".format(agent_id), stream=True
        )
        with open(out_path, "wb") as f:
            for chunk in stream.iter_content(chunk_size=8192):
                f.write(chunk)
Example #2
0
def _grant(user: str, acl: str, description: str, action: str) -> None:
    """Grant `user` permission to perform `action` against `acl`.

    Creates the ACL (idempotently), then attaches the user/action pair to it.
    Both steps tolerate HTTP 409 ("already exists").
    """
    log.info(
        "Granting permission to {user} for {acl}/{action} ({description})".format(
            user=user, acl=acl, action=action, description=description
        )
    )

    # Step 1: ensure the ACL itself exists (201=created, 409=already exists).
    create_response = sdk_cmd.cluster_request(
        "PUT",
        "/acs/api/v1/acls/{acl}".format(acl=acl),
        raise_on_error=False,
        json={"description": description},
    )
    assert create_response.status_code in [201, 409], "{} failed {}: {}".format(
        create_response.url, create_response.status_code, create_response.text
    )

    # Step 2: attach the user/action pair (204=success, 409=already exists).
    assign_response = sdk_cmd.cluster_request(
        "PUT",
        "/acs/api/v1/acls/{acl}/users/{user}/{action}".format(acl=acl, user=user, action=action),
        raise_on_error=False,
    )
    assert assign_response.status_code in [204, 409], "{} failed {}: {}".format(
        assign_response.url, assign_response.status_code, assign_response.text
    )
Example #3
0
def _dump_task_logs_for_task(item: pytest.Item, agent_id: str, agent_executor_paths: dict, task_entry: _TaskEntry):
    '''Downloads the stdout/stderr log files for one task (and its executor) from an agent.

    Returns the number of bytes downloaded, or 0 when nothing could be fetched
    (previously these paths returned None, breaking the caller's byte sum).
    '''
    executor_browse_path = _find_matching_executor_path(agent_executor_paths, task_entry)
    if not executor_browse_path:
        # Expected executor path was not found on this agent. Did Mesos move their files around again?
        log.warning('Unable to find any paths matching task {} in agent {}:\n  {}'.format(
            task_entry, agent_id, '\n  '.join(sorted(agent_executor_paths.keys()))))
        return 0

    # Fetch paths under the executor.
    executor_file_infos = sdk_cmd.cluster_request(
        'GET', '/slave/{}/files/browse?path={}'.format(agent_id, executor_browse_path)).json()

    # Look at the executor's sandbox and check for a 'tasks/' directory.
    # If it has one (due to being a Default Executor), then also fetch file infos for <executor_path>/tasks/<task_id>/
    task_file_infos = []
    if task_entry.executor_id and task_entry.task_id:
        for file_info in executor_file_infos:
            if file_info['mode'].startswith('d') and file_info['path'].endswith('/tasks'):
                task_browse_path = os.path.join(executor_browse_path, 'tasks/{}/'.format(task_entry.task_id))
                try:
                    task_file_infos = sdk_cmd.cluster_request(
                        'GET', '/slave/{}/files/browse?path={}'.format(agent_id, task_browse_path)).json()
                except Exception:
                    # Narrowed from a bare 'except:', which would also swallow KeyboardInterrupt.
                    log.exception('Failed to fetch task sandbox from presumed default executor')

    # Select all log files to be fetched from the above list.
    selected_file_infos = collections.OrderedDict()
    if task_file_infos:
        # Include 'task' and 'executor' annotations in filenames to differentiate between them:
        _select_log_files(item, task_entry.task_id, executor_file_infos, 'executor.', selected_file_infos)
        _select_log_files(item, task_entry.task_id, task_file_infos, 'task.', selected_file_infos)
    else:
        # No annotation needed:
        _select_log_files(item, task_entry.task_id, executor_file_infos, '', selected_file_infos)
    if not selected_file_infos:
        log.warning('Unable to find any stdout/stderr files in above paths for task {}'.format(task_entry))
        return 0

    byte_count = sum([f['size'] for f in selected_file_infos.values()])
    log.info('Downloading {} files ({} bytes) for task {}:{}'.format(
        len(selected_file_infos),
        byte_count,
        task_entry,
        ''.join(['\n  {} ({} bytes)\n    => {}'.format(
            file_info['path'], file_info['size'], path) for path, file_info in selected_file_infos.items()])))

    # Fetch files
    for out_path, file_info in selected_file_infos.items():
        try:
            stream = sdk_cmd.cluster_request(
                'GET', '/slave/{}/files/download?path={}'.format(agent_id, file_info['path']), stream=True)
            with open(out_path, 'wb') as f:
                for chunk in stream.iter_content(chunk_size=8192):
                    f.write(chunk)
        except Exception:
            # Narrowed from a bare 'except:'.
            log.exception('Failed to get file for task {}: {}'.format(task_entry, file_info))
    return byte_count
def get_metrics(package_name, service_name, task_name):
    """Return a list of DC/OS metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from

    Raises an Exception if the task cannot be found or no matching metrics exist.
    """
    # Bug fix: initialize before the loop. Previously a missing task raised
    # NameError ("task_to_check" unbound) instead of the intended Exception.
    task_to_check = None
    tasks = shakedown.get_service_tasks(service_name)
    for task in tasks:
        if task['name'] == task_name:
            task_to_check = task
            break

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    # Pod name is taken to be the first two dash-separated segments of the task name.
    pod_name = '-'.join(task_name.split("-")[:2])
    pod_info = sdk_cmd.svc_cli(package_name, service_name, "pod info {}".format(pod_name), json=True)
    task_info = None
    for task in pod_info:
        if task["info"]["name"] == task_name:
            task_info = task
            break

    if not task_info:
        return []

    task_container_id = task_info["status"]["containerStatus"]["containerId"]["value"]

    # Not related to functionality but consuming this
    # endpoint to verify downstream integrity
    containers_response = sdk_cmd.cluster_request(
        "GET", "/system/v1/agent/{}/metrics/v0/containers".format(agent_id), retry=False)
    reported_container_ids = json.loads(containers_response.text)

    # Simple membership test replaces the original manual flag-setting loop.
    if task_container_id not in reported_container_ids:
        raise ValueError("The metrics /container endpoint returned {}, expecting {} to be returned as well".format(
            reported_container_ids, task_container_id))

    app_response = sdk_cmd.cluster_request(
        "GET", "/system/v1/agent/{}/metrics/v0/containers/{}/app".format(agent_id, task_container_id), retry=False)
    app_json = json.loads(app_response.text)
    if app_json['dimensions']['executor_id'] == executor_id:
        return app_json['datapoints']

    raise Exception("No metrics found")
def update_app(app_name, config, timeout=TIMEOUT_SECONDS, wait_for_completed_deployment=True, force=True):
    """Pushes `config` to the Marathon app `app_name`, optionally waiting for the deployment."""
    if "env" in config:
        env = config["env"]
        log.info("Environment for marathon app {} ({} values):".format(app_name, len(env)))
        for key in sorted(env):
            log.info("  {}={}".format(key, env[key]))

    suffix = "?force=true" if force else ""

    # throws on failure:
    sdk_cmd.cluster_request(
        'PUT', _api_url('apps/{}{}'.format(app_name, suffix)), log_args=False, json=config)

    if wait_for_completed_deployment:
        log.info("Waiting for Marathon deployment of {} to complete...".format(app_name))
        shakedown.deployment_wait(app_id=app_name, timeout=timeout)
Example #6
0
def get_all_status_history(task_name: str, with_completed_tasks: bool = True) -> list:
    """Fetches the chronological status history ('TASK_STARTING', 'TASK_KILLED', ...)
    across every instance of the named task, oldest first.

    : param task_name: Name of the task to look up.
    : param with_completed_tasks: When True, also include statuses from instances that
    have since exited; these may belong to prior versions of the service, unlike
    get_service_tasks().
    """
    cluster_tasks = sdk_cmd.cluster_request("GET", "/mesos/tasks").json()["tasks"]
    statuses: List[Dict[str, Any]] = []
    for entry in cluster_tasks:
        if entry["name"] != task_name:
            continue  # wrong task name
        if not with_completed_tasks and entry["state"] in COMPLETED_TASK_STATES:
            continue  # completed instance excluded by request
        statuses.extend(entry["statuses"])
    history = sorted(statuses, key=lambda s: s["timestamp"])
    log.info(
        "Status history for task {} (with_completed={}): {}".format(
            task_name, with_completed_tasks, ", ".join([s["state"] for s in history])
        )
    )
    return history
Example #7
0
def download_agent_path(
    agent_id: str, agent_file_path: str, output_file_path: str, chunk_size: int = 8192
) -> None:
    """Streams a file from a Mesos agent down to a local file.

    Server errors (5xx) and other non-OK statuses raise so a caller's retry
    logic can kick in; a 404 means there is nothing to download.
    """
    stream = sdk_cmd.cluster_request(
        "GET",
        "/slave/{}/files/download?path={}".format(agent_id, agent_file_path),
        retry=False,
        raise_on_error=False,
        log_response=False,
        stream=True,
    )

    status = stream.status_code
    if is_http_server_error(status):
        raise Exception(stream)

    if status == 404:
        # File absent on the agent: nothing to write.
        return

    if not stream.ok:
        # Retry.
        raise Exception(stream)

    with open(output_file_path, "wb") as out_file:
        for piece in stream.iter_content(chunk_size=chunk_size):
            out_file.write(piece)
Example #8
0
def fetch_dcos_ca_bundle_contents() -> bytes:
    """Fetches the DC/OS CA bundle and returns its raw contents.

    Return annotation corrected from str to bytes: `resp.content` is bytes
    (matching the sibling definition of this function elsewhere in the file).

    :raises Exception: if the response body is empty.
    """
    resp = sdk_cmd.cluster_request("GET", "/ca/dcos-ca.crt")
    cert = resp.content
    if not cert:
        log.error("Error fetching DC/OS CA bundle")
        # Typo fix: "Errot" -> "Error".
        raise Exception("Error fetching DC/OS CA bundle")

    return cert
Example #9
0
def get_task_host(task):
    """Returns the hostname of the agent that hosts the given task."""
    agent_id = task["slave_id"]
    log.info("Retrieving agents information for {}".format(agent_id))
    matching_agents = sdk_cmd.cluster_request(
        "GET", "/mesos/slaves?slave_id={}".format(agent_id)
    ).json()["slaves"]
    # Exactly one agent must match the requested ID.
    assert len(matching_agents) == 1, (
        "Agent's details do not match the expectations for agent ID {}".format(agent_id)
    )
    return matching_agents[0]["hostname"]
def fetch_dcos_ca_bundle_contents() -> bytes:
    """Fetches the DC/OS CA bundle and returns its raw bytes.

    :raises Exception: if the response body is empty.
    """
    resp = sdk_cmd.cluster_request("GET", "/ca/dcos-ca.crt")
    cert = resp.content
    if not cert:
        log.error("Error fetching DC/OS CA bundle")
        # Typo fix: "Errot" -> "Error".
        raise Exception("Error fetching DC/OS CA bundle")

    return cert
Example #11
0
def dcos_ca_bundle():
    """
    Retrieves the DC/OS CA bundle and returns its content as an ASCII string.
    """
    resp = sdk_cmd.cluster_request('GET', '/ca/dcos-ca.crt')
    cert = resp.content.decode('ascii')
    # `decode()` always returns a str, so the original `is not None` check could
    # never fail; assert on truthiness so an empty bundle is actually caught.
    assert cert
    return cert
Example #12
0
def dcos_ca_bundle():
    """
    Retrieves the DC/OS CA bundle and returns its content as an ASCII string.
    """
    resp = sdk_cmd.cluster_request('GET', '/ca/dcos-ca.crt')
    cert = resp.content.decode('ascii')
    # `decode()` always returns a str, so the original `is not None` check could
    # never fail; assert on truthiness so an empty bundle is actually caught.
    assert cert
    return cert
Example #13
0
def _dump_mesos_state(item: pytest.Item):
    '''Saves the Mesos master's state and agent list as JSON artifacts for this test.'''
    for endpoint in ['state.json', 'slaves']:
        response = sdk_cmd.cluster_request(
            'GET', '/mesos/{}'.format(endpoint), verify=False, raise_on_error=False)
        if not response.ok:
            continue
        # Strip a trailing '.json' so the artifact name doesn't repeat the suffix.
        if endpoint.endswith('.json'):
            endpoint = endpoint[:-len('.json')]
        with open(_setup_artifact_path(item, 'mesos_{}.json'.format(endpoint)), 'w') as f:
            f.write(response.text)
Example #14
0
def _dump_mesos_state(item: pytest.Item) -> None:
    """Fetches Mesos master state endpoints and writes each to this test's artifact path."""
    for name in ["state.json", "slaves"]:
        r = sdk_cmd.cluster_request("GET", "/mesos/{}".format(name), raise_on_error=False)
        if r.ok:
            # Avoid a doubled '.json' suffix in the artifact filename.
            out_name = name[: -len(".json")] if name.endswith(".json") else name
            with open(_setup_artifact_path(item, "mesos_{}.json".format(out_name)), "w") as f:
                f.write(r.text)
Example #15
0
def _dump_mesos_state(item: pytest.Item):
    '''Writes the Mesos master's state and agent listing into this test's artifact directory.'''
    for name in ['state.json', 'slaves']:
        response = sdk_cmd.cluster_request(
            'GET', '/mesos/{}'.format(name), verify=False, raise_on_error=False)
        if response.ok:
            base = name[:-len('.json')] if name.endswith('.json') else name  # avoid duplicate '.json'
            artifact = _setup_artifact_path(item, 'mesos_{}.json'.format(base))
            with open(artifact, 'w') as f:
                f.write(response.text)
Example #16
0
def _grant(user: str, acl: str, description: str, action: str="create") -> None:
    """Ensures `user` may perform `action` on `acl`, creating the ACL if needed.

    Both HTTP calls tolerate 409 ("already exists"), so the grant is idempotent.
    """
    log.info('Granting permission to {user} for {acl}/{action} ({description})'.format(
        user=user, acl=acl, action=action, description=description))

    # Create the ACL if it is missing (201=created, 409=already exists).
    create_result = sdk_cmd.cluster_request(
        'PUT', '/acs/api/v1/acls/{acl}'.format(acl=acl),
        raise_on_error=False,
        json={'description': description})
    assert create_result.status_code in [201, 409, ], '{} failed {}: {}'.format(
        create_result.url, create_result.status_code, create_result.text)

    # Attach the user/action pair to the ACL (204=success, 409=already exists).
    assign_result = sdk_cmd.cluster_request(
        'PUT', '/acs/api/v1/acls/{acl}/users/{user}/{action}'.format(acl=acl, user=user, action=action),
        raise_on_error=False)
    assert assign_result.status_code in [204, 409, ], '{} failed {}: {}'.format(
        assign_result.url, assign_result.status_code, assign_result.text)
 def _app_exists():
     """Returns whether the Marathon app `app_name` currently exists."""
     response = sdk_cmd.cluster_request(
         "GET", _api_url("apps/{}".format(app_name)), raise_on_error=False
     )
     if response.status_code == 404:
         return False  # app doesn't exist
     response.raise_for_status()  # throw exception for (non-404) errors
     return True  # didn't get 404, and no other error code was returned, so app must exist.
Example #18
0
def _dump_task_logs_for_agent(item: pytest.Item, agent_id: str, agent_tasks: list):
    '''Downloads task/executor logs plus the agent log for all provided tasks on one agent.'''
    agent_executor_paths = sdk_cmd.cluster_request('GET', '/slave/{}/files/debug'.format(agent_id)).json()
    task_byte_count = 0
    for task_entry in agent_tasks:
        try:
            # _dump_task_logs_for_task returns None when it bails out early, so
            # fall back to 0 to avoid a TypeError in the running sum.
            task_byte_count += _dump_task_logs_for_task(item, agent_id, agent_executor_paths, task_entry) or 0
        except Exception:
            log.exception('Failed to get logs for task {}'.format(task_entry))
    log.info('Downloaded {} bytes of logs from {} tasks on agent {}'.format(
        task_byte_count, len(agent_tasks), agent_id))

    # fetch agent log separately due to its totally different fetch semantics vs the task/executor logs
    if '/slave/log' in agent_executor_paths:
        out_path = _setup_artifact_path(item, 'agent_{}.log'.format(agent_id))
        stream = sdk_cmd.cluster_request(
            'GET', '/slave/{}/files/download?path=/slave/log'.format(agent_id), stream=True)
        with open(out_path, 'wb') as f:
            for chunk in stream.iter_content(chunk_size=8192):
                f.write(chunk)
Example #19
0
def get_summary(with_completed=False):
    '''Summarizes cluster task information, as an alternative to invoking
    'dcos task [--all]' directly.

    Returns a list of Task objects.
    '''
    cluster_tasks = sdk_cmd.cluster_request('GET', '/mesos/tasks').json()
    cluster_agents = sdk_cmd.cluster_request('GET', '/mesos/slaves').json()
    all_tasks = [Task.parse(entry, cluster_agents) for entry in cluster_tasks['tasks']]
    if with_completed:
        output = all_tasks
    else:
        # Drop tasks that have already finished.
        output = [t for t in all_tasks if t.state not in COMPLETED_TASK_STATES]
    log.info('Task summary (with_completed={}):\n- {}'.format(
        with_completed, '\n- '.join([str(e) for e in output])))
    return output
 def _update():
     """PUTs the new app config to Marathon and wraps the deployment response."""
     params = {"force": "true"} if force else {}
     response = sdk_cmd.cluster_request(
         "PUT",
         _api_url("apps/{}".format(app_name)),
         params=params,
         json=config,
         log_args=False,
         raise_on_error=False,
     )
     return _handle_marathon_deployment_response(response)
Example #21
0
def test_nodes_deploy_to_local_region_by_default(configure_universe, local_service):
    """Verifies every pod lands in the master's (i.e. local) region by default."""
    # The master's fault-domain region is by definition the local region.
    state = sdk_cmd.cluster_request("GET", "/mesos/state").json()
    local_region = state["domain"]["fault_domain"]["region"]["name"]

    for pod_name in POD_NAMES:
        pod_region = get_pod_region(config.SERVICE_NAME, pod_name)
        assert pod_region == local_region
 def _install() -> MarathonDeploymentResponse:
     """POSTs the app definition; on a 409 conflict, removes the leftover app then re-raises."""
     response = sdk_cmd.cluster_request(
         "POST", _api_url("apps"), json=app_definition, log_args=False, raise_on_error=False
     )
     try:
         return MarathonDeploymentResponse(response)
     except requests.HTTPError as e:
         if e.response.status_code == 409:
             # App exists already, left over from previous run? Delete and try again.
             destroy_app(app_name, timeout=timeout)
         raise e
Example #23
0
def get_overlay_subnet(network_name='dcos'):
    """Returns the subnet of the named overlay network, asserting that it exists."""
    state = sdk_cmd.cluster_request(
        "GET", "/mesos/overlay-master/state").json()
    # First overlay whose name matches wins, mirroring a break-on-first-match loop.
    matches = [overlay['subnet'] for overlay in state['network']['overlays']
               if overlay['name'] == network_name]
    subnet = matches[0] if matches else None

    assert subnet is not None, "Unable to find subnet information for provided network name: {}".format(
        network_name)
    return subnet
Example #24
0
def get_overlay_subnet(network_name="dcos"):
    """Looks up `network_name` in the Mesos overlay-master state and returns its subnet."""
    network_info = sdk_cmd.cluster_request("GET", "/mesos/overlay-master/state").json()
    # Take the first matching overlay, or None if absent.
    subnet = next(
        (overlay["subnet"] for overlay in network_info["network"]["overlays"]
         if overlay["name"] == network_name),
        None,
    )
    assert (
        subnet is not None
    ), "Unable to find subnet information for provided network name: {}".format(network_name)
    return subnet
Example #25
0
def dump_mesos_state(item: pytest.Item):
    """Saves the Mesos master's state and agent list as JSON artifacts for this test."""
    for endpoint in ['state.json', 'slaves']:
        response = sdk_cmd.cluster_request(
            'GET', '/mesos/{}'.format(endpoint), verify=False, raise_on_error=False)
        if not response.ok:
            continue
        # Trim '.json' so the artifact name doesn't carry the suffix twice.
        trimmed = endpoint[:-len('.json')] if endpoint.endswith('.json') else endpoint
        with open(setup_artifact_path(item, 'mesos_{}.json'.format(trimmed)), 'w') as f:
            f.write(response.text)
Example #26
0
def _get_master_public_ip() -> str:
    """
    :return (str): The public IP of the master node in the DC/OS cluster.
    :raises KeyError: if the cluster metadata lacks a PUBLIC_IPV4 entry.
    """
    metadata = sdk_cmd.cluster_request("GET", "/metadata", verify=False).json()
    if "PUBLIC_IPV4" not in metadata:
        raise KeyError("Cluster metadata does not include master's public ip: {response}".format(
            response=metadata))

    public_ip = metadata["PUBLIC_IPV4"]
    log.info("Master public ip is {public_ip}".format(public_ip=public_ip))
    return public_ip
Example #27
0
 def _wait_for_active_framework() -> bool:
     """True once at least one active framework named `service_name` is registered."""
     frameworks = sdk_cmd.cluster_request("GET", "/mesos/frameworks").json()["frameworks"]
     return any(fwk["name"] == service_name and fwk["active"] for fwk in frameworks)
Example #28
0
def get_failed_task_count(service_name: str, retry: bool = False) -> int:
    """Sums the fatal-terminal task state counters reported for `service_name`.

    Returns 0 when the history service has no entry for the service.
    """
    response = sdk_cmd.cluster_request(
        "GET", "/dcos-history-service/history/last", retry=retry
    )
    response.raise_for_status()
    frameworks = response.json()["frameworks"]
    matches = [fwk for fwk in frameworks if fwk.get("name") == service_name]
    if not matches:
        return 0

    # Service names are expected to be unique within the history snapshot.
    assert len(matches) == 1

    entry = matches[0]
    return sum(entry.get(state, 0) for state in FATAL_TERMINAL_TASK_STATES)
Example #29
0
def get_failed_task_count(service_name: str, retry: bool = False) -> int:
    """Counts tasks of `service_name` that ended in a fatal terminal state,
    according to the DC/OS history service. Returns 0 for unknown services.
    """
    response = sdk_cmd.cluster_request(
        "GET", "/dcos-history-service/history/last", retry=retry
    )
    response.raise_for_status()
    matching = [
        fwk for fwk in response.json()["frameworks"] if fwk.get("name") == service_name
    ]
    if not matching:
        return 0

    # At most one history entry should exist per service name.
    assert len(matching) == 1

    return sum(matching[0].get(status, 0) for status in FATAL_TERMINAL_TASK_STATES)
Example #30
0
def get_status_history(task_name: str) -> list:
    '''Collects every status value ('TASK_STARTING', 'TASK_KILLED', etc) recorded for the
    named task, ordered from oldest to newest.
    '''
    cluster_tasks = sdk_cmd.cluster_request('GET', '/mesos/tasks').json()
    statuses = []
    for entry in cluster_tasks['tasks']:
        if entry['name'] == task_name:
            statuses.extend(entry['statuses'])
    ordered = sorted(statuses, key=lambda status: status['timestamp'])
    history = [status['state'] for status in ordered]
    log.info('Status history for task {}: {}'.format(task_name, ', '.join(history)))
    return history
def filter_apps_by_id(filter_id, mom=None):
    """Return all Marathon apps with an ID matching `filter_id`.

    "jenkins" will return all Marathon apps that begin with "jenkins".

    Args:
        filter_id: String to filter Marathon app IDs
        mom: Optional Marathon-on-Marathon instance to query.

    Returns: Marathon response

    """
    return sdk_cmd.cluster_request(
        'GET', _api_url('apps/?id={}'.format(filter_id), mom), retry=False)
Example #32
0
def _copy_file_to_localhost(host_id: str, keytab_absolute_path: str, output_filename: str):
    """
    Streams the KDC-generated keytab out of the KDC server's container onto the local
    machine, so that a later step can upload it to the secret store.
    """
    log.info("Downloading keytab to %s", output_filename)

    response = sdk_cmd.cluster_request(
        'GET', "/slave/{}/files/download".format(host_id), params={"path": keytab_absolute_path})
    with open(output_filename, 'wb') as out_file:
        for piece in response.iter_content(chunk_size=128):
            out_file.write(piece)

    log.info("Downloaded %d bytes to %s", os.stat(output_filename).st_size, output_filename)
Example #33
0
def update_app(app_name,
               config,
               timeout=TIMEOUT_SECONDS,
               wait_for_completed_deployment=True,
               force=True):
    """Pushes `config` to the Marathon app `app_name`; optionally blocks until deployed."""
    if "env" in config:
        env = config["env"]
        log.info("Environment for marathon app {} ({} values):".format(app_name, len(env)))
        for key in sorted(env):
            log.info("  {}={}".format(key, env[key]))

    suffix = "?force=true" if force else ""

    # throws on failure:
    sdk_cmd.cluster_request(
        'PUT', _api_url('apps/{}{}'.format(app_name, suffix)), log_args=False, json=config)

    if wait_for_completed_deployment:
        log.info("Waiting for Marathon deployment of {} to complete...".format(app_name))
        shakedown.deployment_wait(app_id=app_name, timeout=timeout)
Example #34
0
    def __kdc_api(self, method: str, action: str, json: dict) -> dict:
        """
        Invokes a KDC API command against the remote endpoint.

        :param method: HTTP method to use ('get' or 'post').
        :param action: The API action to perform (appended to the service's /api/ path).
        :param json: The JSON payload to send as the request body.
        :return: The result of the cluster request.
            NOTE(review): annotated as dict, but cluster_request elsewhere in this
            file returns the HTTP response object (callers invoke .json()/.status_code
            on it) — confirm whether this should call .json() or drop the annotation.
        :raises a generic Exception if the invocation fails.
        """
        url = "{}/api/{}".format(self.get_service_path(), action)
        log.info("Performing KDC API {method} query to: {url}".format(method=method, url=url))

        return sdk_cmd.cluster_request(
            method, url, headers={"content-type": "application/json"}, json=json
        )
def test_dispatcher_task_stdout(setup_spark):
    """Checks that the dispatcher task has written something to its stdout file."""
    # Task ID mirrors the service name with leading '/' stripped and '/' -> '_'.
    task_id = service_name.lstrip("/").replace("/", "_")
    task = sdk_cmd._get_task_info(task_id)
    if not task:
        raise Exception("Failed to get '{}' task".format(task_id))

    task_sandbox_path = sdk_cmd.get_task_sandbox_path(task_id)
    if not task_sandbox_path:
        raise Exception("Failed to get '{}' sandbox path".format(task_id))

    sandbox_files = sdk_cmd.cluster_request(
        "GET", "/slave/{}/files/browse?path={}".format(task["slave_id"], task_sandbox_path)
    ).json()
    stdout_file = [entry for entry in sandbox_files if entry["path"].endswith("/stdout")][0]
    assert stdout_file["size"] > 0, "stdout file should have content"
Example #36
0
def get_framework_json(framework_name, completed=True):
    """Returns the framework entry named `framework_name`, searching either the
    completed frameworks (most recently unregistered first) or the active ones."""
    state = sdk_cmd.cluster_request("GET", "/mesos/frameworks").json()

    if completed:
        candidates = sorted(
            state["completed_frameworks"],
            key=lambda fwk: fwk['unregistered_time'],
            reverse=True)
    else:
        candidates = state["frameworks"]

    for fwk in candidates:
        if fwk["name"] == framework_name:
            return fwk

    raise AssertionError(
        "Framework with name '{}' not found".format(framework_name))
Example #37
0
def _dump_task_logs(item: pytest.Item, task_ids: list):
    '''For all of the provided tasks, downloads their task, executor, and agent logs to the artifact path for this test.'''
    task_ids_set = set(task_ids)
    cluster_tasks = sdk_cmd.cluster_request('GET', '/mesos/tasks').json()
    matching_tasks_by_agent = {}
    for cluster_task in cluster_tasks['tasks']:
        task_entry = _TaskEntry(cluster_task)
        if task_entry.task_id in task_ids_set:
            # Group matching tasks by the agent that hosts them.
            matching_tasks_by_agent.setdefault(task_entry.agent_id, []).append(task_entry)

    for agent_id, agent_tasks in matching_tasks_by_agent.items():
        try:
            _dump_task_logs_for_agent(item, agent_id, agent_tasks)
        except Exception:
            # Narrowed from a bare 'except:', which would also swallow
            # KeyboardInterrupt/SystemExit.
            log.exception('Failed to get logs for agent {}'.format(agent_id))
def _dump_task_logs(item: pytest.Item, task_ids: list):
    '''For all of the provided tasks, downloads their task, executor, and agent logs to the artifact path for this test.'''
    task_ids_set = set(task_ids)
    cluster_tasks = sdk_cmd.cluster_request('GET', '/mesos/tasks').json()
    matching_tasks_by_agent = {}
    for cluster_task in cluster_tasks['tasks']:
        task_entry = _TaskEntry(cluster_task)
        if task_entry.task_id in task_ids_set:
            # Group matching tasks by the agent that hosts them.
            matching_tasks_by_agent.setdefault(task_entry.agent_id, []).append(task_entry)

    for agent_id, agent_tasks in matching_tasks_by_agent.items():
        try:
            _dump_task_logs_for_agent(item, agent_id, agent_tasks)
        except Exception:
            # Narrowed from a bare 'except:', which would also swallow
            # KeyboardInterrupt/SystemExit.
            log.exception('Failed to get logs for agent {}'.format(agent_id))
def get_status_history(task_name: str) -> list:
    '''Gathers all recorded status values ('TASK_STARTING', 'TASK_KILLED', etc) for the
    named task, sorted chronologically from oldest to newest.
    '''
    cluster_tasks = sdk_cmd.cluster_request('GET', '/mesos/tasks').json()
    statuses = []
    for task in cluster_tasks['tasks']:
        if task['name'] == task_name:
            statuses += task['statuses']
    statuses.sort(key=lambda status: status['timestamp'])
    history = [status['state'] for status in statuses]
    log.info('Status history for task {}: {}'.format(task_name,
                                                     ', '.join(history)))
    return history
Example #40
0
 def wait_for_unresponsive_agent():
     """Polling predicate used while waiting for `agent_host` to be reported inactive."""
     try:
         slaves = sdk_cmd.cluster_request("GET", "/mesos/slaves", retry=False).json()["slaves"]
         agent_statuses = {agent["hostname"]: agent["active"] for agent in slaves}
         log.info("Wait for {}=False: {}".format(agent_host, agent_statuses))
         # If no agents were found, try again
         if not agent_statuses:
             return True
         # If other agents are listed, but not OUR agent, assume that OUR agent is now inactive.
         # (Shouldn't happen, but just in case...)
         return agent_statuses.get(agent_host, False)
     except Exception as e:
         log.info(e)
         log.info(traceback.format_exc())
         # Try again. Wait for the ip to be definitively inactive.
         return True
Example #41
0
 def wait_for_unresponsive_agent() -> bool:
     """Polling predicate used while waiting for `agent_host` to be reported inactive."""
     try:
         response = sdk_cmd.cluster_request("GET", "/mesos/slaves", retry=False).json()
         agent_statuses = {agent["hostname"]: agent["active"] for agent in response["slaves"]}
         log.info("Wait for {}=False: {}".format(agent_host, agent_statuses))
         # If no agents were found, try again
         if not agent_statuses:
             return True
         # If other agents are listed, but not OUR agent, assume that OUR agent is now inactive.
         # (Shouldn't happen, but just in case...)
         return agent_statuses.get(agent_host, False)
     except Exception as e:
         log.info(e)
         log.info(traceback.format_exc())
         # Try again. Wait for the ip to be definitively inactive.
         return True
Example #42
0
def debug_agent_files(agent_id: str) -> List[str]:
    """Fetches the files/debug listing from an agent.

    Raises on server errors and any other non-OK status so callers can retry.
    """
    response = sdk_cmd.cluster_request(
        "GET",
        "/slave/{}/files/debug".format(agent_id),
        retry=False,
        raise_on_error=False,
        log_response=False,
    )

    # Raise (to trigger a retry) on server errors or any other non-OK response.
    if is_http_server_error(response.status_code) or not response.ok:
        raise Exception(response)

    return response.json()
Example #43
0
def _dump_task_logs(item: pytest.Item, task_ids: List[str]) -> None:
    """
    Downloads task, executor, and agent logs for each provided task into this test's artifact path.
    """
    wanted = set(task_ids)
    cluster_tasks = sdk_cmd.cluster_request("GET", "/mesos/tasks").json()
    matching_tasks_by_agent: Dict[str, List[_TaskEntry]] = {}
    for cluster_task in cluster_tasks["tasks"]:
        entry = _TaskEntry(cluster_task)
        if entry.task_id in wanted:
            # Group matching tasks by their hosting agent.
            matching_tasks_by_agent.setdefault(entry.agent_id, []).append(entry)

    for agent_id, agent_tasks in matching_tasks_by_agent.items():
        try:
            _dump_task_logs_for_agent(item, agent_id, agent_tasks)
        except Exception:
            log.exception("Failed to get logs for agent {}".format(agent_id))
Example #44
0
def get_summary(with_completed: bool = False, task_name: Optional[str] = None) -> List[Task]:
    """Returns a summary of all cluster tasks in the cluster, or just a specified task.
    This may be used instead of invoking 'dcos task [--all]' directly.

    Returns a list of Task objects.
    """
    cluster_tasks = sdk_cmd.cluster_request("GET", "/mesos/tasks").json()["tasks"]
    agentid_to_hostname = _get_agentid_to_hostname()
    output = [Task.parse(entry, agentid_to_hostname) for entry in cluster_tasks]
    # Completed tasks are filtered out unless explicitly requested.
    if not with_completed:
        output = [t for t in output if t.state not in COMPLETED_TASK_STATES]
    if task_name:
        output = [t for t in output if t.name == task_name]
    log.info("Task summary (with_completed={}, task_name=[{}]):\n- {}".format(
        with_completed, task_name, "\n- ".join([str(e) for e in output])))
    return output
Example #45
0
def debug_agent_files(agent_id: str) -> List[str]:
    """Fetches the sandbox file debug listing for the specified Mesos agent.

    Returns an empty list when the agent has no debug listing (HTTP 404).
    Raises an Exception on server errors or other failures so that a caller
    wrapping this function in a retry loop will retry.
    """
    response = sdk_cmd.cluster_request(
        "GET",
        "/slave/{}/files/debug".format(agent_id),
        retry=False,
        raise_on_error=False,
        log_response=False,
    )

    if is_http_server_error(response.status_code):
        # Retry.
        raise Exception(response)

    if response.status_code == 404:
        # BUGFIX: previously returned {}, contradicting the List[str] return
        # type; return [] to match the annotation and browse_agent_path().
        return []

    if not response.ok:
        # Retry.
        raise Exception(response)

    return response.json()
Example #46
0
def browse_agent_path(agent_id: str, agent_path: str) -> List[dict]:
    """Lists the files under the given path on the specified Mesos agent.

    Returns an empty list when the path does not exist (HTTP 404); raises an
    Exception on any other failure so that retry-wrapping callers will retry.
    """
    resp = sdk_cmd.cluster_request(
        "GET",
        "/slave/{}/files/browse?path={}".format(agent_id, agent_path),
        retry=False,
        raise_on_error=False,
        log_response=False,
    )

    # A missing path is not an error: report it as an empty listing.
    if resp.status_code == 404:
        return []

    # Anything else that isn't a success (including 5xx) should be retried.
    if is_http_server_error(resp.status_code) or not resp.ok:
        raise Exception(resp)

    return resp.json()
Example #47
0
def get_summary(with_completed: bool = False, task_name: Optional[str] = None) -> List[Task]:
    """Returns a summary of all cluster tasks in the cluster, or just a specified task.
    This may be used instead of invoking 'dcos task [--all]' directly.

    Returns a list of Task objects.
    """
    raw_tasks = sdk_cmd.cluster_request("GET", "/mesos/tasks").json()["tasks"]
    hostnames_by_agentid = _get_agentid_to_hostname()
    tasks = [Task.parse(entry, hostnames_by_agentid) for entry in raw_tasks]
    # Drop completed tasks unless the caller asked for them.
    if not with_completed:
        tasks = [t for t in tasks if t.state not in COMPLETED_TASK_STATES]
    if task_name:
        tasks = [t for t in tasks if t.name == task_name]
    log.info(
        "Task summary (with_completed={}, task_name=[{}]):\n- {}".format(
            with_completed, task_name, "\n- ".join([str(e) for e in tasks])
        )
    )
    return tasks
Example #48
0
def _get_service_tasks(
    service_name: str,
    agentid_to_hostname: dict,
    task_prefix: str = "",
    with_completed_tasks: bool = False,
) -> list:
    """Returns a summary of all tasks in the specified Mesos framework.

    Returns a list of Task objects.
    """
    frameworks = sdk_cmd.cluster_request("GET", "/mesos/frameworks").json()["frameworks"]
    tasks: List[Task] = []
    for framework in frameworks:
        # Only consider active frameworks with the exact requested name.
        if framework["name"] != service_name or not framework["active"]:
            continue
        entries = list(framework["tasks"])
        if with_completed_tasks:
            entries += framework["completed_tasks"]
        tasks.extend(Task.parse(entry, agentid_to_hostname) for entry in entries)
    if task_prefix:
        tasks = [t for t in tasks if t.name.startswith(task_prefix)]
    return tasks
def _get_config_once(app_name):
    """Fetches the Marathon app definition in a single attempt (no retries)."""
    endpoint = _api_url('apps/{}'.format(app_name))
    return sdk_cmd.cluster_request('GET', endpoint, retry=False)
def restart_app(app_name):
    """Triggers a restart of the named Marathon app, raising on request failure."""
    log.info("Restarting {}...".format(app_name))
    endpoint = _api_url('apps/{}/restart'.format(app_name))
    # cluster_request raises on a failed response.
    sdk_cmd.cluster_request('POST', endpoint)
    log.info("Restarted {}.".format(app_name))
Example #51
0
def dcos_version() -> str:
    """Returns the DC/OS version string reported by the cluster metadata endpoint."""
    metadata = sdk_cmd.cluster_request("GET", "/dcos-metadata/dcos-version.json").json()
    return str(metadata["version"])
Example #52
0
 def get_zk_node_data(node_name):
     """Fetches the data stored at a ZooKeeper node via the Exhibitor explorer API."""
     path = "/exhibitor/exhibitor/v1/explorer/node-data?key={}".format(node_name)
     return sdk_cmd.cluster_request("GET", path).json()
Example #53
0
def get_agents() -> List[Dict[str, Any]]:
    """Returns the cluster's Mesos agents as a list of slave dicts."""
    slaves_json = sdk_cmd.cluster_request("GET", "/mesos/slaves").json()
    return list(slaves_json["slaves"])
Example #54
0
def get_metadata() -> requests.Response:
    """Fetches the cluster bootstrap-config metadata in a single attempt (no retries)."""
    endpoint = 'dcos-metadata/bootstrap-config.json'
    return sdk_cmd.cluster_request('GET', endpoint, retry=False)
Example #55
0
 def _wait_for_active_framework() -> bool:
     """True once at least one active framework named `service_name` is registered."""
     frameworks = sdk_cmd.cluster_request("GET", "/mesos/frameworks").json()["frameworks"]
     # any() short-circuits, unlike the len(list(filter(...))) > 0 form.
     return any(fwk["name"] == service_name and fwk["active"] for fwk in frameworks)