Ejemplo n.º 1
0
def get_kubernetes_base_url():
    """Computes the base URL of the Kubernetes master.

    When the KUBERNETES_API environment variable is defined, its value is
    returned verbatim (even if it is empty). Otherwise the URL is produced by
    filling the module-level KUBERNETES_API template with the values of the
    KUBERNETES_SERVICE_HOST and KUBERNETES_SERVICE_PORT environment
    variables.

    Returns:
        The base URL for the Kubernetes master.

    Raises:
        CollectorError: if there is no override and KUBERNETES_SERVICE_HOST
        or KUBERNETES_SERVICE_PORT is not defined or empty.
    """
    override = os.environ.get('KUBERNETES_API')
    if override is not None:
        return override

    # The host is validated before the port, so a missing host is the first
    # problem to be reported.
    endpoint = []
    for variable in ('KUBERNETES_SERVICE_HOST', 'KUBERNETES_SERVICE_PORT'):
        value = os.environ.get(variable)
        if not value:
            raise collector_error.CollectorError(
                '%s environment variable is not set' % variable)
        endpoint.append(value)

    return KUBERNETES_API % tuple(endpoint)
Ejemplo n.º 2
0
def get_rcontrollers(gs):
    """Gets the list of replication controllers in the current cluster.

    The result is served from the replication controllers cache when an
    entry exists; otherwise it is fetched from Kubernetes, wrapped, and
    stored in the cache.

    Args:
        gs: global state.

    Returns:
        list of wrapped replication controller objects.
        Each element in the list is the result of
        utilities.wrap_object(rcontroller, 'ReplicationController', ...)

    Raises:
        CollectorError: in case of failure to fetch data from Kubernetes.
        Other exceptions may be raised due to execution errors.
    """
    rcontrollers, ts = gs.get_rcontrollers_cache().lookup('')
    if ts is not None:
        app.logger.debug(
            'get_rcontrollers() cache hit returns %d rcontrollers',
            len(rcontrollers))
        return rcontrollers

    url = get_kubernetes_base_url() + '/replicationcontrollers'

    try:
        result = fetch_data(gs, url)
    except Exception:
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        app.logger.exception(msg)
        raise collector_error.CollectorError(msg)

    now = time.time()
    if not (isinstance(result, dict) and 'items' in result):
        msg = 'invalid result when fetching %s' % url
        # No exception is being handled here, so log with error() rather
        # than exception(), which would append an empty traceback.
        app.logger.error(msg)
        raise collector_error.CollectorError(msg)

    rcontrollers = []
    for rcontroller in result['items']:
        name = utilities.get_attribute(rcontroller, ['metadata', 'name'])
        if not utilities.valid_string(name):
            # an invalid replication controller without a valid rcontroller ID.
            continue

        rcontrollers.append(
            utilities.wrap_object(rcontroller, 'ReplicationController', name,
                                  now))

    ret_value = gs.get_rcontrollers_cache().update('', rcontrollers, now)
    app.logger.info('get_rcontrollers() returns %d rcontrollers',
                    len(rcontrollers))
    return ret_value
Ejemplo n.º 3
0
def get_nodes(gs):
    """Gets the list of all nodes in the current cluster.

    The result is served from the nodes cache when an entry exists;
    otherwise it is fetched from Kubernetes, wrapped, and stored in the
    cache.

    Args:
        gs: global state.

    Returns:
        list of wrapped node objects.
        Each element in the list is the result of
        utilities.wrap_object(node, 'Node', ...)

    Raises:
        CollectorError: in case of failure to fetch data from Kubernetes.
        Other exceptions may be raised due to execution errors.
    """
    nodes, timestamp_secs = gs.get_nodes_cache().lookup('')
    if timestamp_secs is not None:
        gs.logger_info('get_nodes() cache hit returns %d nodes', len(nodes))
        return nodes

    nodes = []
    url = '{kubernetes}/nodes'.format(kubernetes=KUBERNETES_API)
    try:
        result = fetch_data(gs, url)
    except Exception:
        # Catch Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not converted into a CollectorError.
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    now = time.time()
    if not (isinstance(result, dict) and 'items' in result):
        msg = 'invalid result when fetching %s' % url
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    for node in result['items']:
        name = utilities.get_attribute(node, ['metadata', 'name'])
        if not utilities.valid_string(name):
            # an invalid node without a valid node ID value.
            continue
        wrapped_node = utilities.wrap_object(
            node,
            'Node',
            name,
            now,
            label=utilities.node_id_to_host_name(name))
        nodes.append(wrapped_node)

    ret_value = gs.get_nodes_cache().update('', nodes, now)
    gs.logger_info('get_nodes() returns %d nodes', len(nodes))
    return ret_value
Ejemplo n.º 4
0
def fetch_data(gs, url, base_name, expect_missing=False):
    """Fetch the named URL from Kubernetes (in production) or a file (in a test).

    The input is always JSON. It is converted to an internal representation
    by this routine.

    Args:
        gs: global state.
        url: the URL to fetch the data from when running in production.
        base_name: fetch the data from the file
            'testdata/' + base_name + '.input.json'
            when running in test mode.
        expect_missing: if True, then do not die in test mode when the test
            file is missing. Just raise ValueError. If False and the test
            file is not found in test mode, raise CollectorError.

    Returns:
        The data after converting it from JSON.

    Raises:
        ValueError: when 'expect_missing' is True and failed to open the file.
        CollectorError: in test mode, if any other exception occurred or
            'expect_missing' is False.
        other exceptions which may be raised by fetching the URL in
            production mode.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert isinstance(url, types.StringTypes)
    assert isinstance(base_name, types.StringTypes)
    if gs.get_testing():
        # Read the data from a file.
        fname = 'testdata/' + base_name + '.input.json'
        try:
            # The 'with' block closes the file even when parsing fails.
            with open(fname, 'r') as f:
                return json.load(f)
        except IOError:
            # File not found
            if expect_missing:
                raise ValueError
            else:
                msg = 'failed to read %s' % fname
                gs.logger_exception(msg)
                raise collector_error.CollectorError(msg)
        except Exception:
            # Includes the ValueError raised for malformed JSON. Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit through.
            msg = 'reading %s failed with exception %s' % (fname,
                                                           sys.exc_info()[0])
            gs.logger_exception(msg)
            raise collector_error.CollectorError(msg)
    else:
        # Send the request to Kubernetes
        return requests.get(url).json()
Ejemplo n.º 5
0
def get_services(gs):
    """Gets the list of services in the current cluster.

    The result is served from the services cache when an entry exists;
    otherwise it is fetched from Kubernetes, wrapped, and stored in the
    cache.

    Args:
        gs: global state.

    Returns:
        list of wrapped service objects.
        Each element in the list is the result of
        utilities.wrap_object(service, 'Service', ...)

    Raises:
        CollectorError: in case of failure to fetch data from Kubernetes.
        Other exceptions may be raised due to execution errors.
    """
    services, timestamp_secs = gs.get_services_cache().lookup('')
    if timestamp_secs is not None:
        app.logger.debug('get_services() cache hit returns %d services',
                         len(services))
        return services

    url = get_kubernetes_base_url() + '/services'
    try:
        result = fetch_data(gs, url)
    except Exception:
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        app.logger.exception(msg)
        raise collector_error.CollectorError(msg)

    now = time.time()
    if not (isinstance(result, dict) and 'items' in result):
        msg = 'invalid result when fetching %s' % url
        # No exception is being handled here, so log with error() rather
        # than exception(), which would append an empty traceback.
        app.logger.error(msg)
        raise collector_error.CollectorError(msg)

    services = []
    for service in result['items']:
        name = utilities.get_attribute(service, ['metadata', 'name'])
        if not utilities.valid_string(name):
            # an invalid service without a valid service ID.
            continue
        services.append(utilities.wrap_object(service, 'Service', name, now))

    ret_value = gs.get_services_cache().update('', services, now)
    app.logger.info('get_services() returns %d services', len(services))
    return ret_value
Ejemplo n.º 6
0
def _container_in_pod(gs, container, pod):
    """Tells whether 'container' belongs to 'pod'.

    The container's parent pod ID is compared with the ID of 'pod'.

    Args:
        gs: global state.
        container: a wrapped container object.
        pod: a wrapped pod object.

    Returns:
        True iff the parent pod ID of 'container' equals pod['id'].

    Raises:
        CollectorError: if no valid parent pod ID can be found for
        'container'.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert utilities.is_wrapped_object(container, 'Container')
    assert utilities.is_wrapped_object(pod, 'Pod')

    owner_pod_id = utilities.get_parent_pod_id(container)
    if utilities.valid_string(owner_pod_id):
        return owner_pod_id == pod['id']

    # Without a parent pod ID the membership question cannot be answered.
    error_text = ('could not find parent pod ID in container %s' %
                  container['id'])
    gs.logger_error(error_text)
    raise collector_error.CollectorError(error_text)
Ejemplo n.º 7
0
def _do_compute_service(gs, cluster_guid, service, g):
    """Adds one service and its relations to the context graph.

    The service is registered as a resource, linked to the cluster with a
    'contains' relation, and linked with a 'loadBalances' relation to every
    pod returned by kubernetes.get_selected_pods() for the service's
    selector. A service without a selector is logged as an error and gets
    no pod relations.

    Args:
        gs: global state.
        cluster_guid: the cluster's ID in the graph. Must be a valid string.
        service: a wrapped service object.
        g: the ContextGraph being populated.

    Raises:
        CollectorError: if the selector is present but is not a dictionary.
        Other exceptions may propagate from the calls made here.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert utilities.valid_string(cluster_guid)
    assert utilities.is_wrapped_object(service, 'Service')
    assert isinstance(g, ContextGraph)

    sid = service['id']
    guid = 'Service:' + sid
    g.add_resource(guid, service['annotations'], 'Service',
                   service['timestamp'], service['properties'])

    # Cluster contains Service.
    g.add_relation(cluster_guid, guid, 'contains')

    # The key/value pairs of properties.spec.selector identify the pods that
    # are load balanced by this service.
    selector = utilities.get_attribute(service,
                                       ['properties', 'spec', 'selector'])
    if not selector:
        gs.logger_error('Service id=%s has no "selector" attribute', sid)
        return

    if not isinstance(selector, types.DictType):
        error_text = 'Service id=%s has an invalid "selector" value' % sid
        gs.logger_error(error_text)
        raise collector_error.CollectorError(error_text)

    for matched_pod in kubernetes.get_selected_pods(gs, selector):
        # Service loadBalances Pod
        g.add_relation(guid, 'Pod:' + matched_pod['id'], 'loadBalances')
Ejemplo n.º 8
0
def _do_compute_rcontroller(gs, cluster_guid, rcontroller, g):
    """Adds one replication controller and its relations to the context graph.

    The replication controller is registered as a resource, linked to the
    cluster with a 'contains' relation, and linked with a 'monitors'
    relation to every pod returned by kubernetes.get_selected_pods() for
    the controller's selector. A controller without a selector is logged as
    an error and gets no pod relations.

    Args:
        gs: global state.
        cluster_guid: the cluster's ID in the graph. Must be a valid string.
        rcontroller: a wrapped replication controller object.
        g: the ContextGraph being populated.

    Raises:
        CollectorError: if the selector is present but is not a dictionary.
        Other exceptions may propagate from the calls made here.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert utilities.valid_string(cluster_guid)
    assert utilities.is_wrapped_object(rcontroller, 'ReplicationController')
    assert isinstance(g, ContextGraph)

    rc_id = rcontroller['id']
    guid = 'ReplicationController:' + rc_id
    g.add_resource(guid, rcontroller['annotations'],
                   'ReplicationController', rcontroller['timestamp'],
                   rcontroller['properties'])

    # Cluster contains Rcontroller
    g.add_relation(cluster_guid, guid, 'contains')

    # The key/value pairs of properties.spec.selector identify the pods that
    # are monitored by this replication controller.
    selector = utilities.get_attribute(rcontroller,
                                       ['properties', 'spec', 'selector'])
    if not selector:
        gs.logger_error('Rcontroller id=%s has no "spec.selector" attribute',
                        rc_id)
        return

    if not isinstance(selector, types.DictType):
        # NOTE(review): the message names "replicaSelector" although the
        # value is read from spec.selector; the text is kept unchanged.
        error_text = (
            'Rcontroller id=%s has an invalid "replicaSelector" value' %
            rc_id)
        gs.logger_error(error_text)
        raise collector_error.CollectorError(error_text)

    for matched_pod in kubernetes.get_selected_pods(gs, selector):
        # Rcontroller monitors Pod
        g.add_relation(guid, 'Pod:' + matched_pod['id'], 'monitors')
Ejemplo n.º 9
0
def get_kubernetes_bearer_token():
    """Returns the bearer token needed to call the Kubernetes master.

    The token is read from the file named by KUBERNETES_BEARER_TOKEN_FILE
    the first time it is needed and kept in the module-level
    KUBERNETES_BEARER_TOKEN for later calls. According to the original
    documentation of this function, the Kubelet installs that file in every
    container of a Kubernetes pod; see
    https://github.com/GoogleCloudPlatform/kubernetes/blob/master/docs/accessing-the-cluster.md.

    Returns:
        The contents of the token file as a string for use in the
        Authorization header as a bearer token:
        'Authorization: Bearer <token>'

    Raises:
        IOError: if cannot open the token file.
        CollectorError: if the file is empty.
    """
    # TODO(eran): add a lock around the global KUBERNETES_BEARER_TOKEN.
    global KUBERNETES_BEARER_TOKEN
    if KUBERNETES_BEARER_TOKEN:
        # Already loaded by an earlier call.
        return KUBERNETES_BEARER_TOKEN

    with open(KUBERNETES_BEARER_TOKEN_FILE, 'r') as source:
        KUBERNETES_BEARER_TOKEN = source.read()

    if KUBERNETES_BEARER_TOKEN:
        return KUBERNETES_BEARER_TOKEN

    raise collector_error.CollectorError(
        'Cannot read Kubernetes bearer token from %s' %
        KUBERNETES_BEARER_TOKEN_FILE)
Ejemplo n.º 10
0
def get_pods(gs):
    """Gets the list of all pods in the cluster.

    The result is served from the pods cache when an entry exists;
    otherwise it is fetched from Kubernetes, wrapped, and stored in the
    cache.

    Args:
        gs: global state.

    Returns:
        list of wrapped pod objects.
        Each element in the list is the result of
        utilities.wrap_object(pod, 'Pod', ...)

    Raises:
        CollectorError: in case of failure to fetch data from Kubernetes.
        Other exceptions may be raised due to execution errors.
    """
    pods, timestamp_secs = gs.get_pods_cache().lookup('')
    if timestamp_secs is not None:
        app.logger.debug('get_pods() cache hit returns %d pods', len(pods))
        return pods

    url = get_kubernetes_base_url() + '/pods'
    try:
        result = fetch_data(gs, url)
    except Exception:
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        app.logger.exception(msg)
        raise collector_error.CollectorError(msg)

    now = time.time()
    if not (isinstance(result, dict) and 'items' in result):
        msg = 'invalid result when fetching %s' % url
        # No exception is being handled here, so log with error() rather
        # than exception(), which would append an empty traceback.
        app.logger.error(msg)
        raise collector_error.CollectorError(msg)

    pods = []
    for pod in result['items']:
        name = utilities.get_attribute(pod, ['metadata', 'name'])
        if not utilities.valid_string(name):
            # an invalid pod without a valid pod ID value.
            continue
        wrapped_pod = utilities.wrap_object(pod, 'Pod', name, now)
        pods.append(wrapped_pod)

    ret_value = gs.get_pods_cache().update('', pods, now)
    app.logger.info('get_pods() returns %d pods', len(pods))
    return ret_value
Ejemplo n.º 11
0
def invalid_processes(gs, url):
    """Reports invalid process information and aborts with CollectorError.

    The same message is logged as an error through 'gs' and carried by the
    raised exception.

    Args:
        gs: global state.
        url: the URL that the invalid data came from.

    Raises:
        CollectorError: always.
    """
    complaint = 'process information from URL %s is invalid' % url
    gs.logger_error(complaint)
    raise collector_error.CollectorError(complaint)
Ejemplo n.º 12
0
def _inspect_container(gs, docker_host, container_id):
    """Fetch detailed information about the given container in the given host.

    Args:
        gs: global state.
        docker_host: Docker host name. Must not be empty.
        container_id: container ID. Must not be empty.

    Returns:
        (container_information, timestamp_in_seconds) if the container was
        found.
        (None, timestamp_in_seconds) if fetch_data() raised ValueError, which
        is treated as "the container does not exist anymore".

    Raises:
        CollectorError in case of failure to fetch data from Docker.
        Other exceptions may be raised due to execution errors.
    """
    url = 'http://{docker_host}:{port}/containers/{container_id}/json'.format(
        docker_host=docker_host,
        port=gs.get_docker_port(),
        container_id=container_id)
    # A typical value of 'docker_host' is:
    # k8s-guestbook-node-3.c.rising-apricot-840.internal
    # Use only the first period-separated element for the test file name.
    # The typical value of 'container_id' is:
    # k8s_php-redis.b317029a_guestbook-controller-ls6k1.default.api_f991d53e-b949-11e4-8246-42010af0c3dd_8dcdfec8
    # Use just the tail of the container ID after the last '_' sign.
    fname = '{host}-container-{id}'.format(host=docker_host.split('.')[0],
                                           id=container_id.split('_')[-1])
    try:
        result = fetch_data(gs, url, fname, expect_missing=True)
    except ValueError:
        # TODO(vasbala): this container does not exist anymore.
        # What should we do here?
        return (None, time.time())
    except collector_error.CollectorError:
        raise
    except Exception:
        # Catch Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not converted into a CollectorError.
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    # Sort the "Env" attribute because it tends to contain elements in
    # a different order each time you fetch the container information.
    if isinstance(utilities.get_attribute(result, ['Config', 'Env']), list):
        # Sort the contents of the 'Env' list in place.
        result['Config']['Env'].sort()

    return (result, time.time())
Ejemplo n.º 13
0
    def dump(self, gs, output_format):
        """Renders the context graph in the requested format.

        Args:
            gs: global state.
            output_format: 'dot', 'context_graph' or 'resources'.

        Returns:
            The value produced by to_dot_graph(), to_context_graph() or
            to_context_resources(), depending on 'output_format'.

        Raises:
            CollectorError: if 'output_format' is none of the names above.
        """
        assert isinstance(gs, global_state.GlobalState)
        assert isinstance(output_format, types.StringTypes)

        # Maps each supported format to the name of the method rendering it.
        renderer_names = {
            'dot': 'to_dot_graph',
            'context_graph': 'to_context_graph',
            'resources': 'to_context_resources',
        }
        if output_format in renderer_names:
            return getattr(self, renderer_names[output_format])()

        error_text = 'invalid dump() output_format: %s' % output_format
        gs.logger_error(error_text)
        raise collector_error.CollectorError(error_text)
Ejemplo n.º 14
0
  def dump(self, output_format):
    """Renders the context graph in the requested format.

    Before rendering, the resources are sorted in place by 'id' and the
    relations are sorted in place by ('source', 'target').

    Args:
      output_format: 'dot', 'context_graph' or 'resources'.

    Returns:
      The value produced by to_dot_graph(), to_context_graph() or
      to_context_resources(), depending on 'output_format'.

    Raises:
      CollectorError: if 'output_format' is none of the names above.
    """
    assert isinstance(output_format, types.StringTypes)

    self._context_resources.sort(key=lambda resource: resource['id'])
    self._context_relations.sort(
        key=lambda relation: (relation['source'], relation['target']))

    if output_format == 'dot':
      return self.to_dot_graph()
    if output_format == 'context_graph':
      return self.to_context_graph()
    if output_format == 'resources':
      return self.to_context_resources()

    error_text = 'invalid dump() output_format: %s' % output_format
    app.logger.error(error_text)
    raise collector_error.CollectorError(error_text)
Ejemplo n.º 15
0
def _do_compute_pod(gs, input_queue, node_guid, pod, g):
    """Adds one pod to the context graph and schedules its containers.

    The pod is registered as a resource and linked to its node with a
    'runs' relation. Every container on the pod's Docker host that belongs
    to the pod is then handed to _do_compute_container(): directly in test
    mode, through 'input_queue' otherwise.

    Args:
        gs: global state.
        input_queue: the priority queue feeding the worker threads.
        node_guid: the ID of the pod's node in the graph.
        pod: a wrapped pod object.
        g: the ContextGraph being populated.

    Raises:
        CollectorError: if the pod has no valid properties.spec.host value.
        Other exceptions may propagate from the calls made here.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert isinstance(input_queue, Queue.PriorityQueue)
    assert utilities.valid_string(node_guid)
    assert utilities.is_wrapped_object(pod, 'Pod')
    assert isinstance(g, ContextGraph)

    pod_id = pod['id']
    docker_host = utilities.get_attribute(pod, ['properties', 'spec', 'host'])
    if not utilities.valid_string(docker_host):
        error_text = ('Docker host (pod.properties.spec.host) '
                      'not found in pod ID %s' % pod_id)
        gs.logger_error(error_text)
        raise collector_error.CollectorError(error_text)

    pod_guid = 'Pod:' + pod_id
    g.add_resource(pod_guid, pod['annotations'], 'Pod', pod['timestamp'],
                   pod['properties'])
    # Node runs Pod
    g.add_relation(node_guid, pod_guid, 'runs')

    # Containers in a Pod
    for candidate in docker.get_containers_with_metrics(gs, docker_host):
        if _container_in_pod(gs, candidate, pod):
            if gs.get_testing():
                # Do not compute the containers by worker threads in test
                # mode because the order of the output will be different
                # than the golden files due to the effects of queuing the
                # work.
                _do_compute_container(gs, docker_host, pod_guid, candidate, g)
            else:
                work_item = (gs.get_random_priority(), _do_compute_container, {
                    'gs': gs,
                    'docker_host': docker_host,
                    'pod_guid': pod_guid,
                    'container': candidate,
                    'g': g
                })
                input_queue.put(work_item)
Ejemplo n.º 16
0
def get_selected_pods(gs, selector):
    """Gets the list of pods in the current cluster matching 'selector'.

    Every pod returned by get_pods() is tested with
    matching_labels(pod, selector); only the pods that pass are returned.

    Args:
        gs: global state.
        selector: a dictionary of key/value pairs describing the labels of
            the matching pods.

    Returns:
        list of wrapped pod objects.
        Each element in the list is the result of
        utilities.wrap_object(pod, 'Pod', ...)

    Raises:
        CollectorError: in case of failure to fetch data from Kubernetes.
        Other exceptions may be raised due to execution errors.
    """
    try:
        all_pods = get_pods(gs)
    except collector_error.CollectorError:
        raise
    except Exception:
        # Catch Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not converted into a CollectorError.
        msg = 'get_pods() failed with exception %s' % sys.exc_info()[0]
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    # select the pods with the matching labels.
    pods = [pod for pod in all_pods if matching_labels(pod, selector)]

    gs.logger_info('get_selected_pods(labels=%s) returns %d pods',
                   str(selector), len(pods))
    return pods
Ejemplo n.º 17
0
def _do_compute_graph(gs, input_queue, output_queue, output_format):
    """Builds the context graph and returns it in the specified format.

    The cluster resource is added directly. The nodes, services and
    replication controllers are queued as work items for the worker
    threads, and the graph is dumped once the queue has been drained.

    Args:
        gs: the global state.
        input_queue: the input queue for the worker threads.
        output_queue: output queue containing exceptions data from the worker
            threads.
        output_format: the format name handed to ContextGraph.dump().

    Returns:
        A successful response in the specified format.

    Raises:
        CollectorError: inconsistent or invalid graph data, or a failure
        reported by a worker thread through 'output_queue'.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert isinstance(input_queue, Queue.PriorityQueue)
    assert isinstance(output_queue, Queue.Queue)
    assert utilities.valid_string(output_format)

    graph = ContextGraph()
    graph.set_version(docker.get_version(gs))
    graph.set_metadata({'timestamp': utilities.now()})
    graph.set_relations_to_timestamps(gs.get_relations_to_timestamps())

    # Nodes
    all_nodes = kubernetes.get_nodes_with_metrics(gs)
    if not all_nodes:
        return graph.dump(gs, output_format)

    # The cluster's timestamp is the timestamp of its oldest node.
    cluster_timestamp = utilities.now()
    for node in all_nodes:
        assert utilities.is_wrapped_object(node, 'Node')
        # Compared explicitly; the original author deliberately avoided
        # min() for these timestamp values.
        if node['timestamp'] < cluster_timestamp:
            cluster_timestamp = node['timestamp']

    # Get the cluster name from the first node.
    # The cluster name is an approximation. It is not a big deal if it
    # is incorrect, since the aggregator knows the cluster name.
    cluster_name = utilities.node_id_to_cluster_name(all_nodes[0]['id'])
    cluster_guid = 'Cluster:' + cluster_name
    graph.set_title(cluster_name)
    graph.add_resource(cluster_guid, {'label': cluster_name}, 'Cluster',
                       cluster_timestamp, {})

    def submit(worker, **arguments):
        # Queues 'worker' with its keyword arguments at a random priority.
        input_queue.put((gs.get_random_priority(), worker, arguments))

    # Nodes
    for node in all_nodes:
        submit(_do_compute_node,
               gs=gs,
               input_queue=input_queue,
               cluster_guid=cluster_guid,
               node=node,
               g=graph)

    # Services
    for service in kubernetes.get_services(gs):
        submit(_do_compute_service,
               gs=gs,
               cluster_guid=cluster_guid,
               service=service,
               g=graph)

    # ReplicationControllers
    for rcontroller in kubernetes.get_rcontrollers(gs):
        submit(_do_compute_rcontroller,
               gs=gs,
               cluster_guid=cluster_guid,
               rcontroller=rcontroller,
               g=graph)

    # Wait until worker threads finished processing all outstanding requests.
    # Once we return from the join(), all output was generated already.
    input_queue.join()

    # Convert any exception caught by the worker threads to an exception
    # raised by the current thread.
    if not output_queue.empty():
        failure = output_queue.get_nowait()  # should not fail.
        gs.logger_error(failure)
        raise collector_error.CollectorError(failure)

    # Keep the relations_to_timestamps mapping for next call.
    gs.set_relations_to_timestamps(graph.get_relations_to_timestamps())

    # Dump the resulting graph
    return graph.dump(gs, output_format)
Ejemplo n.º 18
0
def get_containers(gs, docker_host):
    """Gets the list of all containers in 'docker_host'.

    The result is served from the containers cache (keyed by 'docker_host')
    when an entry exists. Otherwise the container list is fetched from
    Docker, every container is inspected and wrapped, and the list is stored
    in the cache with the oldest container timestamp (or the current time
    when the list is empty).

    Args:
        gs: global state.
        docker_host: the Docker host running the containers.

    Returns:
        list of wrapped container objects.
        Each element in the list is the result of
        utilities.wrap_object(container, 'Container', ...)

    Raises:
        CollectorError: in case of failure to fetch data from Docker.
        Other exceptions may be raised due to execution errors.
    """
    containers, timestamp = gs.get_containers_cache().lookup(docker_host)
    if timestamp is not None:
        gs.logger_info(
            'get_containers(docker_host=%s) cache hit returns '
            '%d containers', docker_host, len(containers))
        return containers

    url = 'http://{docker_host}:{port}/containers/json'.format(
        docker_host=docker_host, port=gs.get_docker_port())
    # A typical value of 'docker_host' is:
    # k8s-guestbook-node-3.c.rising-apricot-840.internal
    # Use only the first period-separated element for the test file name.
    fname = '{host}-containers'.format(host=docker_host.split('.')[0])
    try:
        containers_list = fetch_data(gs, url, fname)
    except collector_error.CollectorError:
        raise
    except Exception:
        # Catch Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not converted into a CollectorError.
        msg = ('fetching %s or %s failed with exception %s' %
               (url, fname, sys.exc_info()[0]))
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    if not isinstance(containers_list, list):
        msg = 'invalid response from fetching %s' % url
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    containers = []
    timestamps = []
    for container_info in containers_list:
        # NOTE: container 'Name' is stable across container re-starts whereas
        # container 'Id' is not.
        # This may be because Kubernetes assigns the Name while Docker assigns
        # the Id (?)
        # The container Name is the only element of the array 'Names' -
        # why is Names an array here?
        # skip the leading / in the Name
        if not (isinstance(container_info.get('Names'), list)
                and container_info['Names']
                and utilities.valid_string(container_info['Names'][0])
                and container_info['Names'][0][0] == '/'):
            msg = 'invalid containers data format. docker_host=%s' % docker_host
            gs.logger_error(msg)
            raise collector_error.CollectorError(msg)

        container_id = container_info['Names'][0][1:]
        container, ts = _inspect_container(gs, docker_host, container_id)
        if container is None:
            # The container could not be inspected; leave it out.
            continue

        if not utilities.valid_string(container.get('Name')):
            msg = ('missing or invalid Name attribute in container %s' %
                   container_id)
            gs.logger_error(msg)
            raise collector_error.CollectorError(msg)

        # The inspected container must carry the same name as the list entry.
        if container['Name'] != ('/' + container_id):
            msg = ('container %s\'s Name attribute is "%s"; expecting "%s"' %
                   (container_id, container['Name'], '/' + container_id))
            gs.logger_error(msg)
            raise collector_error.CollectorError(msg)

        short_hex_id = utilities.object_to_hex_id(container)
        if short_hex_id is None:
            msg = 'Could not compute short hex ID of container %s' % container_id
            gs.logger_error(msg)
            raise collector_error.CollectorError(msg)

        wrapped_container = utilities.wrap_object(container,
                                                  'Container',
                                                  container_id,
                                                  ts,
                                                  label=short_hex_id)
        containers.append(wrapped_container)
        timestamps.append(ts)

        # Modify the container's label after the wrapped container was added
        # to the containers list.
        # Compute the container's short name to create a better container label:
        # short_container_name/short_hex_id.
        # For example: "cassandra/d85b599c17d8".
        parent_pod_id = utilities.get_parent_pod_id(wrapped_container)
        if parent_pod_id is None:
            continue
        parent_pod = kubernetes.get_one_pod(gs, docker_host, parent_pod_id)
        if parent_pod is None:
            continue
        short_container_name = utilities.get_short_container_name(
            wrapped_container, parent_pod)
        if not utilities.valid_string(short_container_name):
            continue
        wrapped_container['annotations']['label'] = (short_container_name +
                                                     '/' + short_hex_id)

    ret_value = gs.get_containers_cache().update(
        docker_host, containers,
        min(timestamps) if timestamps else time.time())
    gs.logger_info('get_containers(docker_host=%s) returns %d containers',
                   docker_host, len(containers))
    return ret_value
Ejemplo n.º 19
0
def get_containers_with_metrics(gs, docker_host):
    """Gets the containers in 'docker_host', annotated with metrics.

    Each container returned by get_containers() is passed, together with
    its parent pod, to metrics.annotate_container().

    Args:
        gs: global state.
        docker_host: the Docker host running the containers.

    Returns:
        list of wrapped container objects.
        Each element in the list is the result of
        utilities.wrap_object(container, 'Container', ...)
        The list is empty when the host has no containers.

    Raises:
        CollectorError: in case of failure to fetch data from Docker, when
        the host has containers but no pods, or when a container's parent
        pod cannot be determined or located.
        Other exceptions may be raised due to execution errors.
    """
    containers_list = get_containers(gs, docker_host)
    if not containers_list:
        return []

    # Build a lookup table from pod IDs to pods; it is needed to find the
    # parent pod of each container. The project ID is taken from the first
    # pod that has a valid host name.
    pods_by_id = {}
    project_id = '_unknown_'
    for pod in kubernetes.get_pods(gs, docker_host):
        assert utilities.is_wrapped_object(pod, 'Pod')
        pods_by_id[pod['id']] = pod
        if project_id == '_unknown_':
            host_name = utilities.get_attribute(
                pod, ['properties', 'spec', 'host'])
            if utilities.valid_string(host_name):
                project_id = utilities.node_id_to_project_id(host_name)

    # We know that there are containers in this docker_host.
    if not pods_by_id:
        # there are no pods in this docker_host.
        error_text = 'Docker host %s has containers but no pods' % docker_host
        gs.logger_exception(error_text)
        raise collector_error.CollectorError(error_text)

    # Annotate the containers with their metrics.
    for container in containers_list:
        assert utilities.is_wrapped_object(container, 'Container')

        owner_pod_id = utilities.get_parent_pod_id(container)
        if not utilities.valid_string(owner_pod_id):
            error_text = ('missing or invalid parent pod ID in container %s' %
                          container['id'])
            gs.logger_error(error_text)
            raise collector_error.CollectorError(error_text)

        if owner_pod_id not in pods_by_id:
            error_text = ('could not locate parent pod %s for container %s' %
                          (owner_pod_id, container['id']))
            gs.logger_error(error_text)
            raise collector_error.CollectorError(error_text)

        # Note that the project ID may be '_unknown_'.
        # This is not a big deal, because the aggregator knows the project ID.
        owner_pod = pods_by_id[owner_pod_id]
        metrics.annotate_container(project_id, container, owner_pod)

    return containers_list
Ejemplo n.º 20
0
def get_processes(gs, docker_host, container_id):
    """Gets the list of all processes in the 'docker_host' and 'container_id'.

    The result is served from the processes cache when an entry keyed by
    '<docker_host>/<container_id>' exists. Otherwise the process table is
    fetched from the Docker "top" endpoint of 'docker_host', every row is
    wrapped as a 'Process' object, and the list is stored in the cache.

    If the container is not found, returns an empty list of processes.

    Args:
      gs: global state.
      docker_host: the Docker host running the container.
      container_id: the container running the processes.

    Returns:
      list of wrapped process objects.
      Each element in the list is the result of
      utilities.wrap_object(process, 'Process', ...)

    Raises:
      CollectorError: in case of failure to fetch data from Docker.
      Other exceptions may be raised due to execution errors.
    """
    processes_label = '%s/%s' % (docker_host, container_id)
    processes, timestamp_secs = gs.get_processes_cache().lookup(
        processes_label)
    if timestamp_secs is not None:
        gs.logger_info(
            'get_processes(docker_host=%s, container_id=%s) cache hit',
            docker_host, container_id)
        return processes

    container = get_one_container(gs, docker_host, container_id)
    if container is None:
        # Parent container not found. Container might have crashed while we
        # were looking for it.
        return []

    assert utilities.is_wrapped_object(container, 'Container')
    container_short_hex_id = utilities.object_to_hex_id(
        container['properties'])
    assert utilities.valid_string(container_short_hex_id)

    # NOTE: there is no trailing /json in this URL - this looks like a bug in
    # the Docker API
    url = ('http://{docker_host}:{port}/containers/{container_id}/top?'
           'ps_args=aux'.format(docker_host=docker_host,
                                port=gs.get_docker_port(),
                                container_id=container_id))
    # A typical value of 'docker_host' is:
    # k8s-guestbook-node-3.c.rising-apricot-840.internal
    # Use only the first period-separated element for the test file name.
    # The typical value of 'container_id' is:
    # k8s_php-redis.b317029a_guestbook-controller-ls6k1.default.api_f991d53e-b949-11e4-8246-42010af0c3dd_8dcdfec8
    # Use just the tail of the container ID after the last '_' sign.
    fname = '{host}-processes-{id}'.format(host=docker_host.split('.')[0],
                                           id=container_id.split('_')[-1])

    try:
        # TODO(vasbala): what should we do in cases where the container is gone
        # (and replaced by a different one)?
        result = fetch_data(gs, url, fname, expect_missing=True)
    except ValueError:
        # this container does not exist anymore
        return []
    except collector_error.CollectorError:
        raise
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit are not converted into a CollectorError.
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    # The reply must carry a list of column titles and a list of rows.
    # NOTE(review): invalid_processes() is presumably expected to raise;
    # confirm against its definition.
    if not isinstance(utilities.get_attribute(result, ['Titles']), list):
        invalid_processes(gs, url)
    if not isinstance(utilities.get_attribute(result, ['Processes']), list):
        invalid_processes(gs, url)

    pstats = result['Titles']
    processes = []
    now = time.time()
    for pvalues in result['Processes']:
        if not isinstance(pvalues, list):
            invalid_processes(gs, url)
        if len(pstats) != len(pvalues):
            invalid_processes(gs, url)
        # Map every column title to the value of that column in this row.
        process = dict(zip(pstats, pvalues))
        if 'PID' not in process:
            # The PID is needed for the object ID and label below; report a
            # reply without it as malformed instead of failing with KeyError.
            invalid_processes(gs, url)

        # Prefix with container Id to ensure uniqueness across the whole graph.
        process_id = '%s/%s' % (container_short_hex_id, process['PID'])
        processes.append(
            utilities.wrap_object(process,
                                  'Process',
                                  process_id,
                                  now,
                                  label=process['PID']))

    ret_value = gs.get_processes_cache().update(processes_label, processes,
                                                now)
    gs.logger_info(
        'get_processes(docker_host=%s, container_id=%s) returns %d processes',
        docker_host, container_id, len(processes))
    return ret_value
Ejemplo n.º 21
0
import docker_proxy

if __name__ == '__main__':
  # Script entry point: declare the command-line flags, then start the
  # component selected by the CLUSTER_INSIGHT_MODE environment variable.
  # NOTE(review): the parser is only constructed in this block; parse_args()
  # is never called on it here -- confirm whether the flags are parsed
  # elsewhere.
  parser = argparse.ArgumentParser(
      description='Cluster-Insight data collector')
  parser.add_argument(
      '-d', '--debug',
      action='store_true',
      help='enable debug mode')
  parser.add_argument(
      '-p', '--port',
      action='store',
      type=int,
      default=constants.DATA_COLLECTOR_PORT,
      help=('data collector port number [default=%d]' %
            constants.DATA_COLLECTOR_PORT))
  parser.add_argument(
      '--docker_port',
      action='store',
      type=int,
      default=constants.DOCKER_PORT,
      help=('Docker port number [default=%d]' % constants.DOCKER_PORT))
  parser.add_argument(
      '-w', '--workers',
      action='store',
      type=int,
      default=0,
      help=('number of concurrent workers. '
            'A zero or a negative value denotes an automatic calculation '
            'of this number. [default=0]'))

  # The environment variable decides which component this process runs.
  mode = os.environ.get('CLUSTER_INSIGHT_MODE')

  if mode == constants.MODE_MASTER:
    collector.main()
  elif mode == constants.MODE_MINION:
    docker_proxy.main()
  else:
    # Unset or unrecognized mode: fail, listing the accepted values.
    raise collector_error.CollectorError(
        'CLUSTER_INSIGHT_MODE environment variable is %s. '
        'Valid values are %s or %s' %
        (mode, constants.MODE_MINION, constants.MODE_MASTER))
Ejemplo n.º 22
0
def get_image(gs, docker_host, container):
    """Gets the information of the given image in the given host.

    The image is identified by the 'Image' property of 'container'. The
    images cache is consulted under the key '<docker_host>|<image_id>' before
    Docker is queried; a freshly fetched image is wrapped and cached.

    Args:
      gs: global state.
      docker_host: Docker host name. Must not be empty.
      container: the container which runs the image.

    Returns:
      If image was found, returns the wrapped image object, which is the
      result of utilities.wrap_object(image, 'Image', ...)
      If the image was not found, returns None.

    Raises:
      CollectorError: in case of failure to fetch data from Docker, or when
        the fetched image data is malformed.
      ValueError: in case the container does not contain a valid image ID or
        a valid symbolic image name.
      Other exceptions may be raised due to execution errors.
    """
    assert utilities.is_wrapped_object(container, 'Container')
    # The 'image_id' should be a long hexadecimal string.
    image_id = utilities.get_attribute(container, ['properties', 'Image'])
    if not utilities.valid_hex_id(image_id):
        msg = 'missing or invalid image ID in container ID=%s' % container['id']
        gs.logger_error(msg)
        raise ValueError(msg)

    # The 'image_name' should be a symbolic name (not a hexadecimal string).
    image_name = utilities.get_attribute(container,
                                         ['properties', 'Config', 'Image'])

    if ((not utilities.valid_string(image_name))
            or utilities.valid_hex_id(image_name)):
        msg = 'missing or invalid image name in container ID=%s' % container[
            'id']
        gs.logger_error(msg)
        raise ValueError(msg)

    cache_key = '%s|%s' % (docker_host, image_id)
    image, timestamp_secs = gs.get_images_cache().lookup(cache_key)
    if timestamp_secs is not None:
        gs.logger_info('get_image(docker_host=%s, image_id=%s) cache hit',
                       docker_host, image_id)
        return image

    url = 'http://{docker_host}:{port}/images/{image_id}/json'.format(
        docker_host=docker_host, port=gs.get_docker_port(), image_id=image_id)
    # A typical value of 'docker_host' is:
    # k8s-guestbook-node-3.c.rising-apricot-840.internal
    # Use only the first period-separated element for the test file name.
    # The typical value of 'image_name' is:
    # brendanburns/php-redis
    # We convert embedded '/' and ':' characters to '-' to avoid interference
    # with the directory structure or file system.
    fname = '{host}-image-{id}'.format(host=docker_host.split('.')[0],
                                       id=image_name.replace('/', '-').replace(
                                           ':', '-'))

    try:
        image = fetch_data(gs, url, fname, expect_missing=True)
    except ValueError:
        # image not found.
        msg = 'image not found for image_id: %s' % image_id
        gs.logger_info(msg)
        return None
    except collector_error.CollectorError:
        raise
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit are not converted into a CollectorError.
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    if not isinstance(image, dict):
        # The attribute lookups below need a dictionary; report any other
        # payload as a collector failure instead of an AttributeError.
        msg = 'invalid result when fetching %s' % url
        gs.logger_error(msg)
        raise collector_error.CollectorError(msg)

    now = time.time()
    # compute the two labels of the image.
    # The first is a 12-digit hexadecimal number shown by "docker images".
    # The second is the symbolic name of the image.
    full_hex_label = image.get('Id')
    if not (isinstance(full_hex_label, types.StringTypes) and full_hex_label):
        msg = 'Image id=%s has an invalid "Id" attribute value' % image_id
        gs.logger_error(msg)
        raise collector_error.CollectorError(msg)

    short_hex_label = utilities.object_to_hex_id(image)
    if short_hex_label is None:
        msg = 'Could not compute short hex ID of image %s' % image_id
        gs.logger_error(msg)
        raise collector_error.CollectorError(msg)

    wrapped_image = utilities.wrap_object(image,
                                          'Image',
                                          full_hex_label,
                                          now,
                                          label=short_hex_label,
                                          alt_label=image_name)

    ret_value = gs.get_images_cache().update(cache_key, wrapped_image, now)
    gs.logger_info('get_image(docker_host=%s, image_id=%s, image_name=%s)',
                   docker_host, image_id, image_name)
    return ret_value
Ejemplo n.º 23
0
def get_pods(gs, node_id=None):
    """Gets the list of all pods in the given node or in the cluster.

    When 'node_id' is None, it returns the list of pods in the cluster.
    When 'node_id' is a non-empty string, it returns the list of pods in that
    node.

    The result is served from the pods cache when an entry exists under the
    node ID (or under '' for the whole cluster); otherwise the pods are
    fetched from Kubernetes, wrapped, filtered by node and cached.

    Args:
      gs: global state.
      node_id: the parent node of the pods or None.

    Returns:
      list of wrapped pod objects.
      Each element in the list is the result of
      utilities.wrap_object(pod, 'Pod', ...)

    Raises:
      CollectorError: in case of failure to fetch data from Kubernetes.
      Other exceptions may be raised due to execution errors.
    """
    pods_label = '' if node_id is None else node_id
    pods, timestamp_secs = gs.get_pods_cache().lookup(pods_label)
    if timestamp_secs is not None:
        gs.logger_info('get_pods(pods_label=%s) cache hit returns %d pods',
                       pods_label, len(pods))
        return pods

    url = '{kubernetes}/pods'.format(kubernetes=KUBERNETES_API)
    try:
        result = fetch_data(gs, url)
    except collector_error.CollectorError:
        # Already the documented failure type; pass it through unchanged.
        raise
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit are not converted into a CollectorError.
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    now = time.time()
    if not (isinstance(result, dict) and 'items' in result):
        msg = 'invalid result when fetching %s' % url
        # No exception is being handled here, so log a plain error.
        gs.logger_error(msg)
        raise collector_error.CollectorError(msg)

    pods = []
    for pod in result['items']:
        name = utilities.get_attribute(pod, ['metadata', 'name'])
        if not utilities.valid_string(name):
            # an invalid pod without a valid pod ID value.
            continue
        wrapped_pod = utilities.wrap_object(pod, 'Pod', name, now)
        if node_id:
            # pod['spec']['host'] may be missing if the pod is in "Waiting"
            # status.
            if utilities.get_attribute(pod, ['spec', 'host']) == node_id:
                pods.append(wrapped_pod)
        else:
            # append pod to output if 'node_id' is not specified.
            pods.append(wrapped_pod)

    ret_value = gs.get_pods_cache().update(pods_label, pods, now)
    gs.logger_info('get_pods(node_id=%s) returns %d pods', pods_label,
                   len(pods))
    return ret_value