Example #1
class LockCollector(collector.CaptureSamplerCollector):
    """Record lock usage."""

    nframes = attr.ib(
        factory=attr_utils.from_env("DD_PROFILING_MAX_FRAMES", 64, int))
    endpoint_collection_enabled = attr.ib(factory=attr_utils.from_env(
        "DD_PROFILING_ENDPOINT_COLLECTION_ENABLED", True, formats.asbool))
    tracer = attr.ib(default=None)

    _original = attr.ib(init=False, repr=False, type=typing.Any, cmp=False)

    @abc.abstractmethod
    def _get_original(self):
        # type: (...) -> typing.Any
        pass

    @abc.abstractmethod
    def _set_original(
            self,
            value  # type: typing.Any
    ):
        # type: (...) -> None
        pass

    def _start_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Start collecting lock usage."""
        self.patch()
        super(LockCollector, self)._start_service()

    def _stop_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Stop collecting lock usage."""
        super(LockCollector, self)._stop_service()
        self.unpatch()

    def patch(self):
        # type: (...) -> None
        """Patch the module for tracking lock allocation."""
        # We only patch the lock from the `threading` module.
        # Nobody should use locks from `_thread`; if they do so, then it's deliberate and we don't profile.
        self.original = self._get_original()

        def _allocate_lock(wrapped, instance, args, kwargs):
            lock = wrapped(*args, **kwargs)
            return self.PROFILED_LOCK_CLASS(lock, self.recorder, self.tracer,
                                            self.nframes,
                                            self._capture_sampler,
                                            self.endpoint_collection_enabled)

        self._set_original(FunctionWrapper(self.original, _allocate_lock))

    def unpatch(self):
        # type: (...) -> None
        """Unpatch the threading module for tracking lock allocation."""
        self._set_original(self.original)
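
The abstract `_get_original`/`_set_original` hooks let a subclass choose which attribute actually gets swapped. A minimal sketch of a concrete subclass targeting `threading.Lock` might look like the following; the `_ProfiledThreadingLock` wrapper class is an assumption here, not something shown above.

import threading


class ThreadingLockCollector(LockCollector):
    """Sketch: record usage of threading.Lock objects."""

    # Assumed proxy class wrapping the raw lock; LockCollector.patch() above
    # instantiates it via self.PROFILED_LOCK_CLASS.
    PROFILED_LOCK_CLASS = _ProfiledThreadingLock

    def _get_original(self):
        # type: (...) -> typing.Any
        return threading.Lock

    def _set_original(
            self,
            value  # type: typing.Any
    ):
        # type: (...) -> None
        threading.Lock = value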
Example #2
class LockCollector(collector.CaptureSamplerCollector):
    """Record lock usage."""

    nframes = attr.ib(
        factory=attr_utils.from_env("DD_PROFILING_MAX_FRAMES", 64, int))
    endpoint_collection_enabled = attr.ib(factory=attr_utils.from_env(
        "DD_PROFILING_ENDPOINT_COLLECTION_ENABLED", True, formats.asbool))

    tracer = attr.ib(default=None)

    def _start_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Start collecting `threading.Lock` usage."""
        self.patch()
        super(LockCollector, self)._start_service()

    def _stop_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Stop collecting `threading.Lock` usage."""
        super(LockCollector, self)._stop_service()
        self.unpatch()

    def patch(self):
        # type: (...) -> None
        """Patch the threading module for tracking lock allocation."""
        # We only patch the lock from the `threading` module.
        # Nobody should use locks from `_thread`; if they do so, then it's deliberate and we don't profile.
        self.original = threading.Lock

        def _allocate_lock(wrapped, instance, args, kwargs):
            lock = wrapped(*args, **kwargs)
            return _ProfiledLock(lock, self.recorder, self.tracer,
                                 self.nframes, self._capture_sampler,
                                 self.endpoint_collection_enabled)

        threading.Lock = FunctionWrapper(self.original,
                                         _allocate_lock)  # type: ignore[misc]

    def unpatch(self):
        # type: (...) -> None
        """Unpatch the threading module for tracking lock allocation."""
        threading.Lock = self.original  # type: ignore[misc]
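
A standalone sketch of the same `wrapt.FunctionWrapper` patching pattern outside the profiler may make the flow clearer: wrap `threading.Lock`, observe each allocation, then restore the original, mirroring `patch()` and `unpatch()`. The `allocations` list is purely illustrative.

import threading

import wrapt

allocations = []                 # illustrative sink; the real collector pushes events to a Recorder
_original_lock = threading.Lock


def _allocate_lock(wrapped, instance, args, kwargs):
    # Called for every threading.Lock() allocation while the patch is active.
    allocations.append(threading.current_thread().name)
    return wrapped(*args, **kwargs)


threading.Lock = wrapt.FunctionWrapper(_original_lock, _allocate_lock)  # patch()
try:
    with threading.Lock():       # goes through _allocate_lock
        pass
finally:
    threading.Lock = _original_lock  # unpatch(): always restore the original

assert len(allocations) == 1     # one entry per lock allocated while patched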
Example #3
class Scheduler(periodic.PeriodicService):
    """Schedule export of recorded data."""

    recorder = attr.ib()
    exporters = attr.ib()
    before_flush = attr.ib(default=None, eq=False)
    _interval = attr.ib(factory=attr_utils.from_env("DD_PROFILING_UPLOAD_INTERVAL", 60.0, float))
    _configured_interval = attr.ib(init=False)
    _last_export = attr.ib(init=False, default=None, eq=False)

    def __attrs_post_init__(self):
        # Copy the value to use it later since we're going to adjust the real interval
        self._configured_interval = self.interval

    def _start_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Start the scheduler."""
        LOG.debug("Starting scheduler")
        super(Scheduler, self)._start_service()
        self._last_export = compat.time_ns()
        LOG.debug("Scheduler started")

    def flush(self):
        """Flush events from recorder to exporters."""
        LOG.debug("Flushing events")
        if self.before_flush is not None:
            try:
                self.before_flush()
            except Exception:
                LOG.error("Scheduler before_flush hook failed", exc_info=True)
        if self.exporters:
            events = self.recorder.reset()
            start = self._last_export
            self._last_export = compat.time_ns()
            for exp in self.exporters:
                try:
                    exp.export(events, start, self._last_export)
                except exporter.ExportError as e:
                    LOG.error("Unable to export profile: %s. Ignoring.", _traceback.format_exception(e))
                except Exception:
                    LOG.exception(
                        "Unexpected error while exporting events. "
                        "Please report this bug to https://github.com/DataDog/dd-trace-py/issues"
                    )

    def periodic(self):
        start_time = compat.monotonic()
        try:
            self.flush()
        finally:
            self.interval = max(0, self._configured_interval - (compat.monotonic() - start_time))
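
The interval arithmetic in `periodic()` keeps exports on a roughly fixed cadence: the time spent in `flush()` is subtracted from the configured interval so that a slow export does not push the next one further out. A small standalone sketch of that calculation, using `time.monotonic()` in place of the `compat` shim:

import time

configured_interval = 60.0  # DD_PROFILING_UPLOAD_INTERVAL default


def next_interval(flush):
    start_time = time.monotonic()
    try:
        flush()  # may take a while for a large profile
    finally:
        elapsed = time.monotonic() - start_time
    # Never return a negative interval, even if the flush overran the budget.
    return max(0, configured_interval - elapsed)


# A flush that takes 2.5s leaves ~57.5s until the next export.
print(next_interval(lambda: time.sleep(2.5)))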
Example #4
class CaptureSamplerCollector(Collector):
    capture_pct = attr.ib(factory=attr_utils.from_env("DD_PROFILING_CAPTURE_PCT", 2.0, float))
    _capture_sampler = attr.ib(default=attr.Factory(_create_capture_sampler, takes_self=True), init=False, repr=False)
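
`capture_pct` feeds a sampler that decides, event by event, whether something like a lock acquisition gets recorded. An accumulator-style sketch of such a sampler is below; it is illustrative and not necessarily the exact `_create_capture_sampler` implementation.

class CaptureSampler(object):
    """Sketch: capture roughly capture_pct percent of events, deterministically."""

    def __init__(self, capture_pct):
        if not 0 <= capture_pct <= 100:
            raise ValueError("Capture percentage should be between 0 and 100")
        self.capture_pct = capture_pct
        self._counter = 0.0

    def capture(self):
        # Accumulate the percentage; each time it crosses 100, keep one event.
        self._counter += self.capture_pct
        if self._counter >= 100:
            self._counter -= 100
            return True
        return False


# With the default DD_PROFILING_CAPTURE_PCT of 2.0, about 1 event in 50 is kept.
sampler = CaptureSampler(2.0)
assert sum(sampler.capture() for _ in range(1000)) == 20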
Example #5
class MemoryCollector(collector.PeriodicCollector):
    """Memory allocation collector."""

    _DEFAULT_MAX_EVENTS = 32
    _DEFAULT_INTERVAL = 0.5

    # Arbitrary interval to empty the _memalloc event buffer
    _interval = attr.ib(default=_DEFAULT_INTERVAL, repr=False)

    # TODO: make this dynamic based on (1) the interval and (2) the max number of events allowed in the Recorder
    _max_events = attr.ib(factory=attr_utils.from_env(
        "_DD_PROFILING_MEMORY_EVENTS_BUFFER", _DEFAULT_MAX_EVENTS, int))
    max_nframe = attr.ib(
        factory=attr_utils.from_env("DD_PROFILING_MAX_FRAMES", 64, int))
    heap_sample_size = attr.ib(type=int, factory=_get_default_heap_sample_size)
    ignore_profiler = attr.ib(factory=attr_utils.from_env(
        "DD_PROFILING_IGNORE_PROFILER", False, formats.asbool))

    def _start_service(self):  # type: ignore[override]
        # type: (...) -> None
        """Start collecting memory profiles."""
        if _memalloc is None:
            raise collector.CollectorUnavailable

        _memalloc.start(self.max_nframe, self._max_events,
                        self.heap_sample_size)

        super(MemoryCollector, self)._start_service()

    def _stop_service(self):  # type: ignore[override]
        # type: (...) -> None
        super(MemoryCollector, self)._stop_service()

        if _memalloc is not None:
            try:
                _memalloc.stop()
            except RuntimeError:
                pass

    def _get_thread_id_ignore_set(self):
        # type: () -> typing.Set[int]
        # In theory this method is prone to race conditions, but very rarely in practice.
        # In any case it's a best-effort feature, so this is not a big deal.
        return {
            thread.ident
            for thread in threading.enumerate()
            if getattr(thread, "_ddtrace_profiling_ignore", False)
            and thread.ident is not None
        }

    def snapshot(self):
        thread_id_ignore_set = self._get_thread_id_ignore_set()
        return (tuple(
            MemoryHeapSampleEvent(
                thread_id=thread_id,
                thread_name=_threading.get_thread_name(thread_id),
                thread_native_id=_threading.get_thread_native_id(thread_id),
                frames=stack,
                nframes=nframes,
                size=size,
                sample_size=self.heap_sample_size,
            ) for (stack, nframes, thread_id), size in _memalloc.heap()
            if not self.ignore_profiler
            or thread_id not in thread_id_ignore_set), )

    def collect(self):
        events, count, alloc_count = _memalloc.iter_events()
        capture_pct = 100 * count / alloc_count
        thread_id_ignore_set = self._get_thread_id_ignore_set()
        # TODO: The event timestamp is slightly off since it's going to be the time we copy the data from the
        # _memalloc buffer to our Recorder. This is fine for now, but we might want to store the nanoseconds
        # timestamp in C and then return it via iter_events.
        return (tuple(
            MemoryAllocSampleEvent(
                thread_id=thread_id,
                thread_name=_threading.get_thread_name(thread_id),
                thread_native_id=_threading.get_thread_native_id(thread_id),
                frames=stack,
                nframes=nframes,
                size=size,
                capture_pct=capture_pct,
                nevents=alloc_count,
            ) for (stack, nframes, thread_id), size in events
            if not self.ignore_profiler
            or thread_id not in thread_id_ignore_set), )
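
The `ignore_profiler` option relies on profiler-owned threads marking themselves with a `_ddtrace_profiling_ignore` attribute, which `_get_thread_id_ignore_set()` then matches against `thread.ident`. A standalone sketch of that marker-and-filter pattern:

import threading

release = threading.Event()

worker = threading.Thread(target=release.wait)
worker._ddtrace_profiling_ignore = True  # marker attribute the collector looks for
worker.start()

ignore_set = {
    thread.ident
    for thread in threading.enumerate()
    if getattr(thread, "_ddtrace_profiling_ignore", False) and thread.ident is not None
}
assert worker.ident in ignore_set  # events from this thread would be dropped

release.set()
worker.join()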
Example #6
class PprofHTTPExporter(pprof.PprofExporter):
    """PProf HTTP exporter."""

    endpoint = attr.ib()
    api_key = attr.ib(default=None)
    # Do not use the default agent timeout: it is too short. The agent is just an unbuffered proxy, and the profiling
    # backend is not as fast as the tracer's.
    timeout = attr.ib(factory=attr_utils.from_env("DD_PROFILING_API_TIMEOUT", 10.0, float), type=float)
    service = attr.ib(default=None)
    env = attr.ib(default=None)
    version = attr.ib(default=None)
    tags = attr.ib(factory=dict)
    max_retry_delay = attr.ib(default=None)
    _container_info = attr.ib(factory=container.get_container_info, repr=False)
    _retry_upload = attr.ib(init=False, eq=False)
    endpoint_path = attr.ib(default="/profiling/v1/input")

    def __attrs_post_init__(self):
        if self.max_retry_delay is None:
            self.max_retry_delay = self.timeout * 3
        self._retry_upload = tenacity.Retrying(
            # Retry after 1s, 2s, 4s, 8s with some randomness
            wait=tenacity.wait_random_exponential(multiplier=0.5),
            stop=tenacity.stop_after_delay(self.max_retry_delay),
            retry_error_cls=UploadFailed,
            retry=tenacity.retry_if_exception_type((http_client.HTTPException, OSError, IOError)),
        )
        tags = {
            k: six.ensure_binary(v)
            for k, v in itertools.chain(
                parse_tags_str(os.environ.get("DD_TAGS")).items(),
                parse_tags_str(os.environ.get("DD_PROFILING_TAGS")).items(),
            )
        }
        tags.update({k: six.ensure_binary(v) for k, v in self.tags.items()})
        tags.update(
            {
                "host": HOSTNAME.encode("utf-8"),
                "language": b"python",
                "runtime": PYTHON_IMPLEMENTATION,
                "runtime_version": PYTHON_VERSION,
                "profiler_version": ddtrace.__version__.encode("ascii"),
            }
        )
        if self.version:
            tags["version"] = self.version.encode("utf-8")

        if self.env:
            tags["env"] = self.env.encode("utf-8")

        self.tags = tags

    @staticmethod
    def _encode_multipart_formdata(fields, tags):
        boundary = binascii.hexlify(os.urandom(16))

        # The body that is generated is very sensitive and must perfectly match what the server expects.
        body = (
            b"".join(
                b"--%s\r\n"
                b'Content-Disposition: form-data; name="%s"\r\n'
                b"\r\n"
                b"%s\r\n" % (boundary, field.encode(), value)
                for field, value in fields.items()
                if field != "chunk-data"
            )
            + b"".join(
                b"--%s\r\n"
                b'Content-Disposition: form-data; name="tags[]"\r\n'
                b"\r\n"
                b"%s:%s\r\n" % (boundary, tag.encode(), value)
                for tag, value in tags.items()
            )
            + b"--"
            + boundary
            + b"\r\n"
            b'Content-Disposition: form-data; name="chunk-data"; filename="profile.pb.gz"\r\n'
            + b"Content-Type: application/octet-stream\r\n\r\n"
            + fields["chunk-data"]
            + b"\r\n--%s--\r\n" % boundary
        )

        content_type = b"multipart/form-data; boundary=%s" % boundary

        return content_type, body

    def _get_tags(self, service):
        tags = {
            "service": service.encode("utf-8"),
            "runtime-id": runtime.get_runtime_id().encode("ascii"),
        }

        tags.update(self.tags)

        return tags

    def export(self, events, start_time_ns, end_time_ns):
        """Export events to an HTTP endpoint.

        :param events: The event dictionary from a `ddtrace.profiling.recorder.Recorder`.
        :param start_time_ns: The start time of recording.
        :param end_time_ns: The end time of recording.
        """
        if self.api_key:
            headers = {
                "DD-API-KEY": self.api_key.encode(),
            }
        else:
            headers = {}

        if self._container_info and self._container_info.container_id:
            headers["Datadog-Container-Id"] = self._container_info.container_id

        profile = super(PprofHTTPExporter, self).export(events, start_time_ns, end_time_ns)
        s = six.BytesIO()
        with gzip.GzipFile(fileobj=s, mode="wb") as gz:
            gz.write(profile.SerializeToString())
        fields = {
            "runtime-id": runtime.get_runtime_id().encode("ascii"),
            "recording-start": (
                datetime.datetime.utcfromtimestamp(start_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z"
            ).encode(),
            "recording-end": (
                datetime.datetime.utcfromtimestamp(end_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z"
            ).encode(),
            "runtime": PYTHON_IMPLEMENTATION,
            "format": b"pprof",
            "type": b"cpu+alloc+exceptions",
            "chunk-data": s.getvalue(),
        }

        service = self.service or os.path.basename(profile.string_table[profile.mapping[0].filename])

        content_type, body = self._encode_multipart_formdata(
            fields,
            tags=self._get_tags(service),
        )
        headers["Content-Type"] = content_type

        client = agent.get_connection(self.endpoint, self.timeout)
        self._upload(client, self.endpoint_path, body, headers)

    def _upload(self, client, path, body, headers):
        self._retry_upload(self._upload_once, client, path, body, headers)

    def _upload_once(self, client, path, body, headers):
        try:
            client.request("POST", path, body=body, headers=headers)
            response = client.getresponse()
            response.read()  # reading is mandatory
        finally:
            client.close()

        if 200 <= response.status < 300:
            return

        if 500 <= response.status < 600:
            raise tenacity.TryAgain

        if response.status == 400:
            raise exporter.ExportError("Server returned 400, check your API key")
        elif response.status == 404 and not self.api_key:
            raise exporter.ExportError(
                "Datadog Agent is not accepting profiles. "
                "Agent-based profiling deployments require Datadog Agent >= 7.20"
            )

        raise exporter.ExportError("HTTP Error %d" % response.status)
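
The retry policy built in `__attrs_post_init__` only retries transport-level failures, with jittered exponential backoff capped by a total delay (and `retry_error_cls=UploadFailed` to rename the terminal error). A self-contained sketch of equivalent tenacity usage, with an illustrative `upload_once` that fails twice before succeeding:

import http.client

import tenacity

retry_upload = tenacity.Retrying(
    # Retry after ~1s, 2s, 4s, 8s with some randomness.
    wait=tenacity.wait_random_exponential(multiplier=0.5),
    # Stop retrying once 30 seconds have elapsed in total.
    stop=tenacity.stop_after_delay(30),
    retry=tenacity.retry_if_exception_type((http.client.HTTPException, OSError)),
)

attempts = []


def upload_once():
    attempts.append(1)
    if len(attempts) < 3:
        raise OSError("simulated transient network error")
    return "uploaded"


print(retry_upload(upload_once))  # retries twice, then prints "uploaded"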