def metrics_histogram_timer(metric: str, timing_buckets: Iterable[int], tags: Dict[str, str] = None, bucket_tag: str = 'duration', callback: Callable = None): """ Create a context manager that times and reports to the metric providers as a histogram Example Usage: :: timer = metrics_histogram_timer('commcare.some.special.metric', tags={ 'type': type, ], timing_buckets=(.001, .01, .1, 1, 10, 100)) with timer: some_special_thing() This will result it a call to `metrics_histogram` with the timer value. Note: Histograms are implemented differently by each provider. See documentation for details. :param metric: Name of the metric (must start with 'commcare.') :param tags: metric tags to include :param timing_buckets: sequence of numbers representing time thresholds, in seconds :param bucket_tag: The name of the bucket tag to use (if used by the underlying provider) :param callback: a callable which will be called when exiting the context manager with a single argument of the timer duratio :return: A context manager that will perform the specified timing and send the specified metric """ timer = TimingContext() original_stop = timer.stop def new_stop(name=None): original_stop(name) if callback: callback(timer.duration) metrics_histogram(metric, timer.duration, bucket_tag=bucket_tag, buckets=timing_buckets, bucket_unit='s', tags=tags) timer_name = metric if metric.startswith('commcare.'): timer_name = ".".join( metric.split('.')[1:]) # remove the 'commcare.' prefix add_breadcrumb( category="timing", message=f"{timer_name}: {timer.duration:0.3f}", level="info", ) timer.stop = new_stop return timer
def datadog_bucket_timer(metric, tags, timing_buckets, callback=None): """ create a context manager that times and reports to datadog using timing buckets adds a 'duration' tag specifying which predefined timing bucket the timing fell into, see the `bucket_value` function for more info. Example Usage: timer = datadog_bucket_timer('commcare.some.special.metric', tags=[ 'type:{}'.format(type), ], timing_buckets=(.001, .01, .1, 1, 10, 100)) with timer: some_special_thing() This will result it a datadog counter metric with a 'duration' tag, with the possible values lt_0.001, lt_0.01, lt_0.1, lt_001, lt_010, lt_100, and over_100. :param metric: Name of the datadog metric (must start with 'commcare.') :param tags: Datadog tags to include :param timing_buckets: sequence of numbers representing time thresholds, in seconds :return: A context manager that will perform the specified timing and send the specified metric """ timer = TimingContext() original_stop = timer.stop def new_stop(name=None): original_stop(name) if callback: callback(timer.duration) datadog_counter(metric, tags=list(tags) + [ 'duration:%s' % bucket_value(timer.duration, timing_buckets, 's') ]) timer.stop = new_stop return timer
def datadog_bucket_timer(metric, tags, timing_buckets, callback=None): """ create a context manager that times and reports to datadog using timing buckets adds a 'duration' tag specifying which predefined timing bucket the timing fell into, see the `bucket_value` function for more info. Example Usage: timer = datadog_bucket_timer('commcare.some.special.metric', tags=[ 'type:{}'.format(type), ], timing_buckets=(.001, .01, .1, 1, 10, 100)) with timer: some_special_thing() This will result it a datadog counter metric with a 'duration' tag, with the possible values lt_0.001, lt_0.01, lt_0.1, lt_001, lt_010, lt_100, and over_100. :param metric: Name of the datadog metric (must start with 'commcare.') :param tags: Datadog tags to include :param timing_buckets: sequence of numbers representing time thresholds, in seconds :return: A context manager that will perform the specified timing and send the specified metric """ timer = TimingContext() original_stop = timer.stop def new_stop(name=None): original_stop(name) if callback: callback(timer.duration) datadog_counter( metric, tags=list(tags) + ['duration:%s' % bucket_value(timer.duration, timing_buckets, 's')] ) timer.stop = new_stop return timer
class RestoreConfig(object): """ A collection of attributes associated with an OTA restore :param project: The domain object. An instance of `Domain`. :param restore_user: The restore user requesting the restore :param params: The RestoreParams associated with this (see above). :param cache_settings: The RestoreCacheSettings associated with this (see above). :param is_async: Whether to get the restore response using a celery task :param case_sync: Case sync algorithm (None -> default). """ def __init__(self, project=None, restore_user=None, params=None, cache_settings=None, is_async=False, case_sync=None): assert isinstance(restore_user, OTARestoreUser) self.project = project self.domain = project.name if project else '' self.restore_user = restore_user self.params = params or RestoreParams() self.cache_settings = cache_settings or RestoreCacheSettings() self.is_async = is_async self.restore_state = RestoreState( self.project, self.restore_user, self.params, is_async, self.cache_settings.overwrite_cache, case_sync=case_sync, ) self.force_cache = self.cache_settings.force_cache self.cache_timeout = self.cache_settings.cache_timeout self.overwrite_cache = self.cache_settings.overwrite_cache self.timing_context = TimingContext('restore-{}-{}'.format(self.domain, self.restore_user.username)) @property @memoized def sync_log(self): return self.restore_state.last_sync_log @property def async_restore_task_id_cache(self): return AsyncRestoreTaskIdCache( domain=self.domain, user_id=self.restore_user.user_id, sync_log_id=self.sync_log._id if self.sync_log else '', device_id=self.params.device_id, ) @property def restore_payload_path_cache(self): return RestorePayloadPathCache( domain=self.domain, user_id=self.restore_user.user_id, sync_log_id=self.sync_log._id if self.sync_log else '', device_id=self.params.device_id, ) @property def initial_restore_payload_path_cache(self): return RestorePayloadPathCache( domain=self.domain, user_id=self.restore_user.user_id, sync_log_id='', device_id=self.params.device_id, ) def get_response(self): is_async = self.is_async try: with self.timing_context: payload = self.get_payload() response = payload.get_http_response() except RestoreException as e: logger.exception("%s error during restore submitted by %s: %s" % (type(e).__name__, self.restore_user.username, str(e))) is_async = False response = get_simple_response_xml( str(e), ResponseNature.OTA_RESTORE_ERROR ) response = HttpResponse(response, content_type="text/xml; charset=utf-8", status=412) # precondition failed if not is_async: self._record_timing(response.status_code) return response def get_payload(self): self.validate() self.delete_initial_cached_payload_if_necessary() self.delete_cached_payload_if_necessary() cached_response = self.get_cached_response() tags = { 'domain': self.domain, 'is_async': bool(self.is_async), } if cached_response: metrics_counter('commcare.restores.cache_hits.count', tags=tags) return cached_response metrics_counter('commcare.restores.cache_misses.count', tags=tags) # Start new sync if self.is_async: response = self._get_asynchronous_payload() else: response = self.generate_payload() return response def validate(self): try: self.restore_state.validate_state() except InvalidSyncLogException as e: # This exception will get caught by the view and a 412 will be returned to the phone for resync raise RestoreException(e) def get_cached_response(self): if self.overwrite_cache: return None cache_payload_path = self.restore_payload_path_cache.get_value() return CachedResponse(cache_payload_path) def generate_payload(self, async_task=None): if async_task: self.timing_context.stop("wait_for_task_to_start") self.restore_state.start_sync() fileobj = self._generate_restore_response(async_task=async_task) try: self.restore_state.finish_sync() cached_response = self.set_cached_payload_if_necessary( fileobj, self.restore_state.duration, async_task) if async_task: fileobj.close() assert cached_response is not None response = cached_response self.timing_context.stop() # root timer self._record_timing('async') else: fileobj.seek(0) response = RestoreResponse(fileobj) except: fileobj.close() raise return response def _get_asynchronous_payload(self): new_task = False # fetch the task from celery task_id = self.async_restore_task_id_cache.get_value() if task_id: task = AsyncResult(task_id) task_exists = task.status == ASYNC_RESTORE_SENT else: task = None task_exists = False if not task_exists: # start a new task # NOTE this starts a nested timer (wait_for_task_to_start), # which will be stopped by self.generate_payload(async_task) # in the asynchronous task. It is expected that # get_async_restore_payload.delay(self) will serialize this # RestoreConfig and it's associated TimingContext before it # returns, and thereby fork the timing context. The timing # context associated with this side of the fork will not be # recorded since it is async (see self.get_response). with self.timing_context("wait_for_task_to_start"): task = get_async_restore_payload.delay(self, self.domain, self.restore_user.username) logger.info('RestoreConfig after get_async_restore_payload task is created: %r', self) new_task = True # store the task id in cache self.async_restore_task_id_cache.set_value(task.id) try: response_or_name = task.get(timeout=self._get_task_timeout(new_task)) if isinstance(response_or_name, bytes): response_or_name = response_or_name.decode('utf-8') if isinstance(response_or_name, str): response = CachedResponse(response_or_name) else: response = response_or_name except TimeoutError: # return a 202 with progress response = AsyncRestoreResponse(task, self.restore_user.username) return response def _get_task_timeout(self, new_task): # if this is a new task, wait for INITIAL_ASYNC_TIMEOUT in case # this restore completes quickly. otherwise, only wait 1 second for # a response. return INITIAL_ASYNC_TIMEOUT_THRESHOLD if new_task else 1 def _generate_restore_response(self, async_task=None): """ :returns: A file-like object containing response content. """ username = self.restore_user.username count_items = self.params.include_item_count with RestoreContent(username, count_items) as content: for provider in get_element_providers(self.timing_context): with self.timing_context(provider.__class__.__name__): content.extend(provider.get_elements(self.restore_state)) for provider in get_async_providers(self.timing_context, async_task): with self.timing_context(provider.__class__.__name__): provider.extend_response(self.restore_state, content) return content.get_fileobj() def set_cached_payload_if_necessary(self, fileobj, duration, is_async): # must cache if the duration was longer than the threshold is_long_restore = duration > timedelta(seconds=INITIAL_SYNC_CACHE_THRESHOLD) if is_async or self.force_cache or is_long_restore: type_ = 'unknown' if is_async: type_ = 'async' elif self.force_cache: type_ = 'force' elif is_long_restore: type_ = 'long' tags = { 'type': type_, } metrics_counter('commcare.restores.cache_writes', tags=tags) response = CachedResponse.save_for_later( fileobj, self.cache_timeout, self.domain, self.restore_user.user_id, ) self.restore_payload_path_cache.set_value(response.name, self.cache_timeout) return response return None def delete_cached_payload_if_necessary(self): if self.overwrite_cache and self.restore_payload_path_cache.exists(): self.restore_payload_path_cache.invalidate() def delete_initial_cached_payload_if_necessary(self): if self.sync_log: # Restores are usually cached by there sync token # but initial restores don't have a sync token, # so they're indistinguishable from each other. # Once a user syncs with a sync token, we're sure their initial sync is stale, # so delete it to avoid a stale payload if they (say) wipe the phone and sync again self.initial_restore_payload_path_cache.invalidate() def _record_timing(self, status): timing = self.timing_context assert timing.is_finished() duration = timing.duration device_id = self.params.device_id if duration > 20 or status == 412: if status == 412: # use last sync log since there is no current sync log sync_log_id = self.params.sync_log_id or 'N/A' else: sync_log = self.restore_state.current_sync_log sync_log_id = sync_log._id if sync_log else 'N/A' logger.info( "restore %s: user=%s device=%s domain=%s status=%s duration=%.3f", sync_log_id, self.restore_user.username, device_id, self.domain, status, duration, ) is_webapps = device_id and device_id.startswith("WebAppsLogin") tags = { 'status_code': status, 'device_type': 'webapps' if is_webapps else 'other', } maybe_add_domain_tag(self.domain, tags) timer_buckets = (1, 5, 20, 60, 120, 300, 600) for timer in timing.to_list(exclude_root=True): segment = None extra_tags = {} if timer.name in RESTORE_SEGMENTS: segment = RESTORE_SEGMENTS[timer.name] elif timer.name.startswith('fixture:'): segment = 'fixture' extra_tags = {'fixture': timer.name.split(':')[1]} if segment: metrics_histogram( 'commcare.restores.{}.duration.seconds'.format(segment), timer.duration, bucket_tag='duration', buckets=timer_buckets, bucket_unit='s', tags={**tags, **extra_tags} ) metrics_counter( 'commcare.restores.{}'.format(segment), tags={**tags, **extra_tags}, ) tags['type'] = 'sync' if self.params.sync_log_id else 'restore' if settings.ENTERPRISE_MODE: if self.params.app and self.params.app.copy_of: app_name = slugify(self.params.app.name) tags['app'] = '{}-{}'.format(app_name, self.params.app.version) else: tags['app'] = '' metrics_counter('commcare.restores.count', tags=tags) metrics_histogram( 'commcare.restores.duration.seconds', timing.duration, bucket_tag='duration', buckets=timer_buckets, bucket_unit='s', tags=tags ) def __repr__(self): return \ "RestoreConfig(project='{}', domain={}, restore_user={}, cache_settings='{}', " \ "is_async='{}', restore_state='{}', force_cache='{}', cache_timeout='{}', overwrite_cache='{}', " \ "timing_context='{}')".format( self.project, self.domain, self.restore_user, self.cache_settings, self.is_async, self.restore_state, self.force_cache, self.cache_timeout, self.overwrite_cache, self.timing_context.to_dict() )
class DocCounter: DD_KEY = "commcare.couchsqlmigration.processed_docs" DD_INTERVAL = 100 STATE_KEY = "doc_counts" STATE_INTERVAL = 1000 def __init__(self, statedb): self.statedb = statedb self.counts = defaultdict(int, self.statedb.get(self.STATE_KEY, {})) self.timing = TimingContext("couch_sql_migration") self.dd_session = 0 self.state_session = 0 def __enter__(self): self.timing.start() return self def __exit__(self, *exc_info): self.timing.stop() self._send_timings() @contextmanager def __call__(self, dd_type, doc_type=None): """Create counting context :param dd_type: datadog 'type' tag :param doc_type: optional doc type; doc type must be passed to the returned counter function if not provided here, and cannot be passed if it is provided here. :yields: counter function. """ tags = [f"type:{dd_type}"] args = (doc_type, ) if doc_type else () with self.timing(dd_type): try: yield partial(self.add, tags, *args) finally: self.flush(tags) def add(self, tags, doc_type, count=1): self.counts[doc_type] += count self.dd_session += count self.state_session += count if self.state_session > self.STATE_INTERVAL: self.flush(tags) elif self.dd_session > self.DD_INTERVAL: self.flush(tags, state=False) def flush(self, tags, state=True): if state: counts = dict(self.counts) self.statedb.set(self.STATE_KEY, counts) log.debug("saved doc counts: %s", counts) self.state_session = 0 if tags is not None: datadog_counter(self.DD_KEY, value=self.dd_session, tags=tags) self.dd_session = 0 def get(self, doc_type): return self.counts.get(doc_type, 0) def pop(self, doc_type): return self.counts.pop(doc_type, 0) def normalize_timing(self, doc_count): self.timing.peek().normalize_denominator = doc_count def _send_timings(self): metric_name_template = "commcare.%s.count" metric_name_template_normalized = "commcare.%s.count.normalized" for timing in self.timing.to_list(): datadog_counter(metric_name_template % timing.full_name, tags=[ 'duration:%s' % bucket_value(timing.duration, TIMING_BUCKETS) ]) if getattr(timing, "normalize_denominator", 0): datadog_counter( metric_name_template_normalized % timing.full_name, tags=[ 'duration:%s' % bucket_value( timing.duration / timing.normalize_denominator, NORMALIZED_TIMING_BUCKETS, ) ])
class RestoreConfig(object): """ A collection of attributes associated with an OTA restore :param project: The domain object. An instance of `Domain`. :param restore_user: The restore user requesting the restore :param params: The RestoreParams associated with this (see above). :param cache_settings: The RestoreCacheSettings associated with this (see above). :param is_async: Whether to get the restore response using a celery task :param case_sync: Case sync algorithm (None -> default). """ def __init__(self, project=None, restore_user=None, params=None, cache_settings=None, is_async=False, case_sync=None): assert isinstance(restore_user, OTARestoreUser) self.project = project self.domain = project.name if project else '' self.restore_user = restore_user self.params = params or RestoreParams() self.cache_settings = cache_settings or RestoreCacheSettings() self.is_async = is_async self.restore_state = RestoreState( self.project, self.restore_user, self.params, is_async, self.cache_settings.overwrite_cache, case_sync=case_sync, ) self.force_cache = self.cache_settings.force_cache self.cache_timeout = self.cache_settings.cache_timeout self.overwrite_cache = self.cache_settings.overwrite_cache self.timing_context = TimingContext('restore-{}-{}'.format(self.domain, self.restore_user.username)) @property @memoized def sync_log(self): return self.restore_state.last_sync_log @property def async_restore_task_id_cache(self): return AsyncRestoreTaskIdCache( domain=self.domain, user_id=self.restore_user.user_id, sync_log_id=self.sync_log._id if self.sync_log else '', device_id=self.params.device_id, ) @property def restore_payload_path_cache(self): return RestorePayloadPathCache( domain=self.domain, user_id=self.restore_user.user_id, sync_log_id=self.sync_log._id if self.sync_log else '', device_id=self.params.device_id, ) @property def initial_restore_payload_path_cache(self): return RestorePayloadPathCache( domain=self.domain, user_id=self.restore_user.user_id, sync_log_id='', device_id=self.params.device_id, ) def get_response(self): is_async = self.is_async try: with self.timing_context: payload = self.get_payload() response = payload.get_http_response() except RestoreException as e: logging.exception("%s error during restore submitted by %s: %s" % (type(e).__name__, self.restore_user.username, str(e))) is_async = False response = get_simple_response_xml( e.message, ResponseNature.OTA_RESTORE_ERROR ) response = HttpResponse(response, content_type="text/xml; charset=utf-8", status=412) # precondition failed if not is_async: self._record_timing(response.status_code) return response def get_payload(self): self.validate() self.delete_initial_cached_payload_if_necessary() self.delete_cached_payload_if_necessary() cached_response = self.get_cached_response() tags = [ 'domain:{}'.format(self.domain), 'is_async:{}'.format(bool(self.is_async)), ] if cached_response: datadog_counter('commcare.restores.cache_hits.count', tags=tags) return cached_response datadog_counter('commcare.restores.cache_misses.count', tags=tags) # Start new sync if self.is_async: response = self._get_asynchronous_payload() else: response = self.generate_payload() return response def validate(self): try: self.restore_state.validate_state() except InvalidSyncLogException as e: # This exception will get caught by the view and a 412 will be returned to the phone for resync raise RestoreException(e) def get_cached_response(self): if self.overwrite_cache: return None cache_payload_path = self.restore_payload_path_cache.get_value() return CachedResponse(cache_payload_path) def generate_payload(self, async_task=None): if async_task: self.timing_context.stop("wait_for_task_to_start") self.restore_state.start_sync() fileobj = self._generate_restore_response(async_task=async_task) try: self.restore_state.finish_sync() cached_response = self.set_cached_payload_if_necessary( fileobj, self.restore_state.duration, async_task) if async_task: fileobj.close() assert cached_response is not None response = cached_response self.timing_context.stop() # root timer self._record_timing('async') else: fileobj.seek(0) response = RestoreResponse(fileobj) except: fileobj.close() raise return response def _get_asynchronous_payload(self): new_task = False # fetch the task from celery task_id = self.async_restore_task_id_cache.get_value() task = AsyncResult(task_id) task_exists = task.status == ASYNC_RESTORE_SENT if not task_exists: # start a new task # NOTE this starts a nested timer (wait_for_task_to_start), # which will be stopped by self.generate_payload(async_task) # in the asynchronous task. It is expected that # get_async_restore_payload.delay(self) will serialize this # RestoreConfig and it's associated TimingContext before it # returns, and thereby fork the timing context. The timing # context associated with this side of the fork will not be # recorded since it is async (see self.get_response). with self.timing_context("wait_for_task_to_start"): task = get_async_restore_payload.delay(self, self.domain, self.restore_user.username) new_task = True # store the task id in cache self.async_restore_task_id_cache.set_value(task.id) try: response = task.get(timeout=self._get_task_timeout(new_task)) except TimeoutError: # return a 202 with progress response = AsyncRestoreResponse(task, self.restore_user.username) return response def _get_task_timeout(self, new_task): # if this is a new task, wait for INITIAL_ASYNC_TIMEOUT in case # this restore completes quickly. otherwise, only wait 1 second for # a response. return INITIAL_ASYNC_TIMEOUT_THRESHOLD if new_task else 1 def _generate_restore_response(self, async_task=None): """ :returns: A file-like object containing response content. """ username = self.restore_user.username count_items = self.params.include_item_count with RestoreContent(username, count_items) as content: for provider in get_element_providers(self.timing_context): with self.timing_context(provider.__class__.__name__): content.extend(provider.get_elements(self.restore_state)) for provider in get_async_providers(self.timing_context, async_task): with self.timing_context(provider.__class__.__name__): provider.extend_response(self.restore_state, content) return content.get_fileobj() def set_cached_payload_if_necessary(self, fileobj, duration, is_async): # only cache if the duration was longer than the threshold is_long_restore = duration > timedelta(seconds=INITIAL_SYNC_CACHE_THRESHOLD) if is_async or self.force_cache or is_long_restore: type_ = 'unknown' if is_async: type_ = 'async' elif self.force_cache: type_ = 'force' elif is_long_restore: type_ = 'long' tags = { 'type:{}'.format(type_), } datadog_counter('commcare.restores.cache_writes', tags=tags) response = CachedResponse.save_for_later( fileobj, self.cache_timeout, self.domain, self.restore_user.user_id, ) self.restore_payload_path_cache.set_value(response.name, self.cache_timeout) return response return None def delete_cached_payload_if_necessary(self): if self.overwrite_cache and self.restore_payload_path_cache.get_value(): self.restore_payload_path_cache.invalidate() def delete_initial_cached_payload_if_necessary(self): if self.sync_log: # Restores are usually cached by there sync token # but initial restores don't have a sync token, # so they're indistinguishable from each other. # Once a user syncs with a sync token, we're sure their initial sync is stale, # so delete it to avoid a stale payload if they (say) wipe the phone and sync again self.initial_restore_payload_path_cache.invalidate() def _record_timing(self, status): timing = self.timing_context assert timing.is_finished() duration = timing.duration if timing is not None else -1 device_id = self.params.device_id if duration > 20 or status == 412: if status == 412: # use last sync log since there is no current sync log sync_log_id = self.params.sync_log_id or 'N/A' else: sync_log = self.restore_state.current_sync_log sync_log_id = sync_log._id if sync_log else 'N/A' log = logging.getLogger(__name__) log.info( "restore %s: user=%s device=%s domain=%s status=%s duration=%.3f", sync_log_id, self.restore_user.username, device_id, self.domain, status, duration, ) is_webapps = device_id and device_id.startswith("WebAppsLogin") tags = [ 'status_code:{}'.format(status), 'device_type:{}'.format('webapps' if is_webapps else 'other'), ] env = settings.SERVER_ENVIRONMENT if (env, self.domain) in settings.RESTORE_TIMING_DOMAINS: tags.append('domain:{}'.format(self.domain)) if timing is not None: timer_buckets = (5, 20, 60, 120) for timer in timing.to_list(exclude_root=True): if timer.name in RESTORE_SEGMENTS: segment = RESTORE_SEGMENTS[timer.name] bucket = bucket_value(timer.duration, timer_buckets, 's') datadog_counter( 'commcare.restores.{}'.format(segment), tags=tags + ['duration:%s' % bucket], ) elif timer.name.startswith('fixture:'): bucket = bucket_value(timer.duration, timer_buckets, 's') datadog_counter( 'commcare.restores.fixture', tags=tags + [ 'duration:%s' % bucket, timer.name, ], ) tags.append('duration:%s' % bucket_value(timing.duration, timer_buckets, 's')) if settings.ENTERPRISE_MODE and self.params.app and self.params.app.copy_of: app_name = slugify(self.params.app.name) tags.append('app:{}-{}'.format(app_name, self.params.app.version)) datadog_counter('commcare.restores.count', tags=tags)