def _record_metrics(tags, submission_type, response, timer=None, xform=None): tags.update({ 'submission_type': submission_type, 'status_code': response.status_code }) if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on: lag = xform.received_on - xform.metadata.timeEnd lag_days = lag.total_seconds() / 86400 metrics_histogram('commcare.xform_submissions.lag.days', lag_days, bucket_tag='lag', buckets=(1, 2, 4, 7, 14, 31, 90), bucket_unit='d', tags=tags) if timer: metrics_histogram('commcare.xform_submissions.duration.seconds', timer.duration, bucket_tag='duration', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', tags=tags) metrics_counter('commcare.xform_submissions.count', tags=tags)
def _report_import_timings(self, timer, results): active_duration = timer.duration - self._total_delayed_duration rows_created = results['created_count'] rows_updated = results['match_count'] rows_failed = results['failed_count'] # Add 1 to smooth / prevent denominator from ever being zero active_duration_per_case = active_duration / ( rows_created + rows_updated + rows_failed + 1) metrics_histogram( 'commcare.case_importer.duration_per_case', active_duration_per_case, buckets=[50, 70, 100, 150, 250, 350, 500], bucket_tag='duration', bucket_unit='ms', ) for rows, status in ((rows_created, 'created'), (rows_updated, 'updated'), (rows_failed, 'error')): metrics_counter('commcare.case_importer.cases', rows, tags={ 'status': status, })
def __call__(self, restore_state): restore_user = restore_state.restore_user global_types = {} user_types = {} for data_type in FixtureDataType.by_domain(restore_user.domain): if data_type.is_global: global_types[data_type._id] = data_type else: user_types[data_type._id] = data_type items = [] user_items_count = 0 if global_types: global_items = self.get_global_items(global_types, restore_state) items.extend(global_items) if user_types: user_items, user_items_count = self.get_user_items_and_count(user_types, restore_user) items.extend(user_items) metrics_histogram( 'commcare.fixtures.item_lists.user', user_items_count, bucket_tag='items', buckets=[1, 100, 1000, 10000, 30000, 100000, 300000, 1000000], bucket_unit='', tags={ 'domain': restore_user.domain } ) return items
def do_livequery(timing_context, restore_state, response, async_task=None): """Get case sync restore response This function makes no changes to external state other than updating the `restore_state.current_sync_log` and progress of `async_task`. Extends `response` with restore elements. """ debug = logging.getLogger(__name__).debug domain = restore_state.domain owner_ids = list(restore_state.owner_ids) debug("sync %s for %r", restore_state.current_sync_log._id, owner_ids) with timing_context("livequery"): with timing_context("get_case_ids_by_owners"): owned_ids = CommCareCase.objects.get_case_ids_in_domain_by_owners( domain, owner_ids, closed=False) debug("owned: %r", owned_ids) live_ids, indices = get_live_case_ids_and_indices( domain, owned_ids, timing_context) if restore_state.last_sync_log: with timing_context("discard_already_synced_cases"): debug('last sync: %s', restore_state.last_sync_log._id) sync_ids = discard_already_synced_cases( live_ids, restore_state) else: sync_ids = live_ids restore_state.current_sync_log.case_ids_on_phone = live_ids with timing_context("compile_response(%s cases)" % len(sync_ids)): iaccessor = PrefetchIndexCaseAccessor(domain, indices) metrics_histogram( 'commcare.restore.case_load', len(sync_ids), 'cases', RESTORE_CASE_LOAD_BUCKETS, tags={ 'domain': domain, 'restore_type': 'incremental' if restore_state.last_sync_log else 'fresh' }) metrics_counter('commcare.restore.case_load.count', len(sync_ids), {'domain': domain}) compile_response( timing_context, restore_state, response, batch_cases(iaccessor, sync_ids), init_progress(async_task, len(sync_ids)), )
def pre_submit_hook(self): if rate_limit_submission(self.domain): # the duration of the last submission is a combined heuristic # for the amount of load on the databases # and the amount of load that the requests from this import put on the databases. # The amount of time to wait, during a high submission period # and while this project is using up more than its fair share # should be proportional to this heuristic. # For a fully throttled domain, this will up to double # the amount of time the case import takes metrics_histogram( 'commcare.case_importer.import_delays', self._last_submission_duration, buckets=[5, 7, 10, 15, 25, 35, 50], bucket_tag='duration', bucket_unit='s', tags={'domain': self.domain} ) self._total_delayed_duration += self._last_submission_duration time.sleep(self._last_submission_duration)
def batch_cases(accessor, case_ids): def take(n, iterable): # https://docs.python.org/2/library/itertools.html#recipes return list(islice(iterable, n)) track_load = case_load_counter("livequery_restore", accessor.domain) metrics_histogram('commcare.restore.case_load', len(case_ids), 'cases', [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000], tags={'domain': accessor.domain}) ids = iter(case_ids) while True: next_ids = take(1000, ids) if not next_ids: break track_load(len(next_ids)) yield accessor.get_cases(next_ids)
def celery_record_time_to_run(task_id=None, task=None, state=None, args=None, kwargs=None, **kw): from corehq.util.metrics import ( DAY_SCALE_TIME_BUCKETS, metrics_counter, metrics_histogram, ) try: domain = get_domain_from_task(task, args, kwargs) except Exception: domain = None logging.exception( "Error while attempting to get the domain for a celery task") metrics_counter('commcare.celery.task.time_to_run_domain_unavailable', tags={ 'celery_task_name': task.name, }) get_task_time_to_start.clear(task_id) tags = { 'celery_task_name': task.name, 'celery_queue': task.queue, 'state': state, 'domain': domain or 'N/A', } timer = TimeToRunTimer(task_id) try: time_to_run = timer.stop_and_pop_timing() except TimingNotAvailable: metrics_counter('commcare.celery.task.time_to_run_unavailable', tags=tags) else: metrics_histogram('commcare.celery.task.time_to_run.seconds', time_to_run.total_seconds(), bucket_tag='duration', buckets=DAY_SCALE_TIME_BUCKETS, bucket_unit='s', tags=tags)
def celery_record_time_to_run(task_id=None, task=None, state=None, **kwargs): from corehq.util.metrics import metrics_counter, metrics_histogram, DAY_SCALE_TIME_BUCKETS get_task_time_to_start.clear(task_id) tags = { 'celery_task_name': task.name, 'celery_queue': task.queue, 'state': state, } timer = TimeToRunTimer(task_id) try: time_to_run = timer.stop_and_pop_timing() except TimingNotAvailable: metrics_counter('commcare.celery.task.time_to_run_unavailable', tags=tags) else: metrics_histogram('commcare.celery.task.time_to_run.seconds', time_to_run.total_seconds(), bucket_tag='duration', buckets=DAY_SCALE_TIME_BUCKETS, bucket_unit='s', tags=tags)
def record_feed_access_in_datadog(request, config_id, duration, response): config = ExportInstance.get(config_id) username = request.couch_user.username json_response = json.loads(response.content.decode('utf-8')) rows = json_response['value'] row_count = len(rows) try: column_count = len(rows[0]) except IndexError: column_count = 0 metrics_histogram( 'commcare.odata_feed.test_v3', duration, bucket_tag='duration_bucket', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', tags={ 'domain': request.domain, 'feed_id': config_id, 'feed_type': config.type, 'username': username, 'row_count': row_count, 'column_count': column_count, 'size': len(response.content) } )
def _record_timing(self, status): timing = self.timing_context assert timing.is_finished() duration = timing.duration device_id = self.params.device_id if duration > 20 or status == 412: if status == 412: # use last sync log since there is no current sync log sync_log_id = self.params.sync_log_id or 'N/A' else: sync_log = self.restore_state.current_sync_log sync_log_id = sync_log._id if sync_log else 'N/A' logger.info( "restore %s: user=%s device=%s domain=%s status=%s duration=%.3f", sync_log_id, self.restore_user.username, device_id, self.domain, status, duration, ) is_webapps = device_id and device_id.startswith("WebAppsLogin") tags = { 'status_code': status, 'device_type': 'webapps' if is_webapps else 'other', } maybe_add_domain_tag(self.domain, tags) timer_buckets = (1, 5, 20, 60, 120, 300, 600) for timer in timing.to_list(exclude_root=True): segment = None extra_tags = {} if timer.name in RESTORE_SEGMENTS: segment = RESTORE_SEGMENTS[timer.name] elif timer.name.startswith('fixture:'): segment = 'fixture' extra_tags = {'fixture': timer.name.split(':')[1]} if segment: metrics_histogram( 'commcare.restores.{}.duration.seconds'.format(segment), timer.duration, bucket_tag='duration', buckets=timer_buckets, bucket_unit='s', tags={**tags, **extra_tags} ) metrics_counter( 'commcare.restores.{}'.format(segment), tags={**tags, **extra_tags}, ) tags['type'] = 'sync' if self.params.sync_log_id else 'restore' if settings.ENTERPRISE_MODE: if self.params.app and self.params.app.copy_of: app_name = slugify(self.params.app.name) tags['app'] = '{}-{}'.format(app_name, self.params.app.version) else: tags['app'] = '' metrics_counter('commcare.restores.count', tags=tags) metrics_histogram( 'commcare.restores.duration.seconds', timing.duration, bucket_tag='duration', buckets=timer_buckets, bucket_unit='s', tags=tags )
def do_livequery(timing_context, restore_state, response, async_task=None): """Get case sync restore response This function makes no changes to external state other than updating the `restore_state.current_sync_log` and progress of `async_task`. Extends `response` with restore elements. """ def index_key(index): return '{} {}'.format(index.case_id, index.identifier) def is_extension(case_id): """Determine if case_id is an extension case A case that is both a child and an extension is not an extension. """ return case_id in hosts_by_extension and case_id not in parents_by_child def has_live_extension(case_id, cache={}): """Check if available case_id has a live extension case Do not check for live children because an available parent cannot cause it's children to become live. This is unlike an available host, which can cause its available extension to become live through the recursive rules: - A case is available if - it is open and not an extension case (applies to host). - it is open and is the extension of an available case. - A case is live if it is owned and available. The result is cached to reduce recursion in subsequent calls and to prevent infinite recursion. """ try: return cache[case_id] except KeyError: cache[case_id] = False cache[case_id] = result = any( ext_id in live_ids # has live extension or ext_id in owned_ids # ext is owned and available, will be live or has_live_extension(ext_id) for ext_id in extensions_by_host[case_id]) return result def enliven(case_id): """Mark the given case, its extensions and their hosts as live This closure mutates `live_ids` from the enclosing function. """ if case_id in live_ids: # already live return debug('enliven(%s)', case_id) live_ids.add(case_id) # case is open and is the extension of a live case ext_ids = extensions_by_host.get(case_id, []) # case has live extension host_ids = hosts_by_extension.get(case_id, []) # case has live child parent_ids = parents_by_child.get(case_id, []) for cid in chain(ext_ids, host_ids, parent_ids): enliven(cid) def classify(index, prev_ids): """Classify index as either live or extension with live status pending This closure mutates case graph data structures from the enclosing function. :returns: Case id for next related index fetch or IGNORE if the related case should be ignored. """ sub_id = index.case_id ref_id = index.referenced_id # aka parent/host/super relationship = index.relationship ix_key = index_key(index) if ix_key in seen_ix[sub_id]: return IGNORE # unexpected, don't process duplicate index twice seen_ix[sub_id].add(ix_key) seen_ix[ref_id].add(ix_key) indices[sub_id].append(index) debug("%s --%s--> %s", sub_id, relationship, ref_id) if sub_id in live_ids: # ref has a live child or extension enliven(ref_id) # It does not matter that sub_id -> ref_id never makes it into # hosts_by_extension since both are live and therefore this index # will not need to be traversed in other liveness calculations. elif relationship == EXTENSION: if sub_id in open_ids: if ref_id in live_ids: # sub is open and is the extension of a live case enliven(sub_id) else: # live status pending: # if ref becomes live -> sub is open extension of live case # if sub becomes live -> ref has a live extension extensions_by_host[ref_id].add(sub_id) hosts_by_extension[sub_id].add(ref_id) else: return IGNORE # closed extension elif sub_id in owned_ids: # sub is owned and available (open and not an extension case) enliven(sub_id) # ref has a live child enliven(ref_id) else: # live status pending: if sub becomes live -> ref has a live child parents_by_child[sub_id].add(ref_id) next_id = ref_id if sub_id in prev_ids else sub_id if next_id not in all_ids: return next_id return IGNORE # circular reference def update_open_and_deleted_ids(related): """Update open_ids and deleted_ids with related case_ids TODO store referenced case (parent) deleted and closed status in CommCareCaseIndexSQL to reduce number of related indices fetched and avoid this extra query per related query. """ case_ids = { case_id for index in related for case_id in [index.case_id, index.referenced_id] if case_id not in all_ids } # we know these are open since we filter by closed and deleted when fetching the indexes open_cases = { index.case_id for index in related if index.relationship == 'extension' } check_cases = list(set(case_ids) - open_cases) rows = accessor.get_closed_and_deleted_ids(check_cases) for case_id, closed, deleted in rows: if deleted: deleted_ids.add(case_id) if closed or deleted: case_ids.remove(case_id) open_ids.update(case_ids) IGNORE = object() debug = logging.getLogger(__name__).debug accessor = CaseAccessors(restore_state.domain) # case graph data structures live_ids = set() deleted_ids = set() extensions_by_host = defaultdict(set) # host_id -> (open) extension_ids hosts_by_extension = defaultdict(set) # (open) extension_id -> host_ids parents_by_child = defaultdict(set) # child_id -> parent_ids indices = defaultdict(list) # case_id -> list of CommCareCaseIndex-like seen_ix = defaultdict( set) # case_id -> set of '<index.case_id> <index.identifier>' owner_ids = list(restore_state.owner_ids) debug("sync %s for %r", restore_state.current_sync_log._id, owner_ids) with timing_context("livequery"): with timing_context("get_case_ids_by_owners"): owned_ids = accessor.get_case_ids_by_owners(owner_ids, closed=False) debug("owned: %r", owned_ids) next_ids = all_ids = set(owned_ids) owned_ids = set(owned_ids) # owned, open case ids (may be extensions) open_ids = set(owned_ids) while next_ids: exclude = set(chain.from_iterable(seen_ix[id] for id in next_ids)) with timing_context( "get_related_indices({} cases, {} seen)".format( len(next_ids), len(exclude))): related = accessor.get_related_indices(list(next_ids), exclude) if not related: break update_open_and_deleted_ids(related) next_ids = { classify(index, next_ids) for index in related if index.referenced_id not in deleted_ids and index.case_id not in deleted_ids } next_ids.discard(IGNORE) all_ids.update(next_ids) debug('next: %r', next_ids) with timing_context("enliven open roots (%s cases)" % len(open_ids)): debug('open: %r', open_ids) # owned, open, not an extension -> live for case_id in owned_ids: if not is_extension(case_id): enliven(case_id) # available case with live extension -> live for case_id in open_ids: if (case_id not in live_ids and not is_extension(case_id) and has_live_extension(case_id)): enliven(case_id) debug('live: %r', live_ids) if restore_state.last_sync_log: with timing_context("discard_already_synced_cases"): debug('last sync: %s', restore_state.last_sync_log._id) sync_ids = discard_already_synced_cases( live_ids, restore_state, accessor) else: sync_ids = live_ids restore_state.current_sync_log.case_ids_on_phone = live_ids with timing_context("compile_response(%s cases)" % len(sync_ids)): iaccessor = PrefetchIndexCaseAccessor(accessor, indices) metrics_histogram( 'commcare.restore.case_load', len(sync_ids), 'cases', RESTORE_CASE_LOAD_BUCKETS, tags={ 'domain': accessor.domain, 'restore_type': 'incremental' if restore_state.last_sync_log else 'fresh' }) compile_response( timing_context, restore_state, response, batch_cases(iaccessor, sync_ids), init_progress(async_task, len(sync_ids)), )