Example #1
def _record_metrics(tags, submission_type, response, timer=None, xform=None):
    tags.update({
        'submission_type': submission_type,
        'status_code': response.status_code
    })

    if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on:
        lag = xform.received_on - xform.metadata.timeEnd
        lag_days = lag.total_seconds() / 86400
        metrics_histogram('commcare.xform_submissions.lag.days',
                          lag_days,
                          bucket_tag='lag',
                          buckets=(1, 2, 4, 7, 14, 31, 90),
                          bucket_unit='d',
                          tags=tags)

    if timer:
        metrics_histogram('commcare.xform_submissions.duration.seconds',
                          timer.duration,
                          bucket_tag='duration',
                          buckets=(1, 5, 20, 60, 120, 300, 600),
                          bucket_unit='s',
                          tags=tags)

    metrics_counter('commcare.xform_submissions.count', tags=tags)
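All of these call sites use CommCare HQ's corehq.util.metrics helpers. As a reading aid, here is a minimal sketch of what the two helpers plausibly look like, inferred from the calls in these examples; the bucket_value helper and the exact signatures are assumptions, not the library's verbatim API.

def bucket_value(value, buckets, unit=''):
    # Map a raw value to a coarse label such as 'lt_005s' or 'over_600s' so the
    # histogram can be emitted as a tagged metric with bounded tag cardinality.
    for bucket in buckets:
        if value < bucket:
            return 'lt_{:03d}{}'.format(bucket, unit)
    return 'over_{:03d}{}'.format(buckets[-1], unit)


def metrics_counter(name, value=1, tags=None):
    ...  # forward to the configured metrics backend (Datadog, Prometheus, ...)


def metrics_histogram(name, value, bucket_tag, buckets, bucket_unit='', tags=None):
    # Assumed behavior: record the value as a counter tagged with its bucket.
    tags = dict(tags or {})
    tags[bucket_tag] = bucket_value(value, buckets, bucket_unit)
    metrics_counter(name, tags=tags)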
Example #2
    def _report_import_timings(self, timer, results):
        active_duration = timer.duration - self._total_delayed_duration
        rows_created = results['created_count']
        rows_updated = results['match_count']
        rows_failed = results['failed_count']

        # Add 1 to smooth / prevent denominator from ever being zero
        active_duration_per_case = active_duration / (
            rows_created + rows_updated + rows_failed + 1)
        metrics_histogram(
            'commcare.case_importer.duration_per_case',
            active_duration_per_case,
            buckets=[50, 70, 100, 150, 250, 350, 500],
            bucket_tag='duration',
            bucket_unit='ms',
        )

        for rows, status in ((rows_created, 'created'),
                             (rows_updated, 'updated'),
                             (rows_failed, 'error')):
            metrics_counter('commcare.case_importer.cases', rows,
                            tags={'status': status})
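The +1 in the denominator above is worth spelling out: with zero created, updated, and failed rows the division would raise ZeroDivisionError, and the +1 turns that edge case into a plain division. Illustrative numbers only:

# An import that processed no rows at all still yields a finite per-case value.
active_duration = 12.5
assert active_duration / (0 + 0 + 0 + 1) == 12.5  # no ZeroDivisionError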
Example #3
    def __call__(self, restore_state):
        restore_user = restore_state.restore_user
        global_types = {}
        user_types = {}
        for data_type in FixtureDataType.by_domain(restore_user.domain):
            if data_type.is_global:
                global_types[data_type._id] = data_type
            else:
                user_types[data_type._id] = data_type
        items = []
        user_items_count = 0
        if global_types:
            global_items = self.get_global_items(global_types, restore_state)
            items.extend(global_items)
        if user_types:
            user_items, user_items_count = self.get_user_items_and_count(user_types, restore_user)
            items.extend(user_items)

        metrics_histogram(
            'commcare.fixtures.item_lists.user',
            user_items_count,
            bucket_tag='items',
            buckets=[1, 100, 1000, 10000, 30000, 100000, 300000, 1000000],
            bucket_unit='',
            tags={
                'domain': restore_user.domain
            }
        )
        return items
Example #4
def do_livequery(timing_context, restore_state, response, async_task=None):
    """Get case sync restore response

    This function makes no changes to external state other than updating
    the `restore_state.current_sync_log` and progress of `async_task`.
    Extends `response` with restore elements.
    """

    debug = logging.getLogger(__name__).debug
    domain = restore_state.domain
    owner_ids = list(restore_state.owner_ids)

    debug("sync %s for %r", restore_state.current_sync_log._id, owner_ids)
    with timing_context("livequery"):
        with timing_context("get_case_ids_by_owners"):
            owned_ids = CommCareCase.objects.get_case_ids_in_domain_by_owners(
                domain, owner_ids, closed=False)
            debug("owned: %r", owned_ids)

        live_ids, indices = get_live_case_ids_and_indices(
            domain, owned_ids, timing_context)

        if restore_state.last_sync_log:
            with timing_context("discard_already_synced_cases"):
                debug('last sync: %s', restore_state.last_sync_log._id)
                sync_ids = discard_already_synced_cases(
                    live_ids, restore_state)
        else:
            sync_ids = live_ids
        restore_state.current_sync_log.case_ids_on_phone = live_ids

        with timing_context("compile_response(%s cases)" % len(sync_ids)):
            iaccessor = PrefetchIndexCaseAccessor(domain, indices)
            metrics_histogram(
                'commcare.restore.case_load',
                len(sync_ids),
                'cases',
                RESTORE_CASE_LOAD_BUCKETS,
                tags={
                    'domain': domain,
                    'restore_type': 'incremental' if restore_state.last_sync_log else 'fresh',
                })
            metrics_counter('commcare.restore.case_load.count', len(sync_ids),
                            {'domain': domain})
            compile_response(
                timing_context,
                restore_state,
                response,
                batch_cases(iaccessor, sync_ids),
                init_progress(async_task, len(sync_ids)),
            )
Example #5
    def pre_submit_hook(self):
        if rate_limit_submission(self.domain):
            # The duration of the last submission is a combined heuristic for
            # the load on the databases and the load that this import's
            # requests put on them. The time to wait during a high-submission
            # period, while this project is using more than its fair share,
            # should be proportional to that heuristic. For a fully throttled
            # domain, this at most doubles the time the case import takes.
            metrics_histogram(
                'commcare.case_importer.import_delays', self._last_submission_duration,
                buckets=[5, 7, 10, 15, 25, 35, 50], bucket_tag='duration', bucket_unit='s',
                tags={'domain': self.domain}
            )
            self._total_delayed_duration += self._last_submission_duration
            time.sleep(self._last_submission_duration)
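The "at most doubles" claim holds because each throttled submission sleeps for the duration of the submission before it, so total sleep time tracks total submission time. A toy illustration with made-up numbers:

# Hypothetical per-submission durations, in seconds.
durations = [2.0, 3.5, 1.5]
work = sum(durations)          # time spent actually submitting
sleeps = sum(durations[:-1])   # fully throttled: each sleep mirrors the prior submission
assert work + sleeps <= 2 * work  # elapsed time is at most doubled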
Example #6
def batch_cases(accessor, case_ids):
    def take(n, iterable):
        # https://docs.python.org/2/library/itertools.html#recipes
        return list(islice(iterable, n))

    track_load = case_load_counter("livequery_restore", accessor.domain)
    metrics_histogram('commcare.restore.case_load',
                      len(case_ids),
                      'cases',
                      [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000],
                      tags={'domain': accessor.domain})
    ids = iter(case_ids)
    while True:
        next_ids = take(1000, ids)
        if not next_ids:
            break
        track_load(len(next_ids))
        yield accessor.get_cases(next_ids)
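The take helper works because islice consumes from the shared ids iterator, so each call returns the next chunk and an empty list signals exhaustion:

from itertools import islice

ids = iter(['a', 'b', 'c', 'd', 'e'])
assert list(islice(ids, 2)) == ['a', 'b']
assert list(islice(ids, 2)) == ['c', 'd']  # resumes where the last batch stopped
assert list(islice(ids, 2)) == ['e']       # final, short batch
assert list(islice(ids, 2)) == []          # exhausted: the while loop breaks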
Example #7
def celery_record_time_to_run(task_id=None,
                              task=None,
                              state=None,
                              args=None,
                              kwargs=None,
                              **kw):
    from corehq.util.metrics import (
        DAY_SCALE_TIME_BUCKETS,
        metrics_counter,
        metrics_histogram,
    )

    try:
        domain = get_domain_from_task(task, args, kwargs)
    except Exception:
        domain = None
        logging.exception(
            "Error while attempting to get the domain for a celery task")
        metrics_counter('commcare.celery.task.time_to_run_domain_unavailable',
                        tags={
                            'celery_task_name': task.name,
                        })

    get_task_time_to_start.clear(task_id)

    tags = {
        'celery_task_name': task.name,
        'celery_queue': task.queue,
        'state': state,
        'domain': domain or 'N/A',
    }
    timer = TimeToRunTimer(task_id)
    try:
        time_to_run = timer.stop_and_pop_timing()
    except TimingNotAvailable:
        metrics_counter('commcare.celery.task.time_to_run_unavailable',
                        tags=tags)
    else:
        metrics_histogram('commcare.celery.task.time_to_run.seconds',
                          time_to_run.total_seconds(),
                          bucket_tag='duration',
                          buckets=DAY_SCALE_TIME_BUCKETS,
                          bucket_unit='s',
                          tags=tags)
Example #8
def celery_record_time_to_run(task_id=None, task=None, state=None, **kwargs):
    from corehq.util.metrics import metrics_counter, metrics_histogram, DAY_SCALE_TIME_BUCKETS

    get_task_time_to_start.clear(task_id)

    tags = {
        'celery_task_name': task.name,
        'celery_queue': task.queue,
        'state': state,
    }
    timer = TimeToRunTimer(task_id)
    try:
        time_to_run = timer.stop_and_pop_timing()
    except TimingNotAvailable:
        metrics_counter('commcare.celery.task.time_to_run_unavailable',
                        tags=tags)
    else:
        metrics_histogram('commcare.celery.task.time_to_run.seconds',
                          time_to_run.total_seconds(),
                          bucket_tag='duration',
                          buckets=DAY_SCALE_TIME_BUCKETS,
                          bucket_unit='s',
                          tags=tags)
Example #9
def record_feed_access_in_datadog(request, config_id, duration, response):
    config = ExportInstance.get(config_id)
    username = request.couch_user.username
    json_response = json.loads(response.content.decode('utf-8'))
    rows = json_response['value']
    row_count = len(rows)
    try:
        column_count = len(rows[0])
    except IndexError:
        column_count = 0
    metrics_histogram(
        'commcare.odata_feed.test_v3', duration,
        bucket_tag='duration_bucket', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s',
        tags={
            'domain': request.domain,
            'feed_id': config_id,
            'feed_type': config.type,
            'username': username,
            'row_count': row_count,
            'column_count': column_count,
            'size': len(response.content)
        }
    )
Example #10
    def _record_timing(self, status):
        timing = self.timing_context
        assert timing.is_finished()
        duration = timing.duration
        device_id = self.params.device_id
        if duration > 20 or status == 412:
            if status == 412:
                # use last sync log since there is no current sync log
                sync_log_id = self.params.sync_log_id or 'N/A'
            else:
                sync_log = self.restore_state.current_sync_log
                sync_log_id = sync_log._id if sync_log else 'N/A'
            logger.info(
                "restore %s: user=%s device=%s domain=%s status=%s duration=%.3f",
                sync_log_id,
                self.restore_user.username,
                device_id,
                self.domain,
                status,
                duration,
            )
        is_webapps = device_id and device_id.startswith("WebAppsLogin")
        tags = {
            'status_code': status,
            'device_type': 'webapps' if is_webapps else 'other',
        }
        maybe_add_domain_tag(self.domain, tags)
        timer_buckets = (1, 5, 20, 60, 120, 300, 600)
        for timer in timing.to_list(exclude_root=True):
            segment = None
            extra_tags = {}
            if timer.name in RESTORE_SEGMENTS:
                segment = RESTORE_SEGMENTS[timer.name]
            elif timer.name.startswith('fixture:'):
                segment = 'fixture'
                extra_tags = {'fixture': timer.name.split(':')[1]}

            if segment:
                metrics_histogram(
                    'commcare.restores.{}.duration.seconds'.format(segment), timer.duration,
                    bucket_tag='duration', buckets=timer_buckets, bucket_unit='s',
                    tags={**tags, **extra_tags}
                )
                metrics_counter(
                    'commcare.restores.{}'.format(segment),
                    tags={**tags, **extra_tags},
                )

        tags['type'] = 'sync' if self.params.sync_log_id else 'restore'

        if settings.ENTERPRISE_MODE:
            if self.params.app and self.params.app.copy_of:
                app_name = slugify(self.params.app.name)
                tags['app'] = '{}-{}'.format(app_name, self.params.app.version)
            else:
                tags['app'] = ''

        metrics_counter('commcare.restores.count', tags=tags)
        metrics_histogram(
            'commcare.restores.duration.seconds', timing.duration,
            bucket_tag='duration', buckets=timer_buckets, bucket_unit='s',
            tags=tags
        )
Example #11
def do_livequery(timing_context, restore_state, response, async_task=None):
    """Get case sync restore response

    This function makes no changes to external state other than updating
    the `restore_state.current_sync_log` and progress of `async_task`.
    Extends `response` with restore elements.
    """
    def index_key(index):
        return '{} {}'.format(index.case_id, index.identifier)

    def is_extension(case_id):
        """Determine if case_id is an extension case

        A case that is both a child and an extension is not an extension.
        """
        return case_id in hosts_by_extension and case_id not in parents_by_child

    def has_live_extension(case_id, cache={}):
        """Check if available case_id has a live extension case

        Do not check for live children because an available parent
        cannot cause its children to become live. This is unlike an
        available host, which can cause its available extension to
        become live through the recursive rules:

        - A case is available if
            - it is open and not an extension case (applies to host).
            - it is open and is the extension of an available case.
        - A case is live if it is owned and available.

        The result is cached to reduce recursion in subsequent calls
        and to prevent infinite recursion.
        """
        try:
            return cache[case_id]
        except KeyError:
            cache[case_id] = False
        cache[case_id] = result = any(
            ext_id in live_ids  # has live extension
            or ext_id in owned_ids  # ext is owned and available, will be live
            or has_live_extension(ext_id)
            for ext_id in extensions_by_host[case_id])
        return result

    def enliven(case_id):
        """Mark the given case, its extensions and their hosts as live

        This closure mutates `live_ids` from the enclosing function.
        """
        if case_id in live_ids:
            # already live
            return
        debug('enliven(%s)', case_id)
        live_ids.add(case_id)
        # case is open and is the extension of a live case
        ext_ids = extensions_by_host.get(case_id, [])
        # case has live extension
        host_ids = hosts_by_extension.get(case_id, [])
        # case has live child
        parent_ids = parents_by_child.get(case_id, [])
        for cid in chain(ext_ids, host_ids, parent_ids):
            enliven(cid)

    def classify(index, prev_ids):
        """Classify index as either live or extension with live status pending

        This closure mutates case graph data structures from the
        enclosing function.

        :returns: Case id for next related index fetch or IGNORE
        if the related case should be ignored.
        """
        sub_id = index.case_id
        ref_id = index.referenced_id  # aka parent/host/super
        relationship = index.relationship
        ix_key = index_key(index)
        if ix_key in seen_ix[sub_id]:
            return IGNORE  # unexpected, don't process duplicate index twice
        seen_ix[sub_id].add(ix_key)
        seen_ix[ref_id].add(ix_key)
        indices[sub_id].append(index)
        debug("%s --%s--> %s", sub_id, relationship, ref_id)
        if sub_id in live_ids:
            # ref has a live child or extension
            enliven(ref_id)
            # It does not matter that sub_id -> ref_id never makes it into
            # hosts_by_extension since both are live and therefore this index
            # will not need to be traversed in other liveness calculations.
        elif relationship == EXTENSION:
            if sub_id in open_ids:
                if ref_id in live_ids:
                    # sub is open and is the extension of a live case
                    enliven(sub_id)
                else:
                    # live status pending:
                    # if ref becomes live -> sub is open extension of live case
                    # if sub becomes live -> ref has a live extension
                    extensions_by_host[ref_id].add(sub_id)
                    hosts_by_extension[sub_id].add(ref_id)
            else:
                return IGNORE  # closed extension
        elif sub_id in owned_ids:
            # sub is owned and available (open and not an extension case)
            enliven(sub_id)
            # ref has a live child
            enliven(ref_id)
        else:
            # live status pending: if sub becomes live -> ref has a live child
            parents_by_child[sub_id].add(ref_id)

        next_id = ref_id if sub_id in prev_ids else sub_id
        if next_id not in all_ids:
            return next_id
        return IGNORE  # circular reference

    def update_open_and_deleted_ids(related):
        """Update open_ids and deleted_ids with related case_ids

        TODO store referenced case (parent) deleted and closed status in
        CommCareCaseIndexSQL to reduce number of related indices fetched
        and avoid this extra query per related query.
        """
        case_ids = {
            case_id
            for index in related
            for case_id in [index.case_id, index.referenced_id]
            if case_id not in all_ids
        }

        # we know these are open since we filter by closed and deleted when fetching the indexes
        open_cases = {
            index.case_id
            for index in related if index.relationship == 'extension'
        }
        check_cases = list(set(case_ids) - open_cases)
        rows = accessor.get_closed_and_deleted_ids(check_cases)
        for case_id, closed, deleted in rows:
            if deleted:
                deleted_ids.add(case_id)
            if closed or deleted:
                case_ids.remove(case_id)
        open_ids.update(case_ids)

    IGNORE = object()
    debug = logging.getLogger(__name__).debug
    accessor = CaseAccessors(restore_state.domain)

    # case graph data structures
    live_ids = set()
    deleted_ids = set()
    extensions_by_host = defaultdict(set)  # host_id -> (open) extension_ids
    hosts_by_extension = defaultdict(set)  # (open) extension_id -> host_ids
    parents_by_child = defaultdict(set)  # child_id -> parent_ids
    indices = defaultdict(list)  # case_id -> list of CommCareCaseIndex-like
    seen_ix = defaultdict(set)  # case_id -> set of '<index.case_id> <index.identifier>'
    owner_ids = list(restore_state.owner_ids)

    debug("sync %s for %r", restore_state.current_sync_log._id, owner_ids)
    with timing_context("livequery"):
        with timing_context("get_case_ids_by_owners"):
            owned_ids = accessor.get_case_ids_by_owners(owner_ids,
                                                        closed=False)
            debug("owned: %r", owned_ids)

        next_ids = all_ids = set(owned_ids)
        owned_ids = set(owned_ids)  # owned, open case ids (may be extensions)
        open_ids = set(owned_ids)
        while next_ids:
            exclude = set(chain.from_iterable(seen_ix[id] for id in next_ids))
            with timing_context(
                    "get_related_indices({} cases, {} seen)".format(
                        len(next_ids), len(exclude))):
                related = accessor.get_related_indices(list(next_ids), exclude)
                if not related:
                    break
                update_open_and_deleted_ids(related)
                next_ids = {
                    classify(index, next_ids)
                    for index in related
                    if index.referenced_id not in deleted_ids
                    and index.case_id not in deleted_ids
                }
                next_ids.discard(IGNORE)
                all_ids.update(next_ids)
                debug('next: %r', next_ids)

        with timing_context("enliven open roots (%s cases)" % len(open_ids)):
            debug('open: %r', open_ids)
            # owned, open, not an extension -> live
            for case_id in owned_ids:
                if not is_extension(case_id):
                    enliven(case_id)

            # available case with live extension -> live
            for case_id in open_ids:
                if (case_id not in live_ids and not is_extension(case_id)
                        and has_live_extension(case_id)):
                    enliven(case_id)

            debug('live: %r', live_ids)

        if restore_state.last_sync_log:
            with timing_context("discard_already_synced_cases"):
                debug('last sync: %s', restore_state.last_sync_log._id)
                sync_ids = discard_already_synced_cases(
                    live_ids, restore_state, accessor)
        else:
            sync_ids = live_ids
        restore_state.current_sync_log.case_ids_on_phone = live_ids

        with timing_context("compile_response(%s cases)" % len(sync_ids)):
            iaccessor = PrefetchIndexCaseAccessor(accessor, indices)
            metrics_histogram(
                'commcare.restore.case_load',
                len(sync_ids),
                'cases',
                RESTORE_CASE_LOAD_BUCKETS,
                tags={
                    'domain': accessor.domain,
                    'restore_type': 'incremental' if restore_state.last_sync_log else 'fresh',
                })
            compile_response(
                timing_context,
                restore_state,
                response,
                batch_cases(iaccessor, sync_ids),
                init_progress(async_task, len(sync_ids)),
            )
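The has_live_extension helper above uses a cycle-safe memoization pattern: it seeds the cache with False before recursing, so a cycle in the extension graph terminates instead of recursing forever, then overwrites the entry with the computed result. A stripped-down sketch of the same pattern on a generic directed graph (the names here are illustrative, not from the source):

def any_descendant_matches(node, predicate, graph, cache=None):
    # Seed the cache with False before recursing so cycles terminate,
    # then overwrite with the real result (mirrors has_live_extension).
    cache = {} if cache is None else cache
    if node in cache:
        return cache[node]
    cache[node] = False  # provisional value breaks cycles
    cache[node] = result = any(
        predicate(child) or any_descendant_matches(child, predicate, graph, cache)
        for child in graph.get(node, ())
    )
    return result

# Terminates even though 'host' and 'ext1' reference each other:
assert not any_descendant_matches('host', lambda c: False,
                                  {'host': ['ext1'], 'ext1': ['host']})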